diff --git a/.circleci/config.yml b/.circleci/config.yml index 152db410..1674f36c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,16 +30,6 @@ shared: &shared jobs: - py39: - <<: *shared - docker: - - image: cimg/python:3.9 - - py310: - <<: *shared - docker: - - image: cimg/python:3.10 - py311: <<: *shared docker: @@ -53,15 +43,13 @@ jobs: py313: <<: *shared docker: - - image: cimg/python:3.12 + - image: cimg/python:3.13 workflows: version: 2 build: jobs: - - py39 - - py310 - py311 - py312 - py313 diff --git a/README.md b/README.md index 45bf630a..e761efb7 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ You can talk to me in advance through e-mails or the [Issues](https://github.com ## News +* 04/19/2026: `shorttext` 4.0.0 released. * 03/22/2026: `shorttext` 3.1.1 released. * 03/02/2026: `shorttext` 3.1.0 reelased. * 10/27/2025: `shorttext` 3.0.1 released. diff --git a/docs/codes.rst b/docs/codes.rst index b86ecd42..53c151cb 100644 --- a/docs/codes.rst +++ b/docs/codes.rst @@ -1,96 +1,339 @@ API === -API unlisted in tutorials are listed here. +Complete API reference for the shorttext library. -Shorttext Models Smart Loading ------------------------------- +.. contents:: + :local: + :backlinks: none + +Top-Level Modules +----------------- + +.. automodule:: shorttext + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.smartload :members: + :undoc-members: + :show-inheritance: -Supervised Classification using Word Embedding ----------------------------------------------- +Classifiers +----------- -Module `shorttext.generators.seq2seq.s2skeras` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.classifiers + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.generators.seq2seq.s2skeras +Base Classifier +^^^^^^^^^^^^^^^ + +.. 
automodule:: shorttext.classifiers.base :members: + :undoc-members: + :show-inheritance: +Bag-of-Words Classifiers +^^^^^^^^^^^^^^^^^^^^^^^ -Module `shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.classifiers.bow + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification +.. automodule:: shorttext.classifiers.bow.topic.SkLearnClassification + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.bow.topic.TopicVectorDistanceClassification :members: + :undoc-members: + :show-inheritance: +.. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification + :members: + :undoc-members: + :show-inheritance: -Neural Networks ---------------- +Embedding-Based Classifiers +^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Module `shorttext.classifiers.embed.sumvec.frameworks` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.classifiers.embed + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.classifiers.embed.sumvec.frameworks :members: + :undoc-members: + :show-inheritance: +.. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification + :members: + :undoc-members: + :show-inheritance: -Utilities ---------- +.. automodule:: shorttext.classifiers.embed.nnlib.frameworks + :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.kerasmodel_io` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Generators +---------- -.. automodule:: shorttext.utils.kerasmodel_io +.. 
automodule:: shorttext.generators :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.gensim_corpora` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Bag-of-Words Generators +^^^^^^^^^^^^^^^^^^^^^^ -.. automodule:: shorttext.utils.gensim_corpora +.. automodule:: shorttext.generators.bow :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.compactmodel_io` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.generators.bow.GensimTopicModeling + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.utils.compactmodel_io +.. automodule:: shorttext.generators.bow.LatentTopicModeling :members: + :undoc-members: + :show-inheritance: +.. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling + :members: + :undoc-members: + :show-inheritance: + +Sequence-to-Sequence Generators +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: shorttext.generators.seq2seq + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.generators.seq2seq.s2skeras + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.generators.seq2seq.charbaseS2S + :members: + :undoc-members: + :show-inheritance: + +Character-Based Generators +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: shorttext.generators.charbase + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.generators.charbase.char2vec + :members: + :undoc-members: + :show-inheritance: Metrics ------- -Module `shorttext.metrics.dynprog` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.metrics + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.metrics.dynprog + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.metrics.dynprog.jaccard :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.metrics.dynprog.dldist :members: + :undoc-members: + :show-inheritance: .. 
automodule:: shorttext.metrics.dynprog.lcp :members: + :undoc-members: + :show-inheritance: -Module `shorttext.metrics.wassersterin` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.metrics.wasserstein + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.metrics.wasserstein.wordmoverdist - :members: word_mover_distance_linprog + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.metrics.embedfuzzy + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.metrics.embedfuzzy.jaccard + :members: + :undoc-members: + :show-inheritance: Spell Correction ---------------- -Module `shorttext.spell` -^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.spell + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.spell.basespellcorrector :members: + :undoc-members: + :show-inheritance: +.. automodule:: shorttext.spell.norvig + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.spell.editor + :members: + :undoc-members: + :show-inheritance: + +Stacking +-------- +.. automodule:: shorttext.stack + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.stack.stacking + :members: + :undoc-members: + :show-inheritance: +Data +---- + +.. automodule:: shorttext.data + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.data.data_retrieval + :members: + :undoc-members: + :show-inheritance: + +Utilities +--------- + +.. automodule:: shorttext.utils + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.kerasmodel_io + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.compactmodel_io + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.gensim_corpora + :members: + :undoc-members: + :show-inheritance: +.. 
automodule:: shorttext.utils.textpreprocessing + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.wordembed + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.compute + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.misc + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.dtm + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.utils.classification_exceptions + :members: + :undoc-members: + :show-inheritance: + +Schemas +------- + +.. automodule:: shorttext.schemas + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.schemas.models + :members: + :undoc-members: + :show-inheritance: + +CLI +--- + +.. automodule:: shorttext.cli + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.cli.categorization + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.cli.wordembedsim + :members: + :undoc-members: + :show-inheritance: Home: :doc:`index` \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index dfebbb67..578060a9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,7 +30,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.mathjax', 'sphinx.ext.autodoc' + 'sphinx.ext.mathjax', 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', 'sphinx.ext.intersphinx' ] @@ -58,9 +59,9 @@ # built documents. # # The short X.Y version. -version = u'3.1' +version = u'4.0' # The full version, including alpha/beta/rc tags. -release = u'3.1.1' +release = u'4.0.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
@@ -113,17 +114,28 @@ 'tensorflow', 'keras', 'gensim', 'numba', 'joblib' ] +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} + +autodoc_member_order = 'bysource' + # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'classic' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} +html_theme = 'alabaster' + +html_theme_options = { + 'description': 'Short text classification toolkit', + 'github_user': 'analytics-warehouse', + 'github_repo': 'shorttext', + 'fixed_sidebar': False, + 'show_related': True, +} # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] @@ -149,6 +161,15 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_sidebars = { + '**': [ + 'about.html', + 'navigation.html', + 'searchbox.html', + 'related.html', + ], +} + # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. @@ -165,6 +186,10 @@ # Custom sidebar templates, maps document names to template names. #html_sidebars = {} +html_show_sourcelink = True +html_use_index = True +html_split_index = False + # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} diff --git a/docs/news.rst b/docs/news.rst index e33eba5f..f3e58529 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -1,6 +1,7 @@ News ==== +* 04/19/2026: `shorttext` 4.0.0 released. * 03/22/2026: `shorttext` 3.1.1 released. * 03/02/2026: `shorttext` 3.1.0 reelased. * 10/27/2025: `shorttext` 3.0.1 released. 
@@ -89,6 +90,21 @@ News What's New ---------- +Release 4.0.0 (April 19, 2026) +------------------------------ + +* Removed support for Python 3.9 and 3.10; +* New style of documentation; +* New docstrings; +* Type hinting; +* Dependence on `gensim` reduced to topic modeling related functions and Word2Vec embedding; +* Modernizing the use of `keras`; +* The use of `loguru`, `orjson`, `sparse` and `npdict` libraries; +* Code cleanup and debugged; +* Removed the old implementation of document-term matrix, and replaced it with `NumpyDocumentTermMatrix`; +* Implementation of cosine similarity optimized by `numba` instead of using the cosine distance from `scipy`; +* All unit tests and regression tests rewritten, and run by `pytest`; + Release 3.1.1 (March 22, 2026) ------------------------------ diff --git a/docs/requirements.txt b/docs/requirements.txt index db5188fa..478aa5fd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -11,3 +11,6 @@ transformers==5.5.4 torch==2.11.0 numba==0.65.0 npdict==0.0.10 +orjson==3.11.8 +sparse==0.18.0 +loguru==0.7.3 diff --git a/docs/tutorial_charbaseonehot.rst b/docs/tutorial_charbaseonehot.rst index bd458756..0fb38993 100644 --- a/docs/tutorial_charbaseonehot.rst +++ b/docs/tutorial_charbaseonehot.rst @@ -19,7 +19,7 @@ the file `big.txt` in Peter Norvig's websites: -Then instantiate the class using the function :func:`shorttext.generators.initSentenceToCharVecEncoder`: +Then instantiate the class using the function :func:`shorttext.generators.initialize_SentenceToCharVecEncoder`: ->>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(textfile) +>>> chartovec_encoder = shorttext.generators.initialize_SentenceToCharVecEncoder(textfile) Now, the object `chartovec_encoder` is an instance of :class:`shorttext.generators.SentenceToCharVecEncoder` . 
The default signal character is `\n`, which is also encoded, and can be checked by looking at the field: diff --git a/docs/tutorial_charbaseseq2seq.rst b/docs/tutorial_charbaseseq2seq.rst index 4b7d085e..2d6ca988 100644 --- a/docs/tutorial_charbaseseq2seq.rst +++ b/docs/tutorial_charbaseseq2seq.rst @@ -12,7 +12,7 @@ To use it, create an instance of the class :class:`shorttext.generators.Sentence >>> import numpy as np >>> import shorttext >>> from urllib.request import urlopen ->>> chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt', 'r')) +>>> chartovec_encoder = shorttext.generators.initialize_SentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt', 'r')) The above code is the same as :doc:`tutorial_charbaseonehot` . diff --git a/docs/tutorial_dtm.rst b/docs/tutorial_dtm.rst index 005e625a..eb2355c9 100644 --- a/docs/tutorial_dtm.rst +++ b/docs/tutorial_dtm.rst @@ -26,38 +26,36 @@ Then now the variable `corpus` is a list of lists of tokens. For example, >>> corpus[0] # shows all the preprocessed tokens of the first Presidential Inaugural Addresses -Using Class `DocumentTermMatrix` --------------------------------- +Using Class `NumpyDocumentTermMatrix` +------------------------------------- -With the corpus ready in this form, we can create a `DocumentTermMatrix` class for DTM by: +Note: the old class `DocumentTermMatrix` has been removed in release 4.0.0. 
->>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids) +With the corpus ready in this form, we can create a `NumpyDocumentTermMatrix` class for DTM by: +(imposing tf-idf while creating the instance of the class by enforcing `tfidf` to be `True`) -.. autoclass:: shorttext.utils.dtm.DocumentTermMatrix +>>> dtm = shorttext.utils.NumpyDocumentTermMatrix(corpus, docids, tfidf=True) + +.. autoclass:: shorttext.utils.dtm.NumpyDocumentTermMatrix :members: One can get the document frequency of any token (the number of documents that the given token is in) by: ->>> usprez_dtm.get_doc_frequency('peopl') # gives 54, the document frequency of the token "peopl" +>>> dtm.get_doc_frequency('peopl') # gives 54, the document frequency of the token "peopl" or the total term frequencies (the total number of occurrences of the given tokens in all documents) by: ->>> usprez_dtm.get_total_termfreq('justic') # gives 134.0, the total term frequency of the token "justic" +>>> dtm.get_total_termfreq('justic') # gives 32.32, the total term frequency of the token "justic" or the term frequency for a token in a given document by: ->>> usprez_dtm.get_termfreq('2009-Obama', 'chang') # gives 2.0 +>>> dtm.get_termfreq('2009-Obama', 'chang') # gives 0.94 We can also query the number of occurrences of a particular word of all documents, stored in a dictionary, by: ->>> usprez_dtm.get_token_occurences('god') - -Of course, we can always reweigh the counts above (except document frequency) by imposing -tf-idf while creating the instance of the class by enforceing `tfidf` to be `True`: - ->>> usprez_dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True) +>>> dtm.get_token_occurences('god') To save the class, enter: @@ -65,7 +63,7 @@ To load this class later, enter: ->>> usprez_dtm2 = 
shorttext.utils.load_numpy_documentmatrixmatrix('/path/to/whatever.bin') .. automodule:: shorttext.utils.dtm :members: load_DocumentTermMatrix diff --git a/docs/tutorial_maxent.rst b/docs/tutorial_maxent.rst index 2eec0911..02994c11 100644 --- a/docs/tutorial_maxent.rst +++ b/docs/tutorial_maxent.rst @@ -28,7 +28,7 @@ The classifier can be instantiated by: Train the classifier: ->>> classifier.train(classdict, nb_epochs=1000) +>>> classifier.train(classdict, nb_epochs=300) After training, it can be used for classification, such as diff --git a/docs/tutorial_topic.rst b/docs/tutorial_topic.rst index eab97915..4043420d 100644 --- a/docs/tutorial_topic.rst +++ b/docs/tutorial_topic.rst @@ -47,14 +47,17 @@ with the trained model. For example, >>> topicmodeler.retrieve_topicvec('stem cell research') ->>> topicmodeler.retrieve_topicvec('bioinformatics') +>>> topicmodeler.retrieve_topicvec('informatics') By default, the vectors are normalized. Another way to retrieve the topic vector representation is as follow: >>> topicmodeler['stem cell research'] ->>> topicmodeler['bioinformatics'] +>>> topicmodeler['informatics'] + +If the dictionary does not have the processed tokens, it will return a numpy +array with all values `nan`. In the training and the retrieval above, the same preprocessing process is applied. Users can provide their own preprocessor while initiating the topic modeler. @@ -73,22 +76,6 @@ The default is to weigh. To not weigh, initialize it as >>> topicmodeler3 = shorttext.generators.GensimTopicModeler(toweigh=False) -Appendix: Model I/O in Previous Versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For previous versions of `shorttext`, the trained models are saved by calling: - ->>> topicmodeler.savemodel('/path/to/nihlda128') - -However, we discourage users using this anymore, because the model I/O for various models -in gensim have been different. It produces errors. - -All of them have to be present in order to be loaded. 
Note that the preprocessor is -not saved. To load the model, enter: - ->>> topicmodeler2 = shorttext.classifiers.load_gensimtopicmodel('/path/to/nihlda128', compact=False) - - .. automodule:: shorttext.generators.bow.GensimTopicModeling :members: @@ -96,9 +83,6 @@ not saved. To load the model, enter: AutoEncoder ----------- -Note: Previous version (<=0.2.1) of this autoencoder has a serious bug. Current version is -incompatible with the autoencoder of version <=0.2.1 . - Another way to find a new topic vector representation is to use the autoencoder, a neural network model which compresses a vector representation into another one of a shorter (or longer, rarely though) representation, by minimizing the difference between the input layer and the decoding layer. @@ -146,38 +130,6 @@ The default is to weigh. To not weigh, initialize it as: .. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling :members: - -Appendix: Unzipping Model I/O -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For previous versions of `shorttext`, the trained models are saved by calling: - ->>> autoencoder.savemodel('/path/to/sub_autoencoder8') - -The following files are produced for the autoencoder: - -:: - - /path/to/sub_autoencoder.json - /path/to/sub_autoencoder.gensimdict - /path/to/sub_autoencoder_encoder.json - /path/to/sub_autoencoder_encoder.h5 - /path/to/sub_autoencoder_classtopicvecs.pkl - -If specifying `save_complete_autoencoder=True`, then four more files are found: - -:: - - /path/to/sub_autoencoder_decoder.json - /path/to/sub_autoencoder_decoder.h5 - /path/to/sub_autoencoder_autoencoder.json - /path/to/sub_autoencoder_autoencoder.h5 - -Users can load the same model later by entering: - ->>> autoencoder2 = shorttext.classifiers.load_autoencoder_topic('/path/to/sub_autoencoder8', compact=False) - - Abstract Latent Topic Modeling Class ------------------------------------ @@ -194,29 +146,6 @@ this, he has to define the methods `train`, `retrieve_topic_vec`, `loadmodel`, a .. 
automodule:: shorttext.generators.bow.GensimTopicModeling :members: -Appendix: Namespaces for Topic Modeler in Previous Versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All generative topic modeling algorithms were placed under the package `shorttext.classifiers` for version <=0.3.4. -In current version (>= 0.3.5), however, all generative models will be moved to `shorttext.generators`, -while any classifiers making use of these topic models are still kept under `shorttext.classifiers`. -A list include: - -:: - - shorttext.classifiers.GensimTopicModeler -> shorttext.generators.GensimTopicModeler - shorttext.classifiers.LDAModeler -> shorttext.generators.LDAModeler - shorttext.classifiers.LSIModeler -> shorttext.generators.LSIModeler - shorttext.classifiers.RPModeler -> shorttext.generators.RPModeler - shorttext.classifiers.AutoencodingTopicModeler -> shorttext.generators.AutoencodingTopicModeler - shorttext.classifiers.load_gensimtopicmodel -> shorttext.generators.load_gensimtopicmodel - shorttext.classifiers.load_autoencoder_topic -> shorttext.generators.load_autoencoder_topicmodel - - -Before release 0.5.6, for backward compatibility, developers can still call the topic models as if there were no such changes, -although they are advised to make this change. 
However, *effective release 0.5.7, this backward compatibility is no longer -available.* - Classification Using Cosine Similarity -------------------------------------- diff --git a/examples/sakaguchi_spell/binarize.py b/examples/sakaguchi_spell/binarize.py index 49b1fc02..835b8442 100644 --- a/examples/sakaguchi_spell/binarize.py +++ b/examples/sakaguchi_spell/binarize.py @@ -4,7 +4,7 @@ from functools import reduce import numpy as np -from shorttext.generators.charbase.char2vec import initSentenceToCharVecEncoder +from shorttext.generators.charbase.char2vec import initialize_SentenceToCharVecEncoder from shorttext.utils import OperationNotDefinedException @@ -16,7 +16,7 @@ class SpellingToConcatCharVecEncoder: def __init__(self, alph): - self.charevec_encoder = initSentenceToCharVecEncoder(alph) + self.charevec_encoder = initialize_SentenceToCharVecEncoder(alph) def encode_spelling(self, spelling): spmat = self.charevec_encoder.encode_sentence(spelling, len(spelling)) diff --git a/pyproject.toml b/pyproject.toml index 35b2b5b6..621af6a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "shorttext" -version = "3.1.1" +version = "4.0.0" authors = [ {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"} ] @@ -12,14 +12,12 @@ description = "Short Text Mining" readme = {file = "README.md", content-type = "text/markdown"} license = {text = "MIT"} keywords = ["shorttext", "natural language processing", "text mining"] -requires-python = ">=3.9" +requires-python = ">=3.11" classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Text Processing :: Linguistic", "Topic :: Software Development :: Libraries :: Python Modules", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming 
Language :: Python :: 3.13", @@ -36,13 +34,16 @@ dependencies = [ "joblib>=1.3.0", "scikit-learn>=1.2.0", "tensorflow>=2.13.0", - "keras>=2.13.0", + "keras>=3.0.0", "gensim>=4.0.0", "pandas>=1.2.0", "snowballstemmer>=3.0.0", "numba>=0.57.0", "deprecation>=2.0.0", - "npdict>=0.0.5" + "npdict>=0.0.10", + "sparse>=0.10.0", + "orjson>=3.0.0", + "loguru>=0.6.0" ] [project.urls] @@ -72,6 +73,7 @@ packages = [ "shorttext.metrics.dynprog", "shorttext.metrics.wasserstein", "shorttext.metrics.embedfuzzy", + "shorttext.schemas", "shorttext.spell" ] zip-safe = false @@ -82,4 +84,4 @@ ShortTextCategorizerConsole = "shorttext.cli.categorization:main" ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main" [project.optional-dependencies] -test = ["unittest2", "pytest"] +test = ["pytest"] diff --git a/src/shorttext/classifiers/base.py b/src/shorttext/classifiers/base.py new file mode 100644 index 00000000..f2374024 --- /dev/null +++ b/src/shorttext/classifiers/base.py @@ -0,0 +1,18 @@ + +from abc import ABC, abstractmethod + + +class AbstractScorer(ABC): + """Abstract base class for scoring classifiers.""" + + @abstractmethod + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores. + + Args: + shorttext: Input text to classify. + + Returns: + Dictionary mapping class labels to scores. 
+ """ + raise NotImplementedError() diff --git a/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py b/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py index 1db45fd2..5b644bae 100644 --- a/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py +++ b/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py @@ -1,37 +1,38 @@ -import pickle +from typing import Literal, Optional -from scipy.sparse import dok_matrix -from gensim.corpora import Dictionary -from tensorflow.keras.models import Sequential +import sparse +import orjson +from tensorflow.keras import Model, Sequential from tensorflow.keras.layers import Dense from tensorflow.keras.regularizers import l2 from ....utils import kerasmodel_io as kerasio from ....utils import tokenize -from ....utils import gensim_corpora as gc from ....utils import classification_exceptions as e from ....utils.compactmodel_io import CompactIOMachine - - -def logistic_framework(nb_features, nb_outputs, l2reg=0.01, bias_l2reg=0.01, optimizer='adam'): - """ Construct the neural network of maximum entropy classifier. - - Given the numbers of features and the output labels, return a keras neural network - for implementing maximum entropy (multinomial) classifier. - - :param nb_features: number of features - :param nb_outputs: number of output labels - :param l2reg: L2 regularization coefficient (Default: 0.01) - :param bias_l2reg: L2 regularization coefficient for bias (Default: 0.01) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: keras sequential model for maximum entropy classifier - :type nb_features: int - :type nb_outputs: int - :type l2reg: float - :type bias_l2reg: float - :type optimizer: str - :rtype: keras.model.Sequential +from ....utils.dtm import convert_classdict_to_xy +from ...base import AbstractScorer + + +def logistic_framework( + nb_features: int, + nb_outputs: int, + l2reg: float = 0.01, + bias_l2reg: float = 0.01, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" +) -> Model: + """Create a maximum entropy classifier neural network. + + Args: + nb_features: Number of input features. + nb_outputs: Number of output classes. + l2reg: L2 regularization coefficient. Default: 0.01. + bias_l2reg: L2 regularization for bias. Default: 0.01. + optimizer: Optimizer. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: adam. + + Returns: + Keras Sequential model for maximum entropy classification. """ kmodel = Sequential() kmodel.add(Dense(units=nb_outputs, @@ -44,203 +45,166 @@ def logistic_framework(nb_features, nb_outputs, l2reg=0.01, bias_l2reg=0.01, opt return kmodel -class MaxEntClassifier(CompactIOMachine): - """ - This is a classifier that implements the principle of maximum entropy. +class MaxEntClassifier(AbstractScorer, CompactIOMachine): + """Maximum entropy classifier. + + A classifier that implements the principle of maximum entropy + for text categorization using bag-of-words features. Reference: - * Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). + Adam L. Berger et al., "A Maximum Entropy Approach to Natural + Language Processing," Computational Linguistics 22(1): 39-72 (1996). """ - def __init__(self, preprocessor=lambda s: s.lower()): - """ Initializer. 
- - :param preprocessor: text preprocessor - :type preprocessor: function - """ - CompactIOMachine.__init__(self, - {'classifier': 'maxent'}, - 'maxent', - ['_classlabels.txt', '.json', '.weights.h5', '_labelidx.pkl', '_dictionary.dict']) - self.preprocessor = preprocessor - self.trained = False - def shorttext_to_vec(self, shorttext): - """ Convert the shorttext into a sparse vector given the dictionary. + def __init__(self, preprocessor: Optional[callable] = None): + """Initialize the classifier. - According to the dictionary (gensim.corpora.Dictionary), convert the given text - into a vector representation, according to the occurence of tokens. - - This function is deprecated and no longer used because it is too slow to run in a loop. - But this is used while doing prediction. - - :param shorttext: short text to be converted. - :return: sparse vector of the vector representation - :type shorttext: str - :rtype: scipy.sparse.dok_matrix + Args: + preprocessor: Text preprocessing function. Default: lowercase. """ - # too slow, deprecated - tokens = tokenize(self.preprocessor(shorttext)) - - vec = dok_matrix((1, len(self.dictionary))) - for token in tokens: - if token in self.dictionary.token2id: - vec[0, self.dictionary.token2id[token]] = 1.0 - - return vec[0, :] + CompactIOMachine.__init__( + self, + {'classifier': 'maxent'}, + 'maxent', + ['_classlabels.txt', '.json', '.weights.h5', '_labels2idx.json', '_tokens2idx.json'] + ) - def index_classlabels(self): - """ Index the class outcome labels. + if preprocessor is None: + preprocessor = lambda s: s.lower() - Index the class outcome labels into integers, for neural network implementation. + self.preprocess_func = preprocessor + self.trained = False - """ - self.labels2idx = {label: idx for idx, label in enumerate(self.classlabels)} + def shorttext_to_vec(self, shorttext: str) -> sparse.SparseArray: + """Convert short text to sparse vector. 
- def convert_classdict_to_XY(self, classdict): - """ Convert the training data into sparse matrices for training. + Args: + shorttext: Input text. - :param classdict: training data - :return: a tuple, consisting of sparse matrices for X (training data) and y (the labels of the training data) - :type classdict: dict - :rtype: tuple + Returns: + Sparse vector representation. """ - nb_data = sum([len(classdict[k]) for k in classdict]) - X = dok_matrix((nb_data, len(self.dictionary))) - y = dok_matrix((nb_data, len(self.labels2idx))) - - rowid = 0 - for label in classdict: - if label in self.labels2idx.keys(): - for shorttext in classdict[label]: - tokens = tokenize(self.preprocessor(shorttext)) - for token in tokens: - X[rowid, self.dictionary.token2id[token]] += 1.0 - y[rowid, self.labels2idx[label]] = 1. - rowid += 1 - - return X, y - - def train(self, classdict, nb_epochs=500, l2reg=0.01, bias_l2reg=0.01, optimizer='adam'): - """ Train the classifier. - - Given the training data, train the classifier. - - :param classdict: training data - :param nb_epochs: number of epochs (Defauly: 500) - :param l2reg: L2 regularization coefficient (Default: 0.01) - :param bias_l2reg: L2 regularization coefficient for bias (Default: 0.01) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: None - :type classdict: dict - :type nb_epochs: int - :type l2reg: float - :type bias_l2reg: float - :type optimizer: str + tokens = tokenize(self.preprocess_func(shorttext)) + token_indices = [ + self.token2idx.get(token) + for token in tokens + if token in self.token2idx.keys() + ] + + vec = sparse.COO( + [[0]*len(token_indices), token_indices], + [1.0]*len(token_indices), + shape=(1, len(self.token2idx)) + ) + + return vec + + def train( + self, + classdict: dict[str, list[str]], + nb_epochs: int = 500, + l2reg: float = 0.01, + bias_l2reg: float = 0.01, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" + ) -> None: + """Train the classifier. + + Args: + classdict: Training data. + nb_epochs: Number of training epochs. Default: 500. + l2reg: L2 regularization coefficient. Default: 0.01. + bias_l2reg: L2 regularization for bias. Default: 0.01. + optimizer: Optimizer. Default: adam. """ - self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict, - preprocess_and_tokenize=lambda s: tokenize(self.preprocessor(s))) - self.index_classlabels() - - X, y = self.convert_classdict_to_XY(classdict) + self.classlabels = sorted(classdict.keys()) + self.labels2idx = {label: idx for idx, label in enumerate(self.classlabels)} - kmodel = logistic_framework(len(self.dictionary), - len(self.classlabels), - l2reg=l2reg, - bias_l2reg=bias_l2reg, - optimizer=optimizer) - kmodel.fit(X.toarray(), y.toarray(), epochs=nb_epochs) + dtm_npdict_matrix, y = convert_classdict_to_xy( + classdict, self.labels2idx, preprocess_func=self.preprocess_func, tokenize_func=tokenize + ) + self.token2idx = { + token: idx + for idx, token in enumerate(dtm_npdict_matrix._lists_keystrings[1]) + } + + kmodel = logistic_framework( + dtm_npdict_matrix.dimension_sizes[1], + len(self.classlabels), + l2reg=l2reg, + bias_l2reg=bias_l2reg, + optimizer=optimizer + ) + kmodel.fit(dtm_npdict_matrix.to_numpy(), 
y.todense(), epochs=nb_epochs) self.model = kmodel self.trained = True - def savemodel(self, nameprefix): - """ Save the trained model into files. + def savemodel(self, nameprefix: str) -> None: + """Save the trained model to files. - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There will be give files produced, one name ending with "_classlabels.txt", one with ".json", - one with ".weights.h5", one with "_labelidx.pkl", and one with "_dictionary.dict". + Args: + nameprefix: Prefix for output files. - If there is no trained model, a `ModelNotTrainedException` will be thrown. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. """ if not self.trained: raise e.ModelNotTrainedException() kerasio.save_model(nameprefix, self.model) + open(nameprefix+'_tokens2idx.json', 'wb').write(orjson.dumps(self.token2idx)) + open(nameprefix+'_classlabels.txt', 'w').write('\n'.join(self.classlabels)) + open(nameprefix+'_labels2idx.json', 'wb').write(orjson.dumps(self.labels2idx)) - self.dictionary.save(nameprefix+'_dictionary.dict') - - labelfile = open(nameprefix+'_classlabels.txt', 'w') - labelfile.write('\n'.join(self.classlabels)) - labelfile.close() - - pickle.dump(self.labels2idx, open(nameprefix+'_labelidx.pkl', 'wb')) - - def loadmodel(self, nameprefix): - """ Load a trained model from files. - - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_classlabels.txt", ".json", ".weights.h5", "_labelidx.pkl", and "_dictionary.dict". + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model from files. - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction or saving the model. 
- - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ self.model = kerasio.load_model(nameprefix) - - self.dictionary = Dictionary.load(nameprefix+'_dictionary.dict') - - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = [s.strip() for s in labelfile.readlines()] - labelfile.close() - - self.labels2idx = pickle.load(open(nameprefix+'_labelidx.pkl', 'rb')) - + self.token2idx = orjson.loads(open(nameprefix+"_tokens2idx.json", "rb").read()) + self.classlabels = [ + s.strip() + for s in open(nameprefix+'_classlabels.txt', 'r').readlines() + ] + self.labels2idx = orjson.loads(open(nameprefix+"_labels2idx.json", "rb").read()) self.trained = True - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all class labels. + + Args: + shorttext: Input text. - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Dictionary mapping class labels to scores. - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. 
""" if not self.trained: raise e.ModelNotTrainedException() vec = self.shorttext_to_vec(shorttext) - predictions = self.model.predict(vec.toarray()) + predictions = self.model.predict(vec.todense()) - # wrangle output result - scoredict = {classlabel: predictions[0][idx] for idx, classlabel in enumerate(self.classlabels)} + scoredict = { + classlabel: predictions[0][idx] + for idx, classlabel in enumerate(self.classlabels) + } return scoredict -def load_maxent_classifier(name, compact=True): - """ Load the maximum entropy classifier from saved model. +def load_maxent_classifier(name: str, compact: bool=True) -> MaxEntClassifier: + """Load a MaxEntClassifier from file. - Given a moel file(s), load the maximum entropy classifier. + Args: + name: Model name (compact) or file prefix (non-compact). + compact: Whether to load compact model. Default: True. - :param name: name or prefix of the file, if compact is True or False respectively - :param compact: whether the model file is compact (Default:True) - :return: maximum entropy classifier - :type name: str - :type compact: bool - :rtype: MaxEntClassifier + Returns: + MaxEntClassifier instance. 
""" classifier = MaxEntClassifier() if compact: diff --git a/src/shorttext/classifiers/bow/topic/SkLearnClassification.py b/src/shorttext/classifiers/bow/topic/SkLearnClassification.py index a069eaba..a4cd0a5d 100644 --- a/src/shorttext/classifiers/bow/topic/SkLearnClassification.py +++ b/src/shorttext/classifiers/bow/topic/SkLearnClassification.py @@ -1,132 +1,141 @@ -import os +from typing import Optional, Literal +import numpy as np +import numpy.typing as npt import joblib +import sklearn from ....utils import textpreprocessing as textpreprocess from ....generators import load_autoencoder_topicmodel, load_gensimtopicmodel from ....generators import LDAModeler, LSIModeler, RPModeler, AutoencodingTopicModeler +from ....generators import LatentTopicModeler from ....utils import classification_exceptions as e from ....utils import compactmodel_io as cio +from ...base import AbstractScorer -class TopicVectorSkLearnClassifier: - """ - This is a classifier that wraps any supervised learning algorithm in `scikit-learn`, - and use the topic vectors output by the topic modeler :class:`LatentTopicModeler` that - wraps the topic models in `gensim`. +class TopicVectorSkLearnClassifier(AbstractScorer): + """Classifier using topic vectors with scikit-learn. - # Reference + Wraps any scikit-learn supervised learning algorithm and uses + topic vectors from LatentTopicModeler as features. - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). 
- Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL - `_] + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 """ - def __init__(self, topicmodeler, sklearn_classifier): - """ Initialize the classifier. - :param topicmodeler: a topic modeler - :param sklearn_classifier: a scikit-learn classifier - :type topicmodeler: LatentTopicModeler - :type sklearn_classifier: sklearn.base.BaseEstimator + def __init__( + self, + topicmodeler: LatentTopicModeler, + sklearn_classifier: sklearn.base.BaseEstimator + ): + """Initialize the classifier. + + Args: + topicmodeler: A topic modeler instance. + sklearn_classifier: A scikit-learn classifier instance. """ self.topicmodeler = topicmodeler self.classifier = sklearn_classifier self.trained = False - def train(self, classdict, *args, **kwargs): - """ Train the classifier. + def train(self, classdict: dict[str, list[str]], *args, **kwargs) -> None: + """Train the classifier. - If the topic modeler does not have a trained model, it will raise `ModelNotTrainedException`. + Args: + classdict: Training data with class labels as keys and texts as values. + *args: Arguments passed to scikit-learn classifier fit(). + **kwargs: Arguments passed to scikit-learn classifier fit(). - :param classdict: training data - :param args: arguments to be passed to the `fit` method of the scikit-learn classifier - :param kwargs: arguments to be passed to the `fit` method of the scikit-learn classifier - :return: None - :raise: ModelNotTrainedException - :type classdict: dict + Raises: + ModelNotTrainedException: If topic modeler is not trained. 
""" - X = [] + x = [] y = [] self.classlabels = sorted(classdict.keys()) # classlabels must be sorted like the topic modelers - for classidx, classlabel in zip(range(len(self.classlabels)), self.classlabels): - topicvecs = [self.topicmodeler.retrieve_topicvec(topic) for topic in classdict[classlabel]] - X += topicvecs + for classidx, classlabel in enumerate(self.classlabels): + topicvecs = [ + self.topicmodeler.retrieve_topicvec(shorttext) + for shorttext in classdict[classlabel] + ] + x += topicvecs y += [classidx]*len(topicvecs) - self.classifier.fit(X, y, *args, **kwargs) + self.classifier.fit(x, y, *args, **kwargs) self.trained = True - def getvector(self, shorttext): - """ Retrieve the topic vector representation of the given short text. + def getvector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. + + Args: + shorttext: Input text. - If the topic modeler does not have a trained model, it will raise `ModelNotTrainedException`. + Returns: + Topic vector representation. - :param shorttext: short text - :return: topic vector representation - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: numpy.ndarray + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise e.ModelNotTrainedException() return self.topicmodeler.retrieve_topicvec(shorttext) - def classify(self, shorttext): - """ Give the highest-scoring class of the given short text according to the classifier. + def classify(self, shorttext: str) -> str: + """Classify short text into a class label. - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. + Args: + shorttext: Input text to classify. - :param shorttext: short text - :return: class label of the classification result of the given short text - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: str + Returns: + Predicted class label. 
+ + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise e.ModelNotTrainedException() topicvec = self.getvector(shorttext) return self.classlabels[self.classifier.predict([topicvec])[0]] - def score(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. + def score(self, shorttext: str) -> dict[str, float]: + """Compute classification scores for all classes. + + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. + Returns: + Dictionary mapping class labels to scores. - :param shorttext: short text - :return: dictionary of scores of the text to all classes - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: dict + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise e.ModelNotTrainedException() topicvec = self.getvector(shorttext) - scoredict = {classlabel: self.classifier.score([topicvec], [classidx]) - for classidx, classlabel in enumerate(self.classlabels)} + scoredict = { + classlabel: self.classifier.score([topicvec], [classidx]) + for classidx, classlabel in enumerate(self.classlabels) + } return scoredict - def savemodel(self, nameprefix): - """ Save the model. + def savemodel(self, nameprefix: str) -> None: + """Save model to files. - Save the topic model and the trained scikit-learn classification model. The scikit-learn - model will have the name `nameprefix` followed by the extension `.pkl`. The - topic model is the same as the one in `LatentTopicModeler`. + Saves the topic model, scikit-learn classifier, and class labels. - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. + Args: + nameprefix: Prefix for output files. 
- :param nameprefix: prefix of the paths of the model files - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise e.ModelNotTrainedException() @@ -136,254 +145,246 @@ def savemodel(self, nameprefix): labelfile.write('\n'.join(self.classlabels)) labelfile.close() - def loadmodel(self, nameprefix): - """ Load the classification model together with the topic model. + def loadmodel(self, nameprefix: str) -> None: + """Load model from files. - :param nameprefix: prefix of the paths of the model files - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ self.topicmodeler.loadmodel(nameprefix) self.classifier = joblib.load(nameprefix+'.pkl') - # for backward compatibility, shorttext<1.0.0 does not have _classlabels.txt - if os.path.exists(nameprefix+'_classlabels.txt'): - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = [s.strip() for s in labelfile.readlines()] - labelfile.close() - else: - self.classlabels = self.topicmodeler.classlabels - - def save_compact_model(self, name): - """ Save the model. + labelfile = open(nameprefix+'_classlabels.txt', 'r') + self.classlabels = [s.strip() for s in labelfile.readlines()] + labelfile.close() - Save the topic model and the trained scikit-learn classification model in one compact model file. + def save_compact_model(self, name: str) -> None: + """Save model as compact file. - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. + Args: + name: Name of the compact model file. - :param name: name of the compact model file - :return: None - :type name: str + Raises: + ModelNotTrainedException: If model not trained. 
""" topicmodel_info = self.topicmodeler.get_info() - cio.save_compact_model(name, self.savemodel, 'topic_sklearn', - topicmodel_info['suffices']+['.pkl', '_classlabels.txt'], - {'classifier': 'topic_sklearn', 'topicmodel': topicmodel_info['classifier']}) - - def load_compact_model(self, name): - """ Load the classification model together with the topic model from a compact file. - - :param name: name of the compact model file - :return: None - :type name: str + cio.save_compact_model( + name, + self.savemodel, + 'topic_sklearn', + topicmodel_info['suffices']+['.pkl', '_classlabels.txt'], + { + 'classifier': 'topic_sklearn', + 'topicmodel': topicmodel_info['classifier'] + } + ) + + def load_compact_model(self, name: str) -> None: + """Load model from compact file. + + Args: + name: Name of the compact model file. """ - cio.load_compact_model(name, self.loadmodel, 'topic_sklearn', - {'classifier': 'topic_sklearn', 'topicmodel': None}) + cio.load_compact_model( + name, + self.loadmodel, + 'topic_sklearn', + {'classifier': 'topic_sklearn', 'topicmodel': None} + ) self.trained = True -def train_gensim_topicvec_sklearnclassifier(classdict, - nb_topics, - sklearn_classifier, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - topicmodel_algorithm='lda', - toweigh=True, - normalize=True, - gensim_paramdict={}, - sklearn_paramdict={}): - """ Train the supervised learning classifier, with features given by topic vectors. - - It trains a topic model, and with its topic vector representation, train a supervised - learning classifier. The instantiated (not trained) scikit-learn classifier must be - passed into the argument. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). 
- - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL - `_] - - :param classdict: training data - :param nb_topics: number of topics in the topic model - :param sklearn_classifier: instantiated scikit-learn classifier - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param topicmodel_algorithm: topic model algorithm (Default: 'lda') - :param toweigh: whether to weigh the words using tf-idf (Default: True) - :param normalize: whether the retrieved topic vectors are normalized (Default: True) - :param gensim_paramdict: arguments to be passed on to the `train` method of the `gensim` topic model - :param sklearn_paramdict: arguments to be passed on to the `fit` method of the `sklearn` classification algorithm - :return: a trained classifier - :type classdict: dict - :type nb_topics: int - :type sklearn_classifier: sklearn.base.BaseEstimator - :type preprocessor: function - :type topicmodel_algorithm: str - :type toweigh: bool - :type normalize: bool - :type gensim_paramdict: dict - :type sklearn_paramdict: dict - :rtype: TopicVectorSkLearnClassifier +def train_gensim_topicvec_sklearnclassifier( + classdict: dict[str, list[str]], + nb_topics: int, + sklearn_classifier: sklearn.base.BaseEstimator, + preprocessor: Optional[callable] = None, + topicmodel_algorithm: Literal["lda", "lsi", "rp"] = 'lda', + toweigh: bool = True, + normalize: bool = True, + gensim_paramdict: Optional[dict] = None, + sklearn_paramdict: Optional[dict] = None +) -> TopicVectorSkLearnClassifier: + """Train a classifier with gensim topic vectors and scikit-learn. + + Trains a topic model (LDA, LSI, or RP), then uses the topic vectors + as features to train a scikit-learn classifier. + + Args: + classdict: Training data. 
+ nb_topics: Number of topics. + sklearn_classifier: Scikit-learn classifier instance (not trained). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + topicmodel_algorithm: Topic model algorithm. Default: lda. + toweigh: Apply tf-idf weighting. Default: True. + normalize: Normalize topic vectors. Default: True. + gensim_paramdict: Arguments for gensim topic model. + sklearn_paramdict: Arguments for scikit-learn classifier. + + Returns: + Trained TopicVectorSkLearnClassifier. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). + + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 """ - # topic model training + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + if gensim_paramdict is None: + gensim_paramdict = {} + if sklearn_paramdict is None: + sklearn_paramdict = {} + modelerdict = {'lda': LDAModeler, 'lsi': LSIModeler, 'rp': RPModeler} - topicmodeler = modelerdict[topicmodel_algorithm](preprocessor=preprocessor, - toweigh=toweigh, - normalize=normalize) + topicmodeler = modelerdict[topicmodel_algorithm]( + preprocessor=preprocessor, + toweigh=toweigh, + normalize=normalize + ) topicmodeler.train(classdict, nb_topics, **gensim_paramdict) - # intermediate classification training classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier) classifier.train(classdict, **sklearn_paramdict) return classifier -def load_gensim_topicvec_sklearnclassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the classifier, a wrapper that uses scikit-learn classifier, with - feature vectors given by a topic model, from files. 
+def load_gensim_topicvec_sklearnclassifier( + name: str, + preprocessor: Optional[callable] = None, + compact: bool = True +) -> TopicVectorSkLearnClassifier: + """Load a classifier with gensim topic vectors from files. - # Reference + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Load compact model. Default: True. - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). + Returns: + TopicVectorSkLearnClassifier instance. - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL - `_] + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). - :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a trained classifier - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVectorSkLearnClassifier + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. 
+ http://dl.acm.org/citation.cfm?id=1367510 """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + if compact: - # load the compact model modelerdict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler} topicmodel_name = cio.get_model_config_field(name, 'topicmodel') classifier = TopicVectorSkLearnClassifier(modelerdict[topicmodel_name](preprocessor=preprocessor), None) classifier.load_compact_model(name) classifier.trained = True - - # return the instance return classifier else: - # loading topic model topicmodeler = load_gensimtopicmodel(name, preprocessor=preprocessor) - - # loading intermediate model sklearn_classifier = joblib.load(name + '.pkl') - - # the wrapped classifier classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier) classifier.trained = True - - # return the instance return classifier -def train_autoencoder_topic_sklearnclassifier(classdict, - nb_topics, - sklearn_classifier, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - normalize=True, - keras_paramdict={}, - sklearn_paramdict={}): - """ Train the supervised learning classifier, with features given by topic vectors. - - It trains an autoencoder topic model, and with its encoded vector representation, train a supervised - learning classifier. The instantiated (not trained) scikit-learn classifier must be - passed into the argument. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). - - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. 
(2008) [`ACL - `_] - - :param classdict: training data - :param nb_topics: number topics, i.e., number of encoding dimensions - :param sklearn_classifier: instantiated scikit-learn classifier - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param normalize: whether the retrieved topic vectors are normalized (Default: True) - :param keras_paramdict: arguments to be passed to keras for training autoencoder - :param sklearn_paramdict: arguemtnst to be passed to scikit-learn for fitting the classifier - :return: a trained classifier - :type classdict: dict - :type nb_topics: int - :type sklearn_classifier: sklearn.base.BaseEstimator - :type preprocessor: function - :type normalize: bool - :rtype: TopicVectorSkLearnClassifier +def train_autoencoder_topic_sklearnclassifier( + classdict: dict[str, list[str]], + nb_topics: int, + sklearn_classifier: sklearn.base.BaseEstimator, + preprocessor: Optional[callable] = None, + normalize: bool = True, + keras_paramdict: Optional[dict] = None, + sklearn_paramdict: Optional[dict] = None +) -> TopicVectorSkLearnClassifier: + """Train a classifier with autoencoder topic vectors and scikit-learn. + + Trains an autoencoder topic model, then uses the encoded vectors + as features to train a scikit-learn classifier. + + Args: + classdict: Training data. + nb_topics: Number of encoding dimensions. + sklearn_classifier: Scikit-learn classifier instance (not trained). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + normalize: Normalize topic vectors. Default: True. + keras_paramdict: Arguments for Keras autoencoder training. + sklearn_paramdict: Arguments for scikit-learn classifier. + + Returns: + Trained TopicVectorSkLearnClassifier. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). 
+ + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 """ - # train the autoencoder + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + if keras_paramdict is None: + keras_paramdict = {} + if sklearn_paramdict is None: + sklearn_paramdict = {} + autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, normalize=normalize) autoencoder.train(classdict, nb_topics, **keras_paramdict) - # intermediate classification training classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier) classifier.train(classdict, **sklearn_paramdict) return classifier -def load_autoencoder_topic_sklearnclassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the classifier, a wrapper that uses scikit-learn classifier, with - feature vectors given by an autocoder topic model, from files. +def load_autoencoder_topic_sklearnclassifier( + name: str, + preprocessor: Optional[callable] = None, + compact: bool = True +) -> TopicVectorSkLearnClassifier: + """Load a classifier with autoencoder topic vectors from files. - # Reference + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Load compact model. Default: True. - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). + Returns: + TopicVectorSkLearnClassifier instance. - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. 
(2008) [`ACL - `_] + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). - :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a trained classifier - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVectorSkLearnClassifier + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + if compact: - # load the compact model classifier = TopicVectorSkLearnClassifier(AutoencodingTopicModeler(preprocessor=preprocessor), None) classifier.load_compact_model(name) classifier.trained = True - - # return the instance return classifier else: - # load the autoencoder autoencoder = load_autoencoder_topicmodel(name, preprocessor=preprocessor) - - # load intermediate model sklearn_classifier = joblib.load(name + '.pkl') - - # the wrapper classifier classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier) classifier.trained = True - - # return the instance return classifier diff --git a/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py b/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py index bf429af4..939ce0ed 100644 --- a/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py +++ b/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py @@ -1,108 +1,102 @@ -from ....utils import textpreprocessing as textpreprocess +from typing import Optional, Literal + from 
....generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler from ....generators import load_autoencoder_topicmodel, load_gensimtopicmodel +from ...base import AbstractScorer -class TopicVecCosineDistanceClassifier: - """ - This is a class that implements a classifier that perform classification based on - the cosine similarity between the topic vectors of the user-input short texts and various classes. - The topic vectors are calculated using :class:`LatentTopicModeler`. +class TopicVecCosineDistanceClassifier(AbstractScorer): + """Classifier using cosine similarity with topic vectors. + + Classifies short text based on cosine similarity between topic vectors + of the input and class centroids. Topic vectors are generated by a + LatentTopicModeler. """ - def __init__(self, topicmodeler): - """ Initialize the classifier. - :param topicmodeler: topic modeler - :type topicmodeler: LatentTopicModeler + def __init__(self, topicmodeler: LatentTopicModeler): + """Initialize the classifier. + + Args: + topicmodeler: A LatentTopicModeler instance. """ self.topicmodeler = topicmodeler - def score(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. + def score(self, shorttext: str) -> dict[str, float]: + """Calculate cosine similarity to all class topic vectors. - :param shorttext: short text - :return: dictionary of scores of the text to all classes - :type shorttext: str - :rtype: dict + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to similarity scores. 
""" - # scoredict = defaultdict(lambda : 0.0) - # similarities = self.topicmodeler.matsim[self.topicmodeler.retrieve_corpus_topicdist(shorttext)] - # for label, similarity in zip(self.topicmodeler.classlabels, similarities): - # scoredict[label] = similarity - # return dict(scoredict) return self.topicmodeler.get_batch_cos_similarities(shorttext) - def loadmodel(self, nameprefix): - """ Load the topic model with the given prefix of the file paths. - - Given the prefix of the file paths, load the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). + def loadmodel(self, nameprefix: str) -> None: + """Load the topic model. - This is essentialing loading the topic modeler :class:`LatentTopicModeler`. - - :param nameprefix: prefix of the file paths - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ self.topicmodeler.loadmodel(nameprefix) - def savemodel(self, nameprefix): - """ Save the model with names according to the prefix. - - Given the prefix of the file paths, save the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). + def savemodel(self, nameprefix: str) -> None: + """Save the topic model. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Args: + nameprefix: Prefix for output files. - This is essentialing saving the topic modeler :class:`LatentTopicModeler`. - - :param nameprefix: prefix of the file paths - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str + Raises: + ModelNotTrainedException: If model not trained. 
""" self.topicmodeler.savemodel(nameprefix) - def load_compact_model(self, name): + def load_compact_model(self, name: str) -> None: + """Load compact model. + + Args: + name: Name of the compact model file. + """ self.topicmodeler.load_compact_model(name) - def save_compact_model(self, name): + def save_compact_model(self, name: str) -> None: + """Save compact model. + + Args: + name: Name of the compact model file. + """ self.topicmodeler.save_compact_model(name) -def train_gensimtopicvec_cosineClassifier(classdict, - nb_topics, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - algorithm='lda', - toweigh=True, - normalize=True, - *args, **kwargs): - """ Return a cosine distance classifier, i.e., :class:`TopicVecCosineDistanceClassifier`, while - training a gensim topic model in between. - - :param classdict: training data - :param nb_topics: number of latent topics - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda) - :param toweigh: whether to weigh the words using tf-idf. (Default: True) - :param normalize: whether the retrieved topic vectors are normalized. 
(Default: True) - :param args: arguments to pass to the `train` method for gensim topic models - :param kwargs: arguments to pass to the `train` method for gensim topic models - :return: a classifier that scores the short text based on the topic model - :type classdict: dict - :type nb_topics: int - :type preprocessor: function - :type algorithm: str - :type toweigh: bool - :type normalize: bool - :rtype: TopicVecCosineDistanceClassifier +def train_gensimtopicvec_cosineClassifier( + classdict: dict[str, list[str]], + nb_topics: int, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + algorithm: Literal["lda", "lsi", "rp"] = "lda", + toweigh: bool = True, + normalize: bool = True, + *args, **kwargs +) -> TopicVecCosineDistanceClassifier: + """Train a gensim topic model and return a cosine classifier. + + Args: + classdict: Training data. + nb_topics: Number of latent topics. + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + algorithm: Topic modeling algorithm. Options: lda, lsi, rp. Default: lda. + toweigh: Whether to apply tf-idf weighting. Default: True. + normalize: Whether to normalize topic vectors. Default: True. + *args: Additional arguments for gensim topic model. + **kwargs: Additional keyword arguments for gensim topic model. + + Returns: + TopicVecCosineDistanceClassifier instance. """ # train topic model topicmodeler = GensimTopicModeler(preprocessor=preprocessor, + tokenizer=tokenizer, algorithm=algorithm, toweigh=toweigh, normalize=normalize) @@ -112,77 +106,76 @@ def train_gensimtopicvec_cosineClassifier(classdict, return TopicVecCosineDistanceClassifier(topicmodeler) -def load_gensimtopicvec_cosineClassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load a gensim topic model from files and return a cosine distance classifier. 
- - Given the prefix of the files of the topic model, return a cosine distance classifier - based on this model, i.e., :class:`TopicVecCosineDistanceClassifier`. +def load_gensimtopicvec_cosineClassifier( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool=True +) -> TopicVecCosineDistanceClassifier: + """Load a gensim topic model and return a cosine classifier. - The files include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Whether to load compact model. Default: True. - :param name: name (if compact=True) or prefix (if compact=False) of the file paths - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a classifier that scores the short text based on the topic model - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVecCosineDistanceClassifier + Returns: + TopicVecCosineDistanceClassifier instance. """ - topicmodeler = load_gensimtopicmodel(name, preprocessor=preprocessor, compact=compact) + topicmodeler = load_gensimtopicmodel( + name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact + ) return TopicVecCosineDistanceClassifier(topicmodeler) -def train_autoencoder_cosineClassifier(classdict, - nb_topics, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - normalize=True, - *args, **kwargs): - """ Return a cosine distance classifier, i.e., :class:`TopicVecCosineDistanceClassifier`, while - training an autoencoder as a topic model in between. 
- - :param classdict: training data - :param nb_topics: number of topics, i.e., number of encoding dimensions - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :param args: arguments to be passed to keras model fitting - :param kwargs: arguments to be passed to keras model fitting - :return: a classifier that scores the short text based on the autoencoder - :type classdict: dict - :type nb_topics: int - :type preprocessor: function - :type normalize: bool - :rtype: TopicVecCosineDistanceClassifier +def train_autoencoder_cosineClassifier( + classdict: dict[str, list[str]], + nb_topics: int, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + normalize: bool = True, + *args, **kwargs +) -> TopicVecCosineDistanceClassifier: + """Train an autoencoder topic model and return a cosine classifier. + + Args: + classdict: Training data. + nb_topics: Number of topics (encoding dimensions). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + normalize: Whether to normalize topic vectors. Default: True. + *args: Additional arguments for Keras model fitting. + **kwargs: Additional keyword arguments for Keras model fitting. + + Returns: + TopicVecCosineDistanceClassifier instance. """ # train the autoencoder - autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, normalize=normalize) + autoencoder = AutoencodingTopicModeler( + preprocessor=preprocessor, tokenizer=tokenizer, normalize=normalize + ) autoencoder.train(classdict, nb_topics, *args, **kwargs) # cosine distance classifier return TopicVecCosineDistanceClassifier(autoencoder) -def load_autoencoder_cosineClassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load an autoencoder from files for topic modeling, and return a cosine classifier. 
- - Given the prefix of the file paths, load the model into files, with name given by the prefix. - There are files with names ending with "_encoder.json" and "_encoder.weights.h5", which are - the JSON and HDF5 files for the encoder respectively. - They also include a gensim dictionary (.gensimdict). - - :param name: name (if compact=True) or prefix (if compact=False) of the file paths - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a classifier that scores the short text based on the autoencoder - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVecCosineDistanceClassifier +def load_autoencoder_cosineClassifier( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool = True +) -> TopicVecCosineDistanceClassifier: + """Load an autoencoder topic model and return a cosine classifier. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Whether to load compact model. Default: True. + + Returns: + TopicVecCosineDistanceClassifier instance. 
""" - autoencoder = load_autoencoder_topicmodel(name, preprocessor=preprocessor, compact=compact) + autoencoder = load_autoencoder_topicmodel( + name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact + ) return TopicVecCosineDistanceClassifier(autoencoder) diff --git a/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py b/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py index bf228f78..4aff4a42 100644 --- a/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py +++ b/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py @@ -1,162 +1,149 @@ -import json import os import warnings +from typing import Any, Optional, Annotated import numpy as np -import pandas as pd +import numpy.typing as npt +from gensim.models.keyedvectors import KeyedVectors +from tensorflow.keras.models import Model +import orjson from ....utils import kerasmodel_io as kerasio -from ....utils import classification_exceptions as e +from ....utils.classification_exceptions import ModelNotTrainedException from ....utils import tokenize from ....utils.compactmodel_io import CompactIOMachine -from typing import Union, List, Dict, Any +from ...base import AbstractScorer -class VarNNEmbeddedVecClassifier(CompactIOMachine): - """ - This is a wrapper for various neural network algorithms - for supervised short text categorization. - Each class label has a few short sentences, where each token is converted - to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). - The sentences are represented by a matrix, or rank-2 array. - The type of neural network has to be passed when training, and it has to be of - type :class:`keras.models.Sequential`. The number of outputs of the models has to match - the number of class labels in the training data. - To perform prediction, the input short sentences is converted to a unit vector - in the same way. 
The score is calculated according to the trained neural network model. - - Examples of the models can be found in `frameworks`. - - A pre-trained Google Word2Vec model can be downloaded `here - `_. +class VarNNEmbeddedVecClassifier(AbstractScorer, CompactIOMachine): + """Neural network classifier for short text categorization. + + Wraps Keras neural network models for supervised short text classification. + Each token is converted to an embedded vector using a pre-trained word-embedding + model (e.g., Word2Vec). Sentences are represented as matrices (rank-2 or rank-3 arrays) + and processed by the neural network. + + The neural network model must be a Keras Sequential model with output dimension + matching the number of class labels. + + Reference: + Pre-trained Word2Vec: https://code.google.com/archive/p/word2vec/ + Example models available in the frameworks module. """ - def __init__(self, wvmodel, vecsize=None, maxlen=15, with_gensim=False): - """ Initialize the classifier. - - :param wvmodel: Word2Vec model - :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model) - :param maxlen: maximum number of words in a sentence (Default: 15) - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type vecsize: int - :type maxlen: int + + def __init__( + self, + wvmodel: KeyedVectors, + vecsize: Optional[int] = None, + maxlen: int = 15, + with_gensim: bool = False + ): + """Initialize the classifier. + + Args: + wvmodel: Word embedding model (e.g., Word2Vec). + vecsize: Vector size. Default: None (extracted from model). + maxlen: Maximum number of words per sentence. Default: 15. + with_gensim: Whether to use gensim format. Default: False. 
""" - CompactIOMachine.__init__(self, {'classifier': 'nnlibvec'}, 'nnlibvec', ['_classlabels.txt', '.json', '.weights.h5', '_config.json']) + CompactIOMachine.__init__( + self, + {'classifier': 'nnlibvec'}, + 'nnlibvec', + ['_classlabels.txt', '.json', '.weights.h5', '_config.json'] + ) self.wvmodel = wvmodel - self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize + self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize self.maxlen = maxlen self.with_gensim = False if not with_gensim else with_gensim self.trained = False - def convert_trainingdata_matrix(self, classdict): - """ Convert the training data into format put into the neural networks. + def convert_trainingdata_matrix( + self, + classdict: dict[str, list[str]] + ) -> tuple[list[str], Annotated[npt.NDArray[np.float64], "3D Array"], Annotated[npt.NDArray[np.int64], "2D Array"]]: + """Convert training data to neural network input format. - Convert the training data into format put into the neural networks. - This is called by :func:`~train`. + Args: + classdict: Training data with class labels as keys and texts as values. - :param classdict: training data - :return: a tuple of three, containing a list of class labels, matrix of embedded word vectors, and corresponding outputs - :type classdict: dict - :rtype: (list, numpy.ndarray, list) + Returns: + Tuple of (class_labels, embedded_vectors, labels_array). 
""" - classlabels = classdict.keys() + classlabels = sorted(classdict.keys()) # sort the class labels to ensure uniqueness lblidx_dict = dict(zip(classlabels, range(len(classlabels)))) - # tokenize the words, and determine the word length phrases = [] indices = [] for label in classlabels: for shorttext in classdict[label]: - shorttext = shorttext if type(shorttext)==str else '' + shorttext = shorttext if isinstance(shorttext, str) else '' category_bucket = [0]*len(classlabels) category_bucket[lblidx_dict[label]] = 1 indices.append(category_bucket) phrases.append(tokenize(shorttext)) - # store embedded vectors train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize)) for i in range(len(phrases)): for j in range(min(self.maxlen, len(phrases[i]))): - train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j]) + train_embedvec[i, j, :] = self.word_to_embedvec(phrases[i][j]) indices = np.array(indices, dtype=np.int_) return classlabels, train_embedvec, indices - def train(self, classdict, kerasmodel, nb_epoch=10): - """ Train the classifier. - - The training data and the corresponding keras model have to be given. - - If this has not been run, or a model was not loaded by :func:`~loadmodel`, - a `ModelNotTrainedException` will be raised. - - :param classdict: training data - :param kerasmodel: keras sequential model - :param nb_epoch: number of steps / epochs in training - :return: None - :type classdict: dict - :type kerasmodel: keras.models.Sequential - :type nb_epoch: int + def train( + self, + classdict: dict[str, list[str]], + kerasmodel: Model, + nb_epoch: int = 10 + ): + """Train the classifier. + + Args: + classdict: Training data. + kerasmodel: Keras Sequential model. + nb_epoch: Number of training epochs. Default: 10. + + Raises: + ModelNotTrainedException: If model not loaded. 
""" - # convert classdict to training input vectors self.classlabels, train_embedvec, indices = self.convert_trainingdata_matrix(classdict) - - # train the model kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch) - - # flag switch self.model = kerasmodel self.trained = True - def savemodel(self, nameprefix): - """ Save the trained model into files. + def savemodel(self, nameprefix: str) -> None: + """Save the trained model to files. - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There will be three files produced, one name ending with "_classlabels.txt", one name - ending with ".json", and one name ending with ".weights.h5". For shorttext>=0.4.0, another file - with extension "_config.json" would be created. + Args: + nameprefix: Prefix for output files. - If there is no trained model, a `ModelNotTrainedException` will be thrown. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. """ if not self.trained: - raise e.ModelNotTrainedException() - kerasio.save_model(nameprefix, self.model) - labelfile = open(nameprefix+'_classlabels.txt', 'w') - labelfile.write('\n'.join(self.classlabels)) - labelfile.close() - json.dump({'with_gensim': False, 'maxlen': self.maxlen, 'vecsize': self.vecsize}, - open(nameprefix+'_config.json', 'w')) - - def loadmodel(self, nameprefix): - """ Load a trained model from files. + raise ModelNotTrainedException() - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_classlabels.txt", ".json" and ".weights.h5". For shorttext>=0.4.0, a file with - extension "_config.json" would also be used. - - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction or saving the model. 
- - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str + kerasio.save_model(nameprefix, self.model) + open(nameprefix+'_classlabels.txt', 'w').write('\n'.join(self.classlabels)) + open(nameprefix + '_config.json', 'wb').write( + orjson.dumps( + {'with_gensim': False, 'maxlen': self.maxlen, 'vecsize': self.vecsize} + ) + ) + + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model from files. + + Args: + nameprefix: Prefix for input files. """ self.model = kerasio.load_model(nameprefix) - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = labelfile.readlines() - labelfile.close() - self.classlabels = [s.strip() for s in self.classlabels] + self.classlabels = [line.strip() for line in open(nameprefix+'_classlabels.txt', 'r')] - # check if _config.json exists. - # This file does not exist if the model was created with shorttext<0.4.0 if os.path.exists(nameprefix+'_config.json'): - config = json.load(open(nameprefix+'_config.json', 'r')) - # these fields are present for release >= 1.0.0 + config = orjson.loads(open(nameprefix+'_config.json', 'rb').read()) if 'maxlen' in config: self.maxlen = config['maxlen'] else: @@ -166,9 +153,10 @@ def loadmodel(self, nameprefix): else: self.vecsize = self.wvmodel.vector_size if self.vecsize != self.wvmodel.vector_size: - warnings.warn('Record vector size (%i) is not the same as that of the given word-embedding model (%i)! ' % (self.vecsize, self.wvmodel.vector_size)+ - 'Setting the vector size to be %i, but there may be run time error!' % (self.wvmodel.vector_size), - RuntimeWarning) + warnings.warn( + f'Record vector size ({self.vecsize}) is not the same as that of the given word-embedding model ({self.wvmodel.vector_size})! ' + \ + f'Setting the vector size to be {self.wvmodel.vector_size}, but there may be run time error!' 
+ ) self.vecsize = self.wvmodel.vector_size else: self.maxlen = 15 @@ -178,31 +166,28 @@ def loadmodel(self, nameprefix): self.with_gensim = False self.trained = True - def word_to_embedvec(self, word): - """ Convert the given word into an embedded vector. + def word_to_embedvec(self, word: str) -> npt.NDArray[np.float64]: + """Convert a word to its embedding vector. - Given a word, return the corresponding embedded vector according to - the word-embedding model. If there is no such word in the model, - a vector with zero values are given. + Args: + word: Input word. - :param word: a word - :return: the corresponding embedded vector - :type word: str - :rtype: numpy.ndarray + Returns: + Embedding vector. Returns zeros if word not in vocabulary. """ - return self.wvmodel[word] if word in self.wvmodel else np.zeros(self.vecsize) + return self.wvmodel[word].astype(np.float64) if word in self.wvmodel else np.zeros(self.vecsize) - def shorttext_to_matrix(self, shorttext): - """ Convert the short text into a matrix with word-embedding representation. + def shorttext_to_matrix( + self, + shorttext: str + ) -> Annotated[npt.NDArray[np.float64], "2D Array"]: + """Convert short text to embedding matrix. - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, and put them into a matrix. If a word is not in the model, - that row will be filled with zero. + Args: + shorttext: Input text. - :param shorttext: a short sentence - :return: a matrix of embedded vectors that represent all the tokens in the sentence - :type shorttext: str - :rtype: numpy.ndarray + Returns: + Matrix of shape (maxlen, vecsize) with embedding vectors. 
""" tokens = tokenize(shorttext) matrix = np.zeros((self.maxlen, self.vecsize)) @@ -210,57 +195,56 @@ def shorttext_to_matrix(self, shorttext): matrix[i] = self.word_to_embedvec(tokens[i]) return matrix - def score(self, shorttexts: Union[str, List[str]], model_params: Dict[str, Any] = {}): - """ Calculate the scores for all the class labels for the given short sentence. + def score( + self, + shorttext: str, + model_params: Optional[dict[str, Any]] = None + ) -> dict[str, float]: + """Calculate classification scores for all class labels. - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Args: + shorttext: Input text. + model_params: Additional parameters for model prediction. - :param shorttext: a short sentence - :param model_params: additional parameters to be passed to the model object - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException + Returns: + Dictionary mapping class labels to scores. + + Raises: + ModelNotTrainedException: If not trained. 
""" - is_multiple = True - if isinstance(shorttexts, str): - is_multiple = False - shorttexts = [shorttexts] + if model_params is None: + model_params = {} if not self.trained: - raise e.ModelNotTrainedException() - - # retrieve vector - matrix = np.array([self.shorttext_to_matrix(shorttext) for shorttext in shorttexts]) + raise ModelNotTrainedException() - # classification using the neural network + matrix = np.array([self.shorttext_to_matrix(shorttext)]) predictions = self.model.predict(matrix, **model_params) - # wrangle output result - df = pd.DataFrame(predictions, columns=self.classlabels) - - if not is_multiple: - return df.to_dict('records')[0] - - return df.to_dict('records') - - -def load_varnnlibvec_classifier(wvmodel, name, compact=True, vecsize=None): - """ Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained word-embedding model. - - :param wvmodel: Word2Vec model - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param compact whether model file is compact (Default: True) - :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model) - :return: the classifier - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type name: str - :type compact: bool - :type vecsize: int - :rtype: VarNNEmbeddedVecClassifier + score_dict = { + classlabel: predictions[0, j] + for j, classlabel in enumerate(self.classlabels) + } + + return score_dict + + +def load_varnnlibvec_classifier( + wvmodel: KeyedVectors, + name: str, + compact: bool = True, + vecsize: Optional[int] = None +) -> VarNNEmbeddedVecClassifier: + """Load a VarNNEmbeddedVecClassifier from file. + + Args: + wvmodel: Word embedding model. + name: Model name (compact) or file prefix (non-compact). + compact: Whether to load compact model. Default: True. + vecsize: Vector size. Default: None. + + Returns: + VarNNEmbeddedVecClassifier instance. 
""" classifier = VarNNEmbeddedVecClassifier(wvmodel, vecsize=vecsize) if compact: diff --git a/src/shorttext/classifiers/embed/nnlib/frameworks.py b/src/shorttext/classifiers/embed/nnlib/frameworks.py index 3cdff618..929ae7d7 100644 --- a/src/shorttext/classifiers/embed/nnlib/frameworks.py +++ b/src/shorttext/classifiers/embed/nnlib/frameworks.py @@ -1,6 +1,9 @@ +from typing import Optional, Literal + +from gensim.models.keyedvectors import KeyedVectors from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Activation -from tensorflow.keras.models import Sequential +from tensorflow.keras.models import Sequential, Model from tensorflow.keras.regularizers import l2 @@ -11,49 +14,43 @@ # Paper: Yoon Kim, "Convolutional Neural Networks for Sentence Classification," arXiv:1408.5882 (2014). # ref: https://gist.github.com/entron/b9bc61a74e7cadeb1fec # ref: http://cs231n.github.io/convolutional-networks/ -def CNNWordEmbed(nb_labels, - wvmodel=None, - nb_filters=1200, - n_gram=2, - maxlen=15, - vecsize=300, - cnn_dropout=0.0, - final_activation='softmax', - dense_wl2reg=0.0, - dense_bl2reg=0.0, - optimizer='adam'): - """ Returns the convolutional neural network (CNN/ConvNet) for word-embedded vectors. - - Reference: Yoon Kim, "Convolutional Neural Networks for Sentence Classification," - *EMNLP* 2014, 1746-1751 (arXiv:1408.5882). [`arXiv - `_] - - :param nb_labels: number of class labels - :param wvmodel: pre-trained Gensim word2vec model - :param nb_filters: number of filters (Default: 1200) - :param n_gram: n-gram, or window size of CNN/ConvNet (Default: 2) - :param maxlen: maximum number of words in a sentence (Default: 15) - :param vecsize: length of the embedded vectors in the model (Default: 300) - :param cnn_dropout: dropout rate for CNN/ConvNet (Default: 0.0) - :param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. 
(Default: 'softmax') - :param dense_wl2reg: L2 regularization coefficient (Default: 0.0) - :param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) - :return: keras model (`Sequential` or`Model`) for CNN/ConvNet for Word-Embeddings - :type nb_labels: int - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type nb_filters: int - :type n_gram: int - :type maxlen: int - :type vecsize: int - :type cnn_dropout: float - :type final_activation: str - :type dense_wl2reg: float - :type dense_bl2reg: float - :type optimizer: str - :rtype: keras.models.Model +def CNNWordEmbed( + nb_labels: int, + wvmodel: Optional[KeyedVectors] = None, + nb_filters: int = 1200, + n_gram: int = 2, + maxlen: int = 15, + vecsize: int = 300, + cnn_dropout: float = 0.0, + final_activation: Literal["softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax", + dense_wl2reg: float = 0.0, + dense_bl2reg: float = 0.0, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" +) -> Model: + """Create a CNN for word embeddings. + + Args: + nb_labels: Number of class labels. + wvmodel: Word embedding model. If provided, vecsize is extracted from it. + nb_filters: Number of filters. Default: 1200. + n_gram: N-gram (window size). Default: 2. + maxlen: Maximum sentence length. Default: 15. + vecsize: Embedding vector size. Default: 300. + cnn_dropout: CNN dropout rate. Default: 0.0. + final_activation: Final layer activation. Default: softmax. + dense_wl2reg: L2 regularization for weights. Default: 0.0. + dense_bl2reg: L2 regularization for bias. Default: 0.0. + optimizer: Optimizer. Default: adam. + + Returns: + Keras Sequential model. + + Reference: + Yoon Kim, "Convolutional Neural Networks for Sentence Classification," + EMNLP 2014 (arXiv:1408.5882). 
+ https://arxiv.org/abs/1408.5882 """ - if wvmodel != None: + if wvmodel is not None: vecsize = wvmodel.vector_size model = Sequential() @@ -73,55 +70,44 @@ def CNNWordEmbed(nb_labels, return model -# two layers of CNN, maxpooling, dense -def DoubleCNNWordEmbed(nb_labels, - wvmodel=None, - nb_filters_1=1200, - nb_filters_2=600, - n_gram=2, - filter_length_2=10, - maxlen=15, - vecsize=300, - cnn_dropout_1=0.0, - cnn_dropout_2=0.0, - final_activation='softmax', - dense_wl2reg=0.0, - dense_bl2reg=0.0, - optimizer='adam'): - """ Returns the double-layered convolutional neural network (CNN/ConvNet) for word-embedded vectors. - - :param nb_labels: number of class labels - :param wvmodel: pre-trained Gensim word2vec model - :param nb_filters_1: number of filters for the first CNN/ConvNet layer (Default: 1200) - :param nb_filters_2: number of filters for the second CNN/ConvNet layer (Default: 600) - :param n_gram: n-gram, or window size of first CNN/ConvNet (Default: 2) - :param filter_length_2: window size for second CNN/ConvNet layer (Default: 10) - :param maxlen: maximum number of words in a sentence (Default: 15) - :param vecsize: length of the embedded vectors in the model (Default: 300) - :param cnn_dropout_1: dropout rate for the first CNN/ConvNet layer (Default: 0.0) - :param cnn_dropout_2: dropout rate for the second CNN/ConvNet layer (Default: 0.0) - :param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. (Default: 'softmax') - :param dense_wl2reg: L2 regularization coefficient (Default: 0.0) - :param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: keras sequantial model for CNN/ConvNet for Word-Embeddings - :type nb_labels: int - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type nb_filters_1: int - :type nb_filters_2: int - :type n_gram: int - :type filter_length_2: int - :type maxlen: int - :type vecsize: int - :type cnn_dropout_1: float - :type cnn_dropout_2: float - :type final_activation: str - :type dense_wl2reg: float - :type dense_bl2reg: float - :type optimizer: str - :rtype: keras.models.Model +def DoubleCNNWordEmbed( + nb_labels: int, + wvmodel: Optional[KeyedVectors] = None, + nb_filters_1: int = 1200, + nb_filters_2: int = 600, + n_gram: int = 2, + filter_length_2: int = 10, + maxlen: int = 15, + vecsize: int = 300, + cnn_dropout_1: float = 0.0, + cnn_dropout_2: float = 0.0, + final_activation: Literal["softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax", + dense_wl2reg: float = 0.0, + dense_bl2reg: float = 0.0, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'adam' +) -> Model: + """Create a double-layer CNN for word embeddings. + + Args: + nb_labels: Number of class labels. + wvmodel: Word embedding model. If provided, vecsize is extracted from it. + nb_filters_1: Filters for first layer. Default: 1200. + nb_filters_2: Filters for second layer. Default: 600. + n_gram: N-gram for first layer. Default: 2. + filter_length_2: Window size for second layer. Default: 10. + maxlen: Maximum sentence length. Default: 15. + vecsize: Embedding vector size. Default: 300. + cnn_dropout_1: Dropout for first layer. Default: 0.0. + cnn_dropout_2: Dropout for second layer. Default: 0.0. + final_activation: Final layer activation. Default: softmax. + dense_wl2reg: L2 regularization for weights. Default: 0.0. + dense_bl2reg: L2 regularization for bias. Default: 0.0. + optimizer: Optimizer. Default: adam. + + Returns: + Keras Sequential model. 
""" - if wvmodel != None: + if wvmodel is not None: vecsize = wvmodel.vector_size model = Sequential() @@ -147,59 +133,47 @@ def DoubleCNNWordEmbed(nb_labels, return model -# C-LSTM -# Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, -# "A C-LSTM Neural Network for Text Classification", arXiv:1511.08630 (2015). -def CLSTMWordEmbed(nb_labels, - wvmodel=None, - nb_filters=1200, - n_gram=2, - maxlen=15, - vecsize=300, - cnn_dropout=0.0, - nb_rnnoutdim=1200, - rnn_dropout=0.2, - final_activation='softmax', - dense_wl2reg=0.0, - dense_bl2reg=0.0, - optimizer='adam'): - """ Returns the C-LSTM neural networks for word-embedded vectors. - - Reference: Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, - "A C-LSTM Neural Network for Text Classification," - (arXiv:1511.08630). [`arXiv - `_] - - :param nb_labels: number of class labels - :param wvmodel: pre-trained Gensim word2vec model - :param nb_filters: number of filters (Default: 1200) - :param n_gram: n-gram, or window size of CNN/ConvNet (Default: 2) - :param maxlen: maximum number of words in a sentence (Default: 15) - :param vecsize: length of the embedded vectors in the model (Default: 300) - :param cnn_dropout: dropout rate for CNN/ConvNet (Default: 0.0) - :param nb_rnnoutdim: output dimension for the LSTM networks (Default: 1200) - :param rnn_dropout: dropout rate for LSTM (Default: 0.2) - :param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. (Default: 'softmax') - :param dense_wl2reg: L2 regularization coefficient (Default: 0.0) - :param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: keras sequantial model for CNN/ConvNet for Word-Embeddings - :type nb_labels: int - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type nb_filters: int - :type n_gram: int - :type maxlen: int - :type vecsize: int - :type cnn_dropout: float - :type nb_rnnoutdim: int - :type rnn_dropout: float - :type final_activation: str - :type dense_wl2reg: float - :type dense_bl2reg: float - :type optimizer: str - :rtype: keras.models.Model +def CLSTMWordEmbed( + nb_labels: int, + wvmodel: Optional[KeyedVectors] = None, + nb_filters: int = 1200, + n_gram: int = 2, + maxlen: int = 15, + vecsize: int = 300, + cnn_dropout: float = 0.0, + nb_rnnoutdim: int = 1200, + rnn_dropout: float = 0.2, + final_activation: Literal["softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax", + dense_wl2reg: float = 0.0, + dense_bl2reg: float = 0.0, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" +) -> Model: + """Create a C-LSTM model for word embeddings. + + Args: + nb_labels: Number of class labels. + wvmodel: Word embedding model. If provided, vecsize is extracted from it. + nb_filters: Number of CNN filters. Default: 1200. + n_gram: N-gram (window size). Default: 2. + maxlen: Maximum sentence length. Default: 15. + vecsize: Embedding vector size. Default: 300. + cnn_dropout: CNN dropout rate. Default: 0.0. + nb_rnnoutdim: LSTM output dimension. Default: 1200. + rnn_dropout: LSTM dropout rate. Default: 0.2. + final_activation: Final layer activation. Default: softmax. + dense_wl2reg: L2 regularization for weights. Default: 0.0. + dense_bl2reg: L2 regularization for bias. Default: 0.0. + optimizer: Optimizer. Default: adam. + + Returns: + Keras Sequential model. + + Reference: + Chunting Zhou et al., "A C-LSTM Neural Network for Text Classification," + arXiv:1511.08630 (2015). 
+ https://arxiv.org/abs/1511.08630 """ - if wvmodel != None: + if wvmodel is not None: vecsize = wvmodel.vector_size model = Sequential() diff --git a/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py b/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py index 6fbc22e3..72b90cbd 100644 --- a/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py +++ b/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py @@ -1,154 +1,157 @@ + import pickle from collections import defaultdict +from typing import Optional, Annotated import numpy as np -from scipy.spatial.distance import cosine +import numpy.typing as npt +from gensim.models.keyedvectors import KeyedVectors +from loguru import logger from ....utils.classification_exceptions import ModelNotTrainedException from ....utils import shorttext_to_avgvec from ....utils.compactmodel_io import CompactIOMachine +from ....utils.compute import cosine_similarity class SumEmbeddedVecClassifier(CompactIOMachine): - """ - This is a supervised classification algorithm for short text categorization. - Each class label has a few short sentences, where each token is converted - to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). - They are then summed up and normalized to a unit vector for that particular class labels. - To perform prediction, the input short sentences is converted to a unit vector - in the same way. The similarity score is calculated by the cosine similarity. - - A pre-trained Google Word2Vec model can be downloaded `here - `_. - """ + """Classifier using summed word embeddings. - def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1-cosine(u, v)): - """ Initialize the classifier. + Each class is represented as the sum of word embeddings for its + training sentences, normalized to a unit vector. Prediction uses + cosine similarity between the input vector and class centroids. 
+ + Reference: + Pre-trained Word2Vec: https://code.google.com/archive/p/word2vec/ + """ - :param wvmodel: Word2Vec model - :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model) - :param simfcn: similarity function (Default: cosine similarity) - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type vecsize: int - :type simfcn: function + def __init__( + self, + wvmodel: KeyedVectors, + vecsize: Optional[int] = None, + simfcn: Optional[callable] = None + ): + """Initialize the classifier. + + Args: + wvmodel: Word embedding model (e.g., Word2Vec). + vecsize: Vector size. Default: None (extracted from model). + simfcn: Similarity function. Default: cosine_similarity. """ - CompactIOMachine.__init__(self, {'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl']) + CompactIOMachine.__init__( + self, + {'classifier': 'sumvec'}, + 'sumvec', + ['_embedvecdict.pkl'] + ) self.wvmodel = wvmodel - self.vecsize = self.wvmodel.vector_size if vecsize == None else vecsize - self.simfcn = simfcn + self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize + self.simfcn = simfcn if simfcn is not None else cosine_similarity self.trained = False - def train(self, classdict): - """ Train the classifier. + def train(self, classdict: dict[str, list[str]]) -> None: + """Train the classifier. - If this has not been run, or a model was not loaded by :func:`~loadmodel`, - a `ModelNotTrainedException` will be raised while performing prediction or saving - the model. + Args: + classdict: Training data with class labels as keys and texts as values. - :param classdict: training data - :return: None - :type classdict: dict + Raises: + ModelNotTrainedException: If not trained or loaded. 
""" self.addvec = defaultdict(lambda : np.zeros(self.vecsize)) for classtype in classdict: - self.addvec[classtype] = np.sum([self.shorttext_to_embedvec(shorttext) - for shorttext in classdict[classtype]], - axis=0) + self.addvec[classtype] = np.sum( + [ + self.shorttext_to_embedvec(shorttext) + for shorttext in classdict[classtype] + ], + axis=0 + ) self.addvec[classtype] /= np.linalg.norm(self.addvec[classtype]) self.addvec = dict(self.addvec) self.trained = True - def savemodel(self, nameprefix): - """ Save the trained model into files. + def savemodel(self, nameprefix: str) -> None: + """Save the trained model. - Given the prefix of the file paths, save the model into files, with name given by the prefix, - and add "_embedvecdict.pickle" at the end. If there is no trained model, a `ModelNotTrainedException` - will be thrown. + Args: + nameprefix: Prefix for output files. - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. """ if not self.trained: raise ModelNotTrainedException() pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'wb')) - def loadmodel(self, nameprefix): - """ Load a trained model from files. + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model. - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_embedvecdict.pickle". - - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction and saving the model. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'rb')) self.trained = True - def shorttext_to_embedvec(self, shorttext): - """ Convert the short text into an averaged embedded vector representation. 
+ def shorttext_to_embedvec( + self, + shorttext: str + ) -> Annotated[npt.NDArray[np.float64], "1D Array"]: + """Convert short text to embedding vector. - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, sums - them up, and normalize the resulting vector. It returns the resulting vector - that represents this short sentence. + Args: + shorttext: Input text. - :param shorttext: a short sentence - :return: an embedded vector that represents the short sentence - :type shorttext: str - :rtype: numpy.ndarray + Returns: + Normalized embedding vector. """ return shorttext_to_avgvec(shorttext, self.wvmodel) - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all class labels. - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Dictionary mapping class labels to scores. - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. 
""" if not self.trained: raise ModelNotTrainedException() + vec = self.shorttext_to_embedvec(shorttext) scoredict = {} - for classtype in self.addvec: + for classtype, addvec in self.addvec.items(): try: - scoredict[classtype] = self.simfcn(vec, self.addvec[classtype]) + scoredict[classtype] = self.simfcn(vec, addvec) except ValueError: scoredict[classtype] = np.nan return scoredict -def load_sumword2vec_classifier(wvmodel, name, compact=True, vecsize=None): - """ Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model. +def load_sumword2vec_classifier( + wvmodel: KeyedVectors, + name: str, + compact: bool = True, + vecsize: Optional[int] = None +) -> SumEmbeddedVecClassifier: + """Load a SumEmbeddedVecClassifier from file. + + Args: + wvmodel: Word embedding model. + name: Model name (compact) or prefix (non-compact). + compact: Whether to load compact model. Default: True. + vecsize: Vector size. Default: None. - :param wvmodel: Word2Vec model - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param compact whether model file is compact (Default: True) - :param vecsize: length of embedded vectors in the model (Default: None, directly extracted from word-embedding model) - :return: the classifier - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type name: str - :type compact: bool - :type vecsize: int - :rtype: SumEmbeddedVecClassifier + Returns: + SumEmbeddedVecClassifier instance. 
""" classifier = SumEmbeddedVecClassifier(wvmodel, vecsize=vecsize) if compact: classifier.load_compact_model(name) else: classifier.loadmodel(name) - return classifier \ No newline at end of file + return classifier diff --git a/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py b/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py index 5a2af100..a89a53d1 100644 --- a/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py +++ b/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py @@ -1,67 +1,86 @@ +from typing import Optional, Annotated + import numpy as np +import numpy.typing as npt +from gensim.models.keyedvectors import KeyedVectors +from tensorflow.keras.models import Model from ....utils import kerasmodel_io as kerasio from ....utils.classification_exceptions import ModelNotTrainedException from ....utils.textpreprocessing import tokenize from ....utils.compactmodel_io import CompactIOMachine +from ...base import AbstractScorer -class VarNNSumEmbeddedVecClassifier(CompactIOMachine): - """ - This is a wrapper for various neural network algorithms - for supervised short text categorization. - Each class label has a few short sentences, where each token is converted - to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). - The sentences are represented by an array. - The type of neural network has to be passed when training, and it has to be of - type :class:`keras.models.Sequential`. The number of outputs of the models has to match - the number of class labels in the training data. - To perform prediction, the input short sentences is converted to a unit vector - in the same way. The score is calculated according to the trained neural network model. - - Examples of the models can be found in `frameworks`. - - A pre-trained Google Word2Vec model can be downloaded `here - `_. 
+class VarNNSumEmbeddedVecClassifier(AbstractScorer, CompactIOMachine): + """Neural network classifier using summed embeddings. + + Wraps Keras neural network models for supervised short text classification. + Each token is converted to an embedded vector using a pre-trained word-embedding + model. The sentence embedding is the sum of token embeddings, normalized to + a unit vector. + The neural network model must be a Keras Sequential model with output dimension + matching the number of class labels. + + Reference: + Pre-trained Word2Vec: https://code.google.com/archive/p/word2vec/ + Example models available in the frameworks module. """ - def __init__(self, wvmodel, vecsize=None, maxlen=15): - """ Initialize the classifier. - - :param wvmodel: Word2Vec model - :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model) - :param maxlen: maximum number of words in a sentence (Default: 15) - :type wvmodel: gensim.models.word2vec.Word2Vec - :type vecsize: int - :type maxlen: int + + def __init__( + self, + wvmodel: KeyedVectors, + vecsize: Optional[int] = None, + maxlen: int = 15 + ): + """Initialize the classifier. + + Args: + wvmodel: Word embedding model (e.g., Word2Vec). + vecsize: Vector size. Default: None (extracted from model). + maxlen: Maximum number of words per sentence. Default: 15. """ - CompactIOMachine.__init__(self, {'classifier': 'sumnnlibvec'}, 'sumnnlibvec', ['_classlabels.txt', '.json', '.weights.h5']) + CompactIOMachine.__init__( + self, + {'classifier': 'sumnnlibvec'}, + 'sumnnlibvec', + ['_classlabels.txt', '.json', '.weights.h5'] + ) self.wvmodel = wvmodel - self.vecsize = self.wvmodel.vector_size if vecsize==None else vecsize + self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize self.maxlen = maxlen self.trained = False - def convert_traindata_embedvecs(self, classdict): - """ Convert the training text data into embedded matrix. 
+ def convert_traindata_embedvecs( + self, + classdict: dict[str, list[str]] + ) -> tuple[list[str], Annotated[npt.NDArray[np.float64], "2D Array"], Annotated[npt.NDArray[np.int64], "2D Array"]]: + """Convert training data to embedded vectors. - Convert the training text data into embedded matrix, where each short sentence - is a normalized summed embedded vectors for all words. + Converts each short text into a normalized sum of word embeddings. - :param classdict: training data - :return: tuples, consisting of class labels, matrix of embedded vectors, and corresponding outputs - :type classdict: dict - :rtype: (list, numpy.ndarray, list) + Args: + classdict: Training data with class labels as keys and texts as values. + + Returns: + Tuple of (class_labels, embedding_matrix, labels_array). """ - classlabels = classdict.keys() + classlabels = sorted(classdict.keys()) lblidx_dict = dict(zip(classlabels, range(len(classlabels)))) indices = [] embedvecs = [] for classlabel in classlabels: for shorttext in classdict[classlabel]: - embedvec = np.sum(np.array([self.word_to_embedvec(token) for token in tokenize(shorttext)]), - axis=0) + embedvec = np.sum( + np.array([ + self.word_to_embedvec(token) + for token in tokenize(shorttext) + ]), + axis=0 + ) norm = np.linalg.norm(embedvec) if norm == 0: continue @@ -75,147 +94,125 @@ def convert_traindata_embedvecs(self, classdict): embedvecs = np.array(embedvecs) return classlabels, embedvecs, indices - def train(self, classdict, kerasmodel, nb_epoch=10): - """ Train the classifier. - - The training data and the corresponding keras model have to be given. - - If this has not been run, or a model was not loaded by :func:`~loadmodel`, - a `ModelNotTrainedException` will be raised while performing prediction and saving the model. 
- - :param classdict: training data - :param kerasmodel: keras sequential model - :param nb_epoch: number of steps / epochs in training - :return: None - :type classdict: dict - :type kerasmodel: keras.models.Sequential - :type nb_epoch: int + def train( + self, + classdict: dict[str, list[str]], + kerasmodel: Model, + nb_epoch: int = 10 + ) -> None: + """Train the classifier. + + Args: + classdict: Training data. + kerasmodel: Keras Sequential model. + nb_epoch: Number of training epochs. Default: 10. + + Raises: + ModelNotTrainedException: If not trained or loaded. """ - # convert training data into embedded vectors self.classlabels, train_embedvec, indices = self.convert_traindata_embedvecs(classdict) - - # train the model kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch) - - # flag switch self.model = kerasmodel self.trained = True - def savemodel(self, nameprefix): - """ Save the trained model into files. + def savemodel(self, nameprefix: str) -> None: + """Save the trained model to files. - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There will be three files produced, one name ending with "_classlabels.txt", one name - ending with ".json", and one name ending with ".weights.h5". - If there is no trained model, a `ModelNotTrainedException` will be thrown. + Args: + nameprefix: Prefix for output files. - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. """ if not self.trained: raise ModelNotTrainedException() - kerasio.save_model(nameprefix, self.model) - labelfile = open(nameprefix+'_classlabels.txt', 'w') - labelfile.write('\n'.join(self.classlabels)) - labelfile.close() - - def loadmodel(self, nameprefix): - """ Load a trained model from files. 
- Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_classlabels.txt", ".json", and ".weights.h5". + kerasio.save_model(nameprefix, self.model) + open(nameprefix+'_classlabels.txt', 'w').write('\n'.join(self.classlabels)) - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction and saving the model. + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model from files. - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ self.model = kerasio.load_model(nameprefix) - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = labelfile.readlines() - labelfile.close() - self.classlabels = [s.strip() for s in self.classlabels] + self.classlabels = [s.strip() for s in open(nameprefix+'_classlabels.txt', 'r')] self.trained = True - def word_to_embedvec(self, word): - """ Convert the given word into an embedded vector. + def word_to_embedvec(self, word: str) -> Annotated[npt.NDArray[np.float64], "1D Array"]: + """Convert a word to its embedding vector. - Given a word, return the corresponding embedded vector according to - the word-embedding model. If there is no such word in the model, - a vector with zero values are given. + Args: + word: Input word. - :param word: a word - :return: the corresponding embedded vector - :type word: str - :rtype: numpy.ndarray + Returns: + Embedding vector. Returns zeros if word not in vocabulary. """ - return self.wvmodel[word] if word in self.wvmodel else np.zeros(self.vecsize) + return self.wvmodel[word].astype(np.float64) if word in self.wvmodel else np.zeros(self.vecsize) + + def shorttext_to_embedvec(self, shorttext: str) -> Annotated[npt.NDArray[np.float64], "1D Array"]: + """Convert short text to embedding vector. 
- def shorttext_to_embedvec(self, shorttext): - """ Convert the short text into an averaged embedded vector representation. + Sums token embeddings and normalizes to unit vector. - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, sums - them up, and normalize the resulting vector. It returns the resulting vector - that represents this short sentence. + Args: + shorttext: Input text. - :param shorttext: a short sentence - :return: an embedded vector that represents the short sentence - :type shorttext: str - :rtype: numpy.ndarray + Returns: + Normalized embedding vector. """ - vec = np.sum([self.wvmodel[token] for token in tokenize(shorttext) if token in self.wvmodel]) + vec = np.sum([ + self.wvmodel[token].astype(np.float64) + for token in tokenize(shorttext) + if token in self.wvmodel + ], axis=0) norm = np.linalg.norm(vec) if norm != 0: vec /= np.linalg.norm(vec) return vec - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all class labels. - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Dictionary mapping class labels to scores. - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If not trained. 
""" if not self.trained: raise ModelNotTrainedException() - # retrieve vector embedvec = np.array(self.shorttext_to_embedvec(shorttext)) - - # classification using the neural network predictions = self.model.predict(np.array([embedvec])) - # wrangle output result - scoredict = {classlabel: predictions[0][idx] for idx, classlabel in enumerate(self.classlabels)} + scoredict = { + classlabel: predictions[0, idx] + for idx, classlabel in enumerate(self.classlabels) + } return scoredict -def load_varnnsumvec_classifier(wvmodel, name, compact=True, vecsize=None): - """ Load a :class:`shorttext.classifiers.VarNNSumEmbeddedVecClassifier` instance from file, given the pre-trained word-embedding model. +def load_varnnsumvec_classifier( + wvmodel: KeyedVectors, + name: str, + compact: bool = True, + vecsize: Optional[int] = None +) -> VarNNSumEmbeddedVecClassifier: + """Load a VarNNSumEmbeddedVecClassifier from file. + + Args: + wvmodel: Word embedding model. + name: Model name (compact) or file prefix (non-compact). + compact: Whether to load compact model. Default: True. + vecsize: Vector size. Default: None. - :param wvmodel: Word2Vec model - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param compact whether model file is compact (Default: True) - :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model) - :return: the classifier - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type name: str - :type compact: bool - :type vecsize: int - :rtype: VarNNSumEmbeddedVecClassifier + Returns: + VarNNSumEmbeddedVecClassifier instance. 
""" classifier = VarNNSumEmbeddedVecClassifier(wvmodel, vecsize=vecsize) if compact: diff --git a/src/shorttext/classifiers/embed/sumvec/frameworks.py b/src/shorttext/classifiers/embed/sumvec/frameworks.py index 7372d76d..1502058f 100644 --- a/src/shorttext/classifiers/embed/sumvec/frameworks.py +++ b/src/shorttext/classifiers/embed/sumvec/frameworks.py @@ -1,39 +1,44 @@ +from typing import Optional, Literal + from tensorflow.keras.layers import Dense, Activation -from tensorflow.keras.models import Sequential +from tensorflow.keras.models import Sequential, Model from tensorflow.keras.regularizers import l2 from ....utils.classification_exceptions import UnequalArrayLengthsException -def DenseWordEmbed(nb_labels, - dense_nb_nodes=[], - dense_actfcn=[], - vecsize=300, - reg_coef=0.1, - final_activiation='softmax', - optimizer='adam'): - """ Return layers of dense neural network. - - Return layers of dense neural network. This assumes the input to be a rank-1 vector. - - :param nb_labels: number of class labels - :param dense_nb_nodes: number of nodes in each later (Default: []) - :param dense_actfcn: activation functions for each layer (Default: []) - :param vecsize: length of the embedded vectors in the model (Default: 300) - :param reg_coef: regularization coefficient (Default: 0.1) - :param final_activiation: activation function of the final layer (Default: softmax) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: keras sequential model for dense neural network - :type nb_labels: int - :type dense_nb_nodes: list - :type dense_actfcn: list - :type vecsize: int - :type reg_coef: float - :type final_activiation: str - :type optimizer: str - :rtype: keras.models.Model +def DenseWordEmbed( + nb_labels: int, + dense_nb_nodes: Optional[list[int]] = None, + dense_actfcn: Optional[Literal["softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"]] = None, + vecsize: int = 300, + reg_coef: float = 0.1, + final_activiation: Literal["softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax", + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" +) -> Model: + """Create a dense neural network for embedding-based classification. + + Args: + nb_labels: Number of class labels. + dense_nb_nodes: Nodes per layer. Default: []. + dense_actfcn: Activation functions per layer. Default: []. + vecsize: Embedding vector size. Default: 300. + reg_coef: L2 regularization coefficient. Default: 0.1. + final_activiation: Final layer activation. Default: softmax. + optimizer: Optimizer. Default: adam. + + Returns: + Keras Sequential model. + + Raises: + UnequalArrayLengthsException: If dense_nb_nodes and dense_actfcn have different lengths. 
""" + if dense_nb_nodes is None: + dense_nb_nodes = [] + if dense_actfcn is None: + dense_actfcn = [] + if len(dense_nb_nodes)!=len(dense_actfcn): raise UnequalArrayLengthsException(dense_nb_nodes, dense_actfcn) nb_layers = len(dense_nb_nodes) @@ -51,8 +56,7 @@ def DenseWordEmbed(nb_labels, model.add(Dense(nb_nodes, activation=activation, kernel_regularizer=l2(reg_coef))) model.add(Dense(nb_labels, kernel_regularizer=l2(reg_coef))) - # final activation layer model.add(Activation(final_activiation)) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) + model.compile(loss="categorical_crossentropy", optimizer=optimizer) - return model \ No newline at end of file + return model diff --git a/src/shorttext/cli/categorization.py b/src/shorttext/cli/categorization.py index b2fe88ba..d2817e1e 100644 --- a/src/shorttext/cli/categorization.py +++ b/src/shorttext/cli/categorization.py @@ -1,19 +1,19 @@ import os from functools import partial -import argparse -import logging +from argparse import ArgumentParser from operator import itemgetter +from loguru import logger + from ..utils.compactmodel_io import get_model_classifier_name from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model from ..smartload import smartload_compact_model from ..classifiers import TopicVectorCosineDistanceClassifier -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# configs allowed_classifiers = [ 'ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 'topic_sklearn', 'nnlibvec', 'sumvec', 'maxent' @@ -21,6 +21,8 @@ needembedded_classifiers = ['nnlibvec', 'sumvec'] topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder'] + +# lazy functions for loading word embedding model load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False) load_poincare_binary_model = partial(load_poincare_model, 
binary=True) @@ -33,8 +35,13 @@ } -def get_argparser(): - parser = argparse.ArgumentParser( +def get_argparser() -> ArgumentParser: + """Get argument parser for short text categorization CLI. + + Returns: + ArgumentParser for command line arguments. + """ + parser = ArgumentParser( description='Perform prediction on short text with a given trained model.' ) parser.add_argument('model_filepath', help='Path of the trained (compact) model.') @@ -46,6 +53,7 @@ def get_argparser(): help='Type of word-embedding model (default: word2vec)') return parser + # main block def main(): # argument parsing @@ -99,6 +107,3 @@ for label, score in sorted(scoredict.items(), key=itemgetter(1), reverse=True)[:args.topn]: print(f'{label} : {score:.4f}') print('Done.') - -if __name__ == "__main__": - main() diff --git a/src/shorttext/cli/wordembedsim.py b/src/shorttext/cli/wordembedsim.py index db62fcbd..f5f03039 100644 --- a/src/shorttext/cli/wordembedsim.py +++ b/src/shorttext/cli/wordembedsim.py @@ -9,7 +9,7 @@ from ..utils import shorttext_to_avgvec from ..metrics.wasserstein import word_mover_distance from ..metrics.dynprog.jaccard import soft_jaccard_score - +from ..utils.compute import cosine_similarity typedict = { 'word2vec': load_word2vec_model, @@ -19,7 +19,12 @@ def getargparser(): - parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.') + """Get argument parser for word embedding similarity CLI. + + Returns: + ArgumentParser for command line arguments. 
 + """ + parser = argparse.ArgumentParser(description='Find the similarities between two short sentences using Word2Vec.') parser.add_argument('modelpath', help='Path of the Word2Vec model') parser.add_argument('--type', default='word2vec', help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")') @@ -34,11 +38,11 @@ def main(): tokenize('Mogu is cute.') time0 = time.time() - print("Loading "+args.type+" model: "+args.modelpath) + print(f"Loading {args.type} model: {args.modelpath}") wvmodel = typedict[args.type](args.modelpath) time1 = time.time() end = False - print("... loading time: "+str(time1 - time0)+" seconds") + print(f"... loading time: {time1 - time0} seconds") while not end: sent1 = input('sent1> ') @@ -48,8 +52,7 @@ def main(): sent2 = input('sent2> ') # output results - print("Cosine Similarity = %.4f" % (1 - cosine(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel)))) - print("Word-embedding Jaccard Score Similarity = %.4f" % jaccardscore_sents(sent1, sent2, wvmodel)) - print("Word Mover's Distance = %.4f" % word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel)) - print("Soft Jaccard Score (edit distance) = %.4f" % soft_jaccard_score(tokenize(sent1), tokenize(sent2))) - + print(f"Cosine Similarity = {cosine_similarity(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel)):.4f}") + print(f"Word-embedding Jaccard Score Similarity = {jaccardscore_sents(sent1, sent2, wvmodel):.4f}") + print(f"Word Mover's Distance = {word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel):.4f}") + print(f"Soft Jaccard Score (edit distance) = {soft_jaccard_score(tokenize(sent1), tokenize(sent2)):.4f}") diff --git a/src/shorttext/data/data_retrieval.py b/src/shorttext/data/data_retrieval.py index af0958a1..efa1ecc3 100644 --- a/src/shorttext/data/data_retrieval.py +++ b/src/shorttext/data/data_retrieval.py @@ -3,26 +3,35 @@ from collections import defaultdict import json import os +from os import PathLike +from pathlib import Path import zipfile import sys import csv from urllib.request 
import urlretrieve +from io import TextIOWrapper +from typing import Generator import pandas as pd import numpy as np +import orjson -def retrieve_csvdata_as_dict(filepath): - """ Retrieve the training data in a CSV file. +def retrieve_csvdata_as_dict(filepath: str | PathLike) -> dict[str, list[str]]: + """Retrieve the training data in a CSV file. - Retrieve the training data in a CSV file, with the first column being the - class labels, and second column the text data. It returns a dictionary with - the class labels as keys, and a list of short texts as the value for each key. + Reads a CSV file where the first column contains class labels and the second column + contains text data. Returns a dictionary mapping class labels to lists of + short texts. - :param filepath: path of the training data (CSV) - :return: a dictionary with class labels as keys, and lists of short texts - :type filepath: str - :rtype: dict + Args: + filepath: Path to the CSV training data file. + + Returns: + A dictionary with class labels as keys and lists of short texts as values. + + Reference: + Data format inspired by common text classification benchmarks. """ datafile = open(filepath, 'r') reader = csv.reader(datafile) @@ -37,84 +46,130 @@ class labels, and second column the text data. It returns a dictionary with return dict(shorttextdict) -def retrieve_jsondata_as_dict(filepath): - """ Retrieve the training data in a JSON file. +def retrieve_jsondata_as_dict(filepath: str | PathLike) -> dict: + """Retrieve the training data in a JSON file. - Retrieve the training data in a JSON file, with - the class labels as keys, and a list of short texts as the value for each key. - It returns the corresponding dictionary. + Reads a JSON file where class labels are keys and lists of short texts + are values. Returns the corresponding dictionary. 
- :param filepath: path of the training data (JSON) - :return: a dictionary with class labels as keys, and lists of short texts - :type filepath: str - :rtype: dict + Args: + filepath: Path to the JSON training data file. + + Returns: + A dictionary with class labels as keys and lists of short texts as values. """ - return json.load(open(filepath, 'r')) + return orjson.loads(open(filepath, 'rb').read()) + +def get_or_download_data( + filename: str, + origin: str, + asbytes: bool = False +) -> TextIOWrapper: + """Retrieve or download a data file. -def subjectkeywords(): - """ Return an example data set of subjects. + Checks if the file exists in the user's home directory under .shorttext. + If not present, downloads from the given origin URL. - Return an example data set, with three subjects and corresponding keywords. - This is in the format of the training input. + Args: + filename: Name of the file to retrieve. + origin: URL to download the file from if not present locally. + asbytes: If True, opens the file in binary mode. Default is False. - :return: example data set - :rtype: dict + Returns: + A file object (text or binary mode depending on asbytes). """ - this_dir, _ = os.path.split(__file__) - return retrieve_csvdata_as_dict(os.path.join(this_dir, 'shorttext_exampledata.csv')) - - -def inaugural(): - """ Return an example dataset, which is the Inaugural Addresses of all Presidents of - the United States from George Washington to Barack Obama. - - Each key is the year, a dash, and the last name of the president. 
The content is - the list of all the sentences - - :return: example data set - :rtype: dict + # determine path + homedir = os.path.expanduser('~') + datadir = os.path.join(homedir, '.shorttext') + if not os.path.exists(datadir): + os.makedirs(datadir) + + targetfilepath = os.path.join(datadir, filename) + # download if not exist + if not os.path.exists(os.path.join(datadir, filename)): + print('Downloading...', file=sys.stderr) + print(f'Source: {origin}', file=sys.stderr) + print(f'Target: {targetfilepath}', file=sys.stderr) + try: + urlretrieve(origin, targetfilepath) + except: + print('Failure to download file!', file=sys.stderr) + print(sys.exc_info(), file=sys.stderr) + os.remove(targetfilepath) + + # return + return open(targetfilepath, 'rb' if asbytes else 'r') + + +def subjectkeywords() -> dict[str, list[str]]: + """Return an example dataset of subjects with keywords. + + Returns a small example dataset with three subjects and their + corresponding keywords, in the training input format. + + Returns: + A dictionary with subject labels as keys and lists of keywords as values. """ - zfile = zipfile.ZipFile(get_or_download_data("USInaugural.zip", - "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/USInaugural.zip", - asbytes=True), - ) + parentdir = Path(__file__).parent + return retrieve_csvdata_as_dict(parentdir / "shorttext_exampledata.csv") + + +def inaugural() -> dict[str, list[str]]: + """Return the Inaugural Addresses of US Presidents. + + Returns an example dataset containing the Inaugural Addresses of all + Presidents of the United States from George Washington to Barack Obama. + + Each key is formatted as "year-lastname" and the value is a list of + sentences from the address. + + Returns: + A dictionary with president identifiers as keys and lists of sentences as values. 
+ + Reference: + https://www.presidency.us/kisa_exec/inaugural.html + """ + zfile = zipfile.ZipFile( + get_or_download_data( + "USInaugural.zip", + "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/USInaugural.zip", + asbytes=True + ) + ) address_jsonstr = zfile.open("addresses.json").read() zfile.close() return json.loads(address_jsonstr.decode('utf-8')) def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512): - """ Return an example data set, sampled from NIH RePORT (Research Portfolio - Online Reporting Tools). - - Return an example data set from NIH (National Institutes of Health), - data publicly available from their RePORT - website. (`link - `_). - The data is with `txt_col` being either project titles ('PROJECT_TITLE') - or proposal abstracts ('ABSTRACT_TEXT'), and label_col being the names of the ICs (Institutes or Centers), - with 'IC_NAME' the whole form, and 'FUNDING_ICs' the abbreviated form). - - Dataset directly adapted from the NIH data from `R` package `textmineR - `_. - - :param txt_col: column for the text (Default: 'PROJECT_TITLE') - :param label_col: column for the labels (Default: 'FUNDING_ICs') - :param sample_size: size of the sample. Set to None if all rows. (Default: 512) - :return: example data set - :type txt_col: str - :type label_col: str - :type sample_size: int - :rtype: dict + """Return an example dataset sampled from NIH RePORT. + + Returns an example dataset from NIH (National Institutes of Health) + RePORT (Research Portfolio Online Reporting Tools) website. + + Args: + txt_col: Column for text data. Options: 'PROJECT_TITLE' or 'ABSTRACT_TEXT'. + Default: 'PROJECT_TITLE'. + label_col: Column for labels. Options: 'FUNDING_ICs' or 'IC_NAME'. + Default: 'FUNDING_ICs'. + sample_size: Number of samples to return. Set to None for all rows. Default: 512. + + Returns: + A dictionary with IC identifiers as keys and lists of text data as values. 
+ + Reference: + https://exporter.nih.gov/ExPORTER_Catalog.aspx + Dataset adapted from the R package textmineR: + https://cran.r-project.org/web/packages/textmineR/index.html """ # validation # txt_col = 'PROJECT_TITLE' or 'ABSTRACT_TEXT' # label_col = 'FUNDING_ICs' or 'IC_NAME' if not (txt_col in ['PROJECT_TITLE', 'ABSTRACT_TEXT']): - raise KeyError('Undefined text column: '+txt_col+'. Must be PROJECT_TITLE or ABSTRACT_TEXT.') + raise KeyError(f'Undefined text column: {txt_col}. Must be PROJECT_TITLE or ABSTRACT_TEXT.') if not (label_col in ['FUNDING_ICs', 'IC_NAME']): - raise KeyError('Undefined label column: '+label_col+'. Must be FUNDING_ICs or IC_NAME.') + raise KeyError(f'Undefined label column: {label_col}. Must be FUNDING_ICs or IC_NAME.') zfile = zipfile.ZipFile(get_or_download_data('nih_full.csv.zip', 'https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/nih_full.csv.zip', @@ -141,16 +196,20 @@ def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512 return dict(classdict) -def mergedict(dicts): - """ Merge data dictionary. +def merge_cv_dicts(dicts: list[dict[str, list[str]]]) -> dict[str, list[str]]: + """Merge multiple training data dictionaries. + + Combines multiple data dictionaries in the training data format + into a single dictionary. - Merge dictionaries of the data in the training data format. + Args: + dicts: List of dictionaries to merge, each with class labels + as keys and lists of texts as values. - :param dicts: dicts to merge - :return: merged dict - :type dicts: list - :rtype: dict + Returns: + A merged dictionary with all class labels and texts combined. """ + # NOTE: this is not a usualy Python dict merge. It does specialized merging. 
mdict = defaultdict(lambda : []) for thisdict in dicts: for label in thisdict: @@ -158,20 +217,25 @@ def mergedict(dicts): return dict(mdict) -def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False): - """ Yielding test data and training data for cross validation by partitioning it. +def yield_crossvalidation_classdicts( + classdict: dict[str, list[str]], + nb_partitions: int, + shuffle: bool = False +) -> Generator[tuple[dict[str, list[str]], dict[str, list[str]]], None, None]: + """Yield training and test data partitions for cross-validation. - Given a training data, partition the data into portions, each will be used as test - data set, while the other training data set. It returns a generator. + Partitions the training data into multiple sets. Each iteration yields + a (test_dict, train_dict) pair where one partition is used as test + data and the remaining partitions are combined as training data. - :param classdict: training data - :param nb_partitions: number of partitions - :param shuffle: whether to shuffle the data before partitioning - :return: generator, producing a test data set and a training data set each time - :type classdict: dict - :type nb_partitions: int - :type shuffle: bool - :rtype: generator + Args: + classdict: Training data dictionary with class labels as keys + and lists of texts as values. + nb_partitions: Number of partitions to create. + shuffle: Whether to shuffle data before partitioning. Default: False. + + Yields: + Tuples of (test_dict, train_dict) for each partition. 
""" crossvaldicts = [] for _ in range(nb_partitions): @@ -187,29 +251,5 @@ def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False): for i in range(nb_partitions): testdict = crossvaldicts[i] - traindict = mergedict([crossvaldicts[j] for j in range(nb_partitions) if j != i]) + traindict = merge_cv_dicts([crossvaldicts[j] for j in range(nb_partitions) if j != i]) yield testdict, traindict - - -def get_or_download_data(filename, origin, asbytes=False): - # determine path - homedir = os.path.expanduser('~') - datadir = os.path.join(homedir, '.shorttext') - if not os.path.exists(datadir): - os.makedirs(datadir) - - targetfilepath = os.path.join(datadir, filename) - # download if not exist - if not os.path.exists(os.path.join(datadir, filename)): - print('Downloading...') - print('Source: ', origin) - print('Target: ', targetfilepath) - try: - urlretrieve(origin, targetfilepath) - except: - print('Failure to download file!') - print(sys.exc_info()) - os.remove(targetfilepath) - - # return - return open(targetfilepath, 'rb' if asbytes else 'r') diff --git a/src/shorttext/generators/__init__.py b/src/shorttext/generators/__init__.py index a0bf818c..c1d9ae5b 100644 --- a/src/shorttext/generators/__init__.py +++ b/src/shorttext/generators/__init__.py @@ -4,6 +4,6 @@ from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler -from .charbase.char2vec import SentenceToCharVecEncoder, initSentenceToCharVecEncoder -from .seq2seq.s2skeras import Seq2SeqWithKeras, loadSeq2SeqWithKeras +from .charbase.char2vec import SentenceToCharVecEncoder, initialize_SentenceToCharVecEncoder +from .seq2seq.s2skeras import Seq2SeqWithKeras, load_seq2seq_model from .seq2seq.charbaseS2S import CharBasedSeq2SeqGenerator, loadCharBasedSeq2SeqGenerator diff --git a/src/shorttext/generators/bow/AutoEncodingTopicModeling.py 
b/src/shorttext/generators/bow/AutoEncodingTopicModeling.py index 5c12b279..3b833dbf 100644 --- a/src/shorttext/generators/bow/AutoEncodingTopicModeling.py +++ b/src/shorttext/generators/bow/AutoEncodingTopicModeling.py @@ -1,78 +1,115 @@ import json import pickle -from functools import reduce -from operator import add +from typing import Optional, Any +from collections import Counter import numpy as np -from gensim.corpora import Dictionary +import numpy.typing as npt +import sparse from tensorflow.keras import Input from tensorflow.keras import Model from tensorflow.keras.layers import Dense -from scipy.spatial.distance import cosine +import orjson from .LatentTopicModeling import LatentTopicModeler from ...utils import kerasmodel_io as kerasio, textpreprocessing as textpreprocess from ...utils.compactmodel_io import CompactIOMachine from ...utils.classification_exceptions import ModelNotTrainedException +from ...utils.dtm import generate_npdict_document_term_matrix, convert_classdict_to_corpus +from ...utils.compute import cosine_similarity +from ...schemas.models import AutoEncoderPackage -autoencoder_suffices = ['.gensimdict', '_encoder.json', '_encoder.weights.h5', '_classtopicvecs.pkl', +autoencoder_suffices = ['_encoder.json', '_encoder.weights.h5', '_classtopicvecs.pkl', '_decoder.json', '_decoder.weights.h5', '_autoencoder.json', '_autoencoder.weights.h5', '.json'] -class AutoencodingTopicModeler(LatentTopicModeler, CompactIOMachine): - """ - This class facilitates the topic modeling of input training data using the autoencoder. +def get_autoencoder_models( + vector_size: int, + nb_latent_vector_size: int +) -> AutoEncoderPackage: + """Create autoencoder model components. - A reference about how an autoencoder is written with keras by Francois Chollet, titled - `Building Autoencoders in Keras - `_ . + Args: + vector_size: Size of input vectors. + nb_latent_vector_size: Size of the latent space (number of topics). 
- This class extends :class:`LatentTopicModeler`. + Returns: + AutoEncoderPackage containing autoencoder, encoder, and decoder models. """ - def train(self, classdict, nb_topics, *args, **kwargs): - """ Train the autoencoder. - - :param classdict: training data - :param nb_topics: number of topics, i.e., the number of encoding dimensions - :param args: arguments to be passed to keras model fitting - :param kwargs: arguments to be passed to keras model fitting - :return: None - :type classdict: dict - :type nb_topics: int - """ - CompactIOMachine.__init__(self, {'classifier': 'kerasautoencoder'}, 'kerasautoencoder', autoencoder_suffices) - self.nb_topics = nb_topics - self.generate_corpus(classdict) - vecsize = len(self.dictionary) + # define all the layers of the autoencoder + input_vec = Input(shape=(vector_size,)) + encoded = Dense(nb_latent_vector_size, activation='relu')(input_vec) + decoded = Dense(vector_size, activation='sigmoid')(encoded) - # define all the layers of the autoencoder - input_vec = Input(shape=(vecsize,)) - encoded = Dense(self.nb_topics, activation='relu')(input_vec) - decoded = Dense(vecsize, activation='sigmoid')(encoded) + # define the autoencoder model + autoencoder = Model(inputs=input_vec, outputs=decoded) - # define the autoencoder model - autoencoder = Model(input=input_vec, output=decoded) + # define the encoder + encoder = Model(inputs=input_vec, outputs=encoded) - # define the encoder - encoder = Model(input=input_vec, output=encoded) + # define the decoder + encoded_input = Input(shape=(nb_latent_vector_size,)) + decoder_layer = autoencoder.layers[-1] + decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input)) - # define the decoder - encoded_input = Input(shape=(self.nb_topics,)) - decoder_layer = autoencoder.layers[-1] - decoder = Model(input=encoded_input, output=decoder_layer(encoded_input)) + # compile the autoencoder + autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy') - # compile the 
autoencoder - autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy') + return AutoEncoderPackage( + autoencoder=autoencoder, + encoder=encoder, + decoder=decoder + ) + + +class AutoencodingTopicModeler(LatentTopicModeler, CompactIOMachine): + """Topic modeler using autoencoder. + + Uses a Keras autoencoder to learn latent topic representations. + The encoded vectors serve as topic vectors for short text classification. + + Reference: + Francois Chollet, "Building Autoencoders in Keras," + https://blog.keras.io/building-autoencoders-in-keras.html + """ + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + normalize: bool = True + ): + CompactIOMachine.__init__(self, {'classifier': 'kerasautoencoder'}, 'kerasautoencoder', autoencoder_suffices) + LatentTopicModeler.__init__(self, preprocessor, tokenizer, normalize=normalize) + + def train(self, classdict: dict[str, list[str]], nb_topics: int, *args, **kwargs) -> None: + """Train the autoencoder topic model. + + Args: + classdict: Training data with class labels as keys and texts as values. + nb_topics: Number of latent topics (encoding dimensions). + *args: Arguments for Keras model fitting. + **kwargs: Keyword arguments for Keras model fitting. 
+ """ + self.nb_topics = nb_topics + corpus, docids = convert_classdict_to_corpus(classdict, self.preprocess_func) + dtm_matrix = generate_npdict_document_term_matrix( + corpus, docids, tokenize_func=self.tokenize_func + ) + vecsize = dtm_matrix.dimension_sizes[1] + self.token2indices = dtm_matrix._keystrings_to_indices[1] + self.classlabels = sorted(classdict.keys()) + + autoencoder_package = get_autoencoder_models(vecsize, self.nb_topics) + autoencoder = autoencoder_package.autoencoder + encoder = autoencoder_package.encoder + decoder = autoencoder_package.decoder # process training data - embedvecs = np.array(reduce(add, - [[self.retrieve_bow_vector(shorttext, normalize=True) for shorttext in classdict[classtype]] - for classtype in classdict] - ) - ) + embedvecs = dtm_matrix.to_numpy() # fit the model autoencoder.fit(embedvecs, embedvecs, *args, **kwargs) @@ -90,76 +127,115 @@ def train(self, classdict, nb_topics, *args, **kwargs): for label in classdict: self.classtopicvecs[label] = self.precalculate_liststr_topicvec(classdict[label]) - def retrieve_topicvec(self, shorttext): - """ Calculate the topic vector representation of the short text. + def retrieve_bow(self, shorttext: str) -> list[tuple[int, int]]: + """Get bag-of-words representation. + + Args: + shorttext: Input text. + + Returns: + List of (token_index, count) tuples. + """ + tokens_freq = Counter(self.tokenize_func(self.preprocess_func(shorttext))) + return [ + (self.token2indices[token], freq) + for token, freq in tokens_freq.items() + if token in self.token2indices.keys() + ] + + def retrieve_bow_vector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get bag-of-words vector. + + Args: + shorttext: Input text. + + Returns: + BOW vector (normalized if normalize=True). 
+ """ + bow = self.retrieve_bow(shorttext) + if len(bow) > 0: + vec = sparse.COO( + [[0]*len(bow), [id for id, val in bow]], + [val for id, val in bow], + shape=(1, len(self.token2indices)) + ).todense()[0] + else: + vec = np.ones(len(self.token2indices)) + if self.normalize: + vec = vec.astype(np.float64) / np.linalg.norm(vec) + return vec + + def retrieve_topicvec(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. + + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Encoded vector representation. - :param shorttext: short text - :return: encoded vector representation of the short text - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: numpy.ndarray + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise ModelNotTrainedException() bow_vector = self.retrieve_bow_vector(shorttext) - encoded_vec = self.encoder.predict(np.array([bow_vector]))[0] + encoded_vec = self.encoder.predict(np.expand_dims(bow_vector, axis=0))[0] if self.normalize: encoded_vec /= np.linalg.norm(encoded_vec) - return encoded_vec + return encoded_vec.astype(np.float64) + + def precalculate_liststr_topicvec(self, shorttexts: list[str]) -> npt.NDArray[np.float64]: + """Calculate average topic vector for a list of texts. + + Used during training to compute class centroids. - def precalculate_liststr_topicvec(self, shorttexts): - """ Calculate the summed topic vectors for training data for each class. + Args: + shorttexts: List of texts. - This function is called while training. + Returns: + Average topic vector (normalized). - :param shorttexts: list of short texts - :return: average topic vector - :raise: ModelNotTrainedException - :type shorttexts: list - :rtype: numpy.ndarray + Raises: + ModelNotTrainedException: If model not trained. 
""" sumvec = sum([self.retrieve_topicvec(shorttext) for shorttext in shorttexts]) sumvec /= np.linalg.norm(sumvec) return sumvec - def get_batch_cos_similarities(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. + def get_batch_cos_similarities(self, shorttext: str) -> dict[str, float]: + """Get cosine similarities to all class centroids. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Args: + shorttext: Input text. - :param shorttext: short text - :return: dictionary of scores of the text to all classes - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: dict + Returns: + Dictionary mapping class labels to similarity scores. + + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise ModelNotTrainedException() simdict = {} - for label in self.classtopicvecs: - simdict[label] = 1 - cosine(self.classtopicvecs[label], self.retrieve_topicvec(shorttext)) + for label, classtopicvec in self.classtopicvecs.items(): + simdict[label] = cosine_similarity( + classtopicvec, self.retrieve_topicvec(shorttext) + ) return simdict - def savemodel(self, nameprefix, save_complete_autoencoder=True): - """ Save the model with names according to the prefix. - - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There are files with names ending with "_encoder.json" and "_encoder.weights.h5", which are - the JSON and HDF5 files for the encoder respectively. They also include a gensim dictionary (.gensimdict). + def savemodel(self, nameprefix: str, save_complete_autoencoder: bool=True) -> None: + """Save the autoencoder model to files. - If `save_complete_autoencoder` is True, - then there are also files with names ending with "_decoder.json" and "_decoder.weights.h5". 
+ Saves encoder, optional decoder, and autoencoder weights along with + configuration parameters. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Args: + nameprefix: Prefix for output files. + save_complete_autoencoder: Whether to save decoder and complete autoencoder. Default: True. - :param nameprefix: prefix of the paths of the file - :param save_complete_autoencoder: whether to store the decoder and the complete autoencoder (Default: True; but False for version <= 0.2.1) - :return: None - :type nameprefix: str - :type save_complete_autoencoder: bool + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: raise ModelNotTrainedException() @@ -167,35 +243,29 @@ def savemodel(self, nameprefix, save_complete_autoencoder=True): parameters = {} parameters['nb_topics'] = self.nb_topics parameters['classlabels'] = self.classlabels - json.dump(parameters, open(nameprefix+'.json', 'wb')) - - self.dictionary.save(nameprefix+'.gensimdict') + parameters['tokens2indices'] = self.token2indices + open(nameprefix + '.json', 'wb').write(orjson.dumps(parameters)) kerasio.save_model(nameprefix+'_encoder', self.encoder) if save_complete_autoencoder: kerasio.save_model(nameprefix+'_decoder', self.decoder) kerasio.save_model(nameprefix+'_autoencoder', self.autoencoder) pickle.dump(self.classtopicvecs, open(nameprefix+'_classtopicvecs.pkl', 'wb')) - def loadmodel(self, nameprefix, load_incomplete=False): - """ Save the model with names according to the prefix. + def loadmodel(self, nameprefix: str, load_incomplete: bool=False) -> None: + """Load the autoencoder model from files. - Given the prefix of the file paths, load the model into files, with name given by the prefix. - There are files with names ending with "_encoder.json" and "_encoder.weights.h5", which are - the JSON and HDF5 files for the encoder respectively. - They also include a gensim dictionary (.gensimdict). 
+ Args: + nameprefix: Prefix for input files. + load_incomplete: If True, only load encoder (for models from v0.2.1). Default: False. - :param nameprefix: prefix of the paths of the file - :param load_incomplete: load encoder only, not decoder and autoencoder file (Default: False; put True for model built in version <= 0.2.1) - :return: None - :type nameprefix: str - :type load_incomplete: bool + Raises: + ModelNotTrainedException: If loading fails. """ # load the JSON file (parameters) parameters = json.load(open(nameprefix+'.json', 'r')) self.nb_topics = parameters['nb_topics'] self.classlabels = parameters['classlabels'] - - self.dictionary = Dictionary.load(nameprefix + '.gensimdict') + self.token2indices = parameters['tokens2indices'] self.encoder = kerasio.load_model(nameprefix+'_encoder') self.classtopicvecs = pickle.load(open(nameprefix+'_classtopicvecs.pkl', 'rb')) if not load_incomplete: @@ -203,22 +273,35 @@ def loadmodel(self, nameprefix, load_incomplete=False): self.autoencoder = kerasio.load_model(nameprefix+'_autoencoder') self.trained = True + def get_info(self) -> dict[str, Any]: + """Get model metadata. -def load_autoencoder_topicmodel(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the autoencoding topic model from files. + Returns: + Dictionary with model information. + """ + return CompactIOMachine.get_info(self) - :param name: name (if compact=True) or prefix (if compact=False) of the paths of the model files - :param preprocessor: function that preprocesses the text. 
(Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: an autoencoder as a topic modeler - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: generators.bow.AutoEncodingTopicModeling.AutoencodingTopicModeler + +def load_autoencoder_topicmodel( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool=True +) -> AutoencodingTopicModeler: + """Load an autoencoder topic model from files. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. + compact: Whether to load compact model. Default: True. + + Returns: + An AutoencodingTopicModeler instance. """ - autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor) + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + + autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, tokenizer=tokenizer) if compact: autoencoder.load_compact_model(name) else: diff --git a/src/shorttext/generators/bow/GensimTopicModeling.py b/src/shorttext/generators/bow/GensimTopicModeling.py index 17e8f644..ffd04c26 100644 --- a/src/shorttext/generators/bow/GensimTopicModeling.py +++ b/src/shorttext/generators/bow/GensimTopicModeling.py @@ -1,63 +1,74 @@ -import json +from typing import Optional, Literal, Any import gensim import numpy as np +import numpy.typing as npt from gensim.corpora import Dictionary from gensim.models import TfidfModel, LdaModel, LsiModel, RpModel from gensim.similarities import MatrixSimilarity +import orjson -from ...utils import classification_exceptions as e +from ...utils.classification_exceptions import ModelNotTrainedException from ...utils.compactmodel_io import CompactIOMachine, get_model_classifier_name from ...utils import gensim_corpora as gc from .LatentTopicModeling import LatentTopicModeler -from ...utils import 
textpreprocessing as textpreprocess -from ...utils.textpreprocessing import tokenize + gensim_topic_model_dict = {'lda': LdaModel, 'lsi': LsiModel, 'rp': RpModel} class GensimTopicModeler(LatentTopicModeler): - """ - This class facilitates the creation of topic models (options: LDA (latent Dirichlet Allocation), - LSI (latent semantic indexing), and Random Projections - with the given short text training data, and convert future - short text into topic vectors using the trained topic model. + """Topic modeler using gensim implementations. - No compact model I/O available for this class. Refer to - :class:`LDAModeler` and :class:`LSIModeler`. + Supports LDA (Latent Dirichlet Allocation), LSI (Latent Semantic Indexing), + and Random Projections (RP) for topic modeling. - This class extends :class:`LatentTopicModeler`. + Note: + For compact model I/O, use LDAModeler or LSIModeler instead. """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - algorithm='lda', - toweigh=True, - normalize=True): - """ Initialize the topic modeler. - - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda) - :param toweigh: whether to weigh the words using tf-idf. (Default: True) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :type preprocessor: function - :type algorithm: str - :type toweigh: bool + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + algorithm: Literal["lda", "lsi", "rp"] = "lda", + toweigh: bool = True, + normalize: bool = True + ): + """Initialize the topic modeler. + + Args: + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + algorithm: Topic modeling algorithm. Options: 'lda', 'lsi', 'rp'. Default: 'lda'. + toweigh: Whether to apply tf-idf weighting. 
Default: True. + normalize: Whether to normalize topic vectors. Default: True. """ - LatentTopicModeler.__init__(self, preprocessor=preprocessor, normalize=normalize) + LatentTopicModeler.__init__( + self, preprocessor=preprocessor, tokenizer=tokenizer, normalize=normalize + ) self.algorithm = algorithm self.toweigh = toweigh - def train(self, classdict, nb_topics, *args, **kwargs): - """ Train the topic modeler. + def generate_corpus(self, classdict: dict[str, list[str]]) -> None: + """Generate gensim dictionary and corpus. - :param classdict: training data - :param nb_topics: number of latent topics - :param args: arguments to pass to the `train` method for gensim topic models - :param kwargs: arguments to pass to the `train` method for gensim topic models - :return: None - :type classdict: dict - :type nb_topics: int + Args: + classdict: Training data. + """ + self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora( + classdict, + preprocess_and_tokenize=lambda sent: self.tokenize_func(self.preprocess_func(sent)) + ) + + def train(self, classdict: dict[str, list[str]], nb_topics: int, *args, **kwargs) -> None: + """Train the topic modeler. + + Args: + classdict: Training data with class labels as keys and texts as values. + nb_topics: Number of latent topics. + *args: Arguments for the gensim topic model. + **kwargs: Keyword arguments for the gensim topic model. 
""" self.nb_topics = nb_topics self.generate_corpus(classdict) @@ -68,111 +79,133 @@ def train(self, classdict, nb_topics, *args, **kwargs): self.tfidf = None normcorpus = self.corpus - self.topicmodel = gensim_topic_model_dict[self.algorithm](normcorpus, - num_topics=self.nb_topics, - *args, - **kwargs) + self.topicmodel = gensim_topic_model_dict[self.algorithm]( + normcorpus, num_topics=self.nb_topics, *args, **kwargs + ) self.matsim = MatrixSimilarity(self.topicmodel[normcorpus]) # change the flag self.trained = True - def update(self, additional_classdict): - """ Update the model with additional data. - - It updates the topic model with additional data. - - Warning: It does not allow adding class labels, and new words. - The dictionary is not changed. Therefore, such an update will alter the - topic model only. It affects the topic vector representation. While the corpus - is changed, the words pumped into calculating the similarity matrix is not changed. - - Therefore, this function means for a fast update. - But if you want a comprehensive model, it is recommended to retrain. - - :param additional_classdict: additional training data - :return: None - :type additional_classdict: dict + def update(self, additional_classdict: dict[str, list[str]]) -> None: + """Update model with additional data. + + Warning: Does not support adding new class labels or new vocabulary. + For comprehensive updates, retrain the model. + + Args: + additional_classdict: Additional training data. 
""" # cannot use this way, as we want to update the corpus with existing words - self.corpus, newcorpus = gc.update_corpus_labels(self.dictionary, - self.corpus, - additional_classdict, - preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent))) + self.corpus, newcorpus = gc.update_corpus_labels( + self.dictionary, + self.corpus, + additional_classdict, + preprocess_and_tokenize=lambda sent: self.tokenize_func(self.preprocess_func(sent)) + ) self.topicmodel.update(newcorpus) - def retrieve_corpus_topicdist(self, shorttext): - """ Calculate the topic vector representation of the short text, in the corpus form. + def retrieve_bow(self, shorttext: str) -> list[tuple[int, int]]: + """Get bag-of-words representation. + + Args: + shorttext: Input text. + + Returns: + List of (word_id, count) tuples. + """ + return self.dictionary.doc2bow(self.tokenize_func(self.preprocess_func(shorttext))) + + def retrieve_bow_vector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get bag-of-words vector. + + Args: + shorttext: Input text. + + Returns: + BOW vector. + """ + bow = self.retrieve_bow(shorttext) + if len(bow) > 0: + vec = np.zeros(len(self.dictionary)) + for id, val in bow: + vec[id] = val + else: + vec = np.ones(len(self.dictionary)) + if self.normalize: + vec /= np.linalg.norm(vec) + return vec + + def retrieve_corpus_topicdist(self, shorttext: str) -> list[tuple[int, int | float]]: + """Get topic distribution (corpus form). + + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + List of (topic_id, weight) tuples. - :param shorttext: text to be represented - :return: topic vector in the corpus form - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: list + Raises: + ModelNotTrainedException: If model not trained. 
""" if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() bow = self.retrieve_bow(shorttext) return self.topicmodel[self.tfidf[bow] if self.toweigh else bow] - def retrieve_topicvec(self, shorttext): - """ Calculate the topic vector representation of the short text. + def retrieve_topicvec(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. - This function calls :func:`~retrieve_corpus_topicdist`. + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Topic vector. - :param shorttext: text to be represented - :return: topic vector - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: numpy.ndarray + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() topicdist = self.retrieve_corpus_topicdist(shorttext) - topicvec = np.zeros(self.nb_topics) - for topicid, frac in topicdist: - topicvec[topicid] = frac + if len(topicdist) > 0: + topicvec = np.zeros(self.nb_topics) + for topicid, frac in topicdist: + topicvec[topicid] = frac + else: + topicvec = np.ones(self.nb_topics) if self.normalize: topicvec /= np.linalg.norm(topicvec) return topicvec - def get_batch_cos_similarities(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. + def get_batch_cos_similarities(self, shorttext: str) -> dict[str, float]: + """Get cosine similarities to all classes. + + Args: + shorttext: Input text. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Dictionary mapping class labels to similarity scores. 
- :param shorttext: short text - :return: dictionary of scores of the text to all classes - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: dict + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() simdict = {} similarities = self.matsim[self.retrieve_corpus_topicdist(shorttext)] for label, similarity in zip(self.classlabels, similarities): - simdict[label] = similarity + simdict[label] = float(similarity) return simdict - def loadmodel(self, nameprefix): - """ Load the topic model with the given prefix of the file paths. - - Given the prefix of the file paths, load the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). + def loadmodel(self, nameprefix: str) -> None: + """Load topic model from files. - :param nameprefix: prefix of the file paths - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ # load the JSON file (parameters) - parameters = json.load(open(nameprefix+'.json', 'r')) + parameters = orjson.loads(open(nameprefix+'.json', 'rb').read()) self.nb_topics = parameters['nb_topics'] self.toweigh = parameters['toweigh'] self.algorithm = parameters['algorithm'] @@ -194,28 +227,24 @@ def loadmodel(self, nameprefix): # flag self.trained = True - def savemodel(self, nameprefix): - """ Save the model with names according to the prefix. + def savemodel(self, nameprefix: str) -> None: + """Save topic model to files. - Given the prefix of the file paths, save the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). 
+ Args: + nameprefix: Prefix for output files. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param nameprefix: prefix of the file paths - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() + parameters = {} parameters['nb_topics'] = self.nb_topics parameters['toweigh'] = self.toweigh parameters['algorithm'] = self.algorithm parameters['classlabels'] = self.classlabels - json.dump(parameters, open(nameprefix+'.json', 'w')) + open(nameprefix+".json", "wb").write(orjson.dumps(parameters)) self.dictionary.save(nameprefix+'.gensimdict') self.topicmodel.save(nameprefix+'.gensimmodel') @@ -223,101 +252,134 @@ def savemodel(self, nameprefix): if self.toweigh: self.tfidf.save(nameprefix+'.gensimtfidf') + def get_info(self) -> dict[str, Any]: + return {} -lda_suffices = ['.json', '.gensimdict', '.gensimmodel.state', - '.gensimtfidf', '.gensimmodel', '.gensimmat'] -if gensim.__version__ >= '1.0.0': - lda_suffices += ['.gensimmodel.expElogbeta.npy', '.gensimmodel.id2word'] +lda_suffices = [ + '.json', '.gensimdict', '.gensimmodel.state', '.gensimtfidf', '.gensimmodel', + '.gensimmat', '.gensimmodel.expElogbeta.npy', '.gensimmodel.id2word' +] -class LDAModeler(GensimTopicModeler, CompactIOMachine): - """ - This class facilitates the creation of LDA (latent Dirichlet Allocation) topic models, - with the given short text training data, and convert future - short text into topic vectors using the trained topic model. - This class extends :class:`GensimTopicModeler`. 
- """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - toweigh=True, - normalize=True): - GensimTopicModeler.__init__(self, - preprocessor=preprocessor, - algorithm='lda', - toweigh=toweigh, - normalize=normalize) - CompactIOMachine.__init__(self, {'classifier': 'ldatopic'}, 'ldatopic', lda_suffices) +class LDAModeler(GensimTopicModeler, CompactIOMachine): + """LDA topic modeler with compact I/O support.""" + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + toweigh: bool = True, + normalize: bool = True + ): + GensimTopicModeler.__init__( + self, + preprocessor=preprocessor, + tokenizer=tokenizer, + algorithm="lda", + toweigh=toweigh, + normalize=normalize + ) + CompactIOMachine.__init__( + self, {'classifier': 'ldatopic'}, 'ldatopic', lda_suffices + ) + + def get_info(self) -> dict[str, Any]: + return CompactIOMachine.get_info(self) lsi_suffices = ['.json', '.gensimdict', '.gensimtfidf', '.gensimmodel.projection', - '.gensimmodel', '.gensimmat', ] + '.gensimmodel', '.gensimmat'] class LSIModeler(GensimTopicModeler, CompactIOMachine): - """ - This class facilitates the creation of LSI (latent semantic indexing) topic models, - with the given short text training data, and convert future - short text into topic vectors using the trained topic model. - - This class extends :class:`GensimTopicModeler`. 
- """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - toweigh=True, - normalize=True): - GensimTopicModeler.__init__(self, - preprocessor=preprocessor, - algorithm='lsi', - toweigh=toweigh, - normalize=normalize) - CompactIOMachine.__init__(self, {'classifier': 'lsitopic'}, 'lsitopic', lsi_suffices) + """LSI topic modeler with compact I/O support.""" + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + toweigh: bool = True, + normalize: bool = True + ): + GensimTopicModeler.__init__( + self, + preprocessor=preprocessor, + tokenizer=tokenizer, + algorithm="lsi", + toweigh=toweigh, + normalize=normalize + ) + CompactIOMachine.__init__( + self, {'classifier': 'lsitopic'}, 'lsitopic', lsi_suffices + ) + + def get_info(self) -> dict[str, Any]: + return CompactIOMachine.get_info(self) rp_suffices = ['.json', '.gensimtfidf', '.gensimmodel', '.gensimmat', '.gensimdict'] class RPModeler(GensimTopicModeler, CompactIOMachine): - """ - This class facilitates the creation of RP (random projection) topic models, - with the given short text training data, and convert future - short text into topic vectors using the trained topic model. - - This class extends :class:`GensimTopicModeler`. - """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - toweigh=True, - normalize=True): - GensimTopicModeler.__init__(self, - preprocessor=preprocessor, - algorithm='rp', - toweigh=toweigh, - normalize=normalize) - CompactIOMachine.__init__(self, {'classifier': 'rptopic'}, 'rptopic', rp_suffices) - - -def load_gensimtopicmodel(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the gensim topic modeler from files. - - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param preprocessor: function that preprocesses the text. 
(Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a topic modeler - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: GensimTopicModeler + """Random Projection topic modeler with compact I/O support.""" + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + toweigh: bool = True, + normalize: bool = True + ): + GensimTopicModeler.__init__( + self, + preprocessor=preprocessor, + tokenizer=tokenizer, + algorithm="rp", + toweigh=toweigh, + normalize=normalize + ) + CompactIOMachine.__init__( + self, {'classifier': 'rptopic'}, 'rptopic', rp_suffices + ) + + def get_info(self) -> dict[str, Any]: + return CompactIOMachine.get_info(self) + + +def load_gensimtopicmodel( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool = True +) -> GensimTopicModeler: + """Load a gensim topic model from files. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. + compact: Whether to load compact model. Default: True. + + Returns: + A topic modeler instance. 
""" if compact: - modelerdict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler} + modeler_dict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler} classifier_name = str(get_model_classifier_name(name)) + if classifier_name not in modeler_dict.keys(): + raise ValueError(f"Unknown classifier name: {classifier_name}") - topicmodeler = modelerdict[classifier_name](preprocessor=preprocessor) - topicmodeler.load_compact_model(name) - return topicmodeler + topic_modeler = modeler_dict[classifier_name](preprocessor=preprocessor, tokenizer=tokenizer) + topic_modeler.load_compact_model(name) else: - topicmodeler = GensimTopicModeler(preprocessor=preprocessor) - topicmodeler.loadmodel(name) - return topicmodeler + modeler_dict = {'lda': LDAModeler, 'lsi': LSIModeler, 'rp': RPModeler} + + config_info = orjson.loads(open(name+".json", "rb").read()) + algorithm_name = config_info.get("algorithm") + if algorithm_name is None: + raise ValueError("No classifier name!") + if algorithm_name not in modeler_dict.keys(): + raise ValueError(f"Unknown classifier name: {algorithm_name}") + + topic_modeler = modeler_dict[algorithm_name](preprocessor=preprocessor, tokenizer=tokenizer) + topic_modeler.loadmodel(name) + return topic_modeler diff --git a/src/shorttext/generators/bow/LatentTopicModeling.py b/src/shorttext/generators/bow/LatentTopicModeling.py index f1e12319..66af42ef 100644 --- a/src/shorttext/generators/bow/LatentTopicModeling.py +++ b/src/shorttext/generators/bow/LatentTopicModeling.py @@ -1,144 +1,161 @@ from abc import ABC, abstractmethod +from typing import Optional, Any import numpy as np +import numpy.typing as npt -from ...utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e +from ...utils import textpreprocessing as textpreprocess, classification_exceptions as e from ...utils.textpreprocessing import tokenize + # abstract class class LatentTopicModeler(ABC): + """Abstract base 
class for topic modelers. + + Provides interface for converting short texts to topic vector + representations using various topic modeling algorithms. """ - Abstract class for various topic modeler. - """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - normalize=True): - """ Initialize the modeler. - - :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :type preprocessor: function - :type normalize: bool + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + normalize: bool = True + ): + """Initialize the topic modeler. + + Args: + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + tokenizer: Tokenization function. Default: tokenize. + normalize: Whether to normalize output vectors. Default: True. """ - self.preprocessor = preprocessor + if preprocessor is None: + self.preprocess_func = textpreprocess.standard_text_preprocessor_1() + else: + self.preprocess_func = preprocessor + if tokenizer is None: + self.tokenize_func = tokenize + else: + self.tokenize_func = tokenizer + self.normalize = normalize self.trained = False - def generate_corpus(self, classdict): - """ Calculate the gensim dictionary and corpus, and extract the class labels - from the training data. Called by :func:`~train`. + @abstractmethod + def train(self, classdict: dict[str, list[str]], nb_topics: int, *args, **kwargs) -> None: + """Train the topic modeler. + + Args: + classdict: Training data with class labels as keys and texts as values. + nb_topics: Number of latent topics. + *args: Additional arguments for the training algorithm. + **kwargs: Additional keyword arguments. 
- :param classdict: training data - :return: None - :type classdict: dict + Raises: + NotImplementedError: This is an abstract method. """ - self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict, - preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent))) + raise NotImplemented() + @abstractmethod - def train(self, classdict, nb_topics, *args, **kwargs): - """ Train the modeler. - - This is an abstract method of this abstract class, which raise the `NotImplementedException`. - - :param classdict: training data - :param nb_topics: number of latent topics - :param args: arguments to be passed into the wrapped training functions - :param kwargs: arguments to be passed into the wrapped training functions - :return: None - :raise: NotImplementedException - :type classdict: dict - :type nb_topics: int - """ - self.nb_topics = nb_topics - raise e.NotImplementedException() + def retrieve_bow(self, shorttext: str) -> list[tuple[int, int]]: + """Get bag-of-words representation. + + Args: + shorttext: Input text. - def retrieve_bow(self, shorttext): - """ Calculate the gensim bag-of-words representation of the given short text. + Returns: + List of (word_id, count) tuples. - :param shorttext: text to be represented - :return: corpus representation of the text - :type shorttext: str - :rtype: list + Raises: + NotImplementedError: Abstract method. """ - return self.dictionary.doc2bow(tokenize(self.preprocessor(shorttext))) + raise NotImplemented() - def retrieve_bow_vector(self, shorttext, normalize=True): - """ Calculate the vector representation of the bag-of-words in terms of numpy.ndarray. + @abstractmethod + def retrieve_bow_vector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get bag-of-words vector. + + Args: + shorttext: Input text. + + Returns: + BOW vector. - :param shorttext: short text - :param normalize: whether the retrieved topic vectors are normalized. 
(Default: True) - :return: vector represtation of the text - :type shorttext: str - :type normalize: bool - :rtype: numpy.ndarray + Raises: + NotImplementedError: Abstract method. """ - bow = self.retrieve_bow(shorttext) - vec = np.zeros(len(self.dictionary)) - for id, val in bow: - vec[id] = val - if normalize: - vec /= np.linalg.norm(vec) - return vec + raise NotImplemented() @abstractmethod - def retrieve_topicvec(self, shorttext): - """ Calculate the topic vector representation of the short text. + def retrieve_topicvec(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. - This is an abstract method of this abstract class, which raise the `NotImplementedException`. + Args: + shorttext: Input text. - :param shorttext: short text - :return: topic vector - :raise: NotImplementedException - :type shorttext: str - :rtype: numpy.ndarray + Returns: + Topic vector. + + Raises: + NotImplementedError: Abstract method. """ - raise e.NotImplementedException() + raise NotImplemented() @abstractmethod - def get_batch_cos_similarities(self, shorttext): - """ Calculate the cosine similarities of the given short text and all the class labels. + def get_batch_cos_similarities(self, shorttext: str) -> dict[str, float]: + """Get cosine similarities to all classes. + + Args: + shorttext: Input text. - This is an abstract method of this abstract class, which raise the `NotImplementedException`. + Returns: + Dictionary mapping class labels to similarity scores. - :param shorttext: short text - :return: topic vector - :raise: NotImplementedException - :type shorttext: str - :rtype: numpy.ndarray + Raises: + NotImplementedError: Abstract method. 
""" - raise e.NotImplementedException() + raise NotImplemented() - def __getitem__(self, shorttext): + def __getitem__(self, shorttext) -> npt.NDArray[np.float64]: + """Get topic vector for text (shortcut for retrieve_topicvec).""" return self.retrieve_topicvec(shorttext) def __contains__(self, shorttext): + """Check if model is trained.""" if not self.trained: raise e.ModelNotTrainedException() return True @abstractmethod - def loadmodel(self, nameprefix): - """ Load the model from files. + def loadmodel(self, nameprefix: str): + """Load model from files. - This is an abstract method of this abstract class, which raise the `NotImplementedException`. + Args: + nameprefix: Prefix for input files. - :param nameprefix: prefix of the paths of the model files - :return: None - :raise: NotImplementedException - :type nameprefix: str + Raises: + NotImplementedError: Abstract method. """ - raise e.NotImplementedException() + raise NotImplemented() @abstractmethod - def savemodel(self, nameprefix): - """ Save the model to files. + def savemodel(self, nameprefix: str): + """Save model to files. - This is an abstract method of this abstract class, which raise the `NotImplementedException`. + Args: + nameprefix: Prefix for output files. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def get_info(self) -> dict[str, Any]: + """Get model metadata. - :param nameprefix: prefix of the paths of the model files - :return: None - :raise: NotImplementedException - :type nameprefix: str + Returns: + Dictionary with model information. 
""" - raise e.NotImplementedException() \ No newline at end of file + raise NotImplemented() diff --git a/src/shorttext/generators/charbase/char2vec.py b/src/shorttext/generators/charbase/char2vec.py index b62df6dc..c62a43a6 100644 --- a/src/shorttext/generators/charbase/char2vec.py +++ b/src/shorttext/generators/charbase/char2vec.py @@ -1,25 +1,33 @@ from functools import partial +from os import PathLike import numpy as np +import numpy.typing as npt from scipy.sparse import csc_matrix from gensim.corpora import Dictionary from sklearn.preprocessing import OneHotEncoder +from deprecation import deprecated from ...utils.misc import textfile_generator class SentenceToCharVecEncoder: - """ A class that facilitates one-hot encoding from characters to vectors. + """One-hot encoder for character-level text representations. + Converts sentences into one-hot encoded vectors at the character + level. Useful for character-level sequence models. + + Reference: + General architecture inspired by char-RNN and related models. """ - def __init__(self, dictionary, signalchar='\n'): - """ Initialize the one-hot encoding class. - :param dictionary: a gensim dictionary - :param signalchar: signal character, useful for seq2seq models (Default: '\n') - :type dictionary: gensim.corpora.Dictionary - :type signalchar: str + def __init__(self, dictionary: Dictionary, signalchar: str='\n'): + """Initialize the character vector encoder. + + Args: + dictionary: Gensim Dictionary mapping characters to indices. + signalchar: Signal character for sequence markers. Default: '\\n'. """ self.dictionary = dictionary self.signalchar = signalchar @@ -27,31 +35,38 @@ def __init__(self, dictionary, signalchar='\n'): self.onehot_encoder = OneHotEncoder() self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1))) - def calculate_prelim_vec(self, sent): - """ Convert the sentence to a one-hot vector. 
+ def calculate_prelim_vec(self, sent: str) -> npt.NDArray[np.float64]: + """Convert sentence to one-hot character vectors. + + Args: + sent: Input sentence. - :param sent: sentence - :return: a one-hot vector, with each element the code of that character - :type sent: str - :rtype: numpy.array + Returns: + One-hot encoded sparse matrix where each row represents + a character's encoding. """ return self.onehot_encoder.transform( np.array([self.dictionary.token2id[c] for c in sent]).reshape((len(sent), 1)) - ) - - def encode_sentence(self, sent, maxlen, startsig=False, endsig=False): - """ Encode one sentence to a sparse matrix, with each row the expanded vector of each character. - - :param sent: sentence - :param maxlen: maximum length of the sentence - :param startsig: signal character at the beginning of the sentence (Default: False) - :param endsig: signal character at the end of the sentence (Default: False) - :return: matrix representing the sentence - :type sent: str - :type maxlen: int - :type startsig: bool - :type endsig: bool - :rtype: scipy.sparse.csc_matrix + ).astype(np.float64) + + def encode_sentence( + self, + sent: str, + maxlen: int, + startsig: bool = False, + endsig=False + ) -> csc_matrix: + """Encode a sentence to a sparse character vector matrix. + + Args: + sent: Input sentence to encode. + maxlen: Maximum length of the encoded sequence. + startsig: Whether to prepend signal character. Default: False. + endsig: Whether to append signal character. Default: False. + + Returns: + Sparse matrix representing the sentence with shape + (maxlen + startsig + endsig, num_chars). 
""" cor_sent = (self.signalchar if startsig else '') + sent[:min(maxlen, len(sent))] + (self.signalchar if endsig else '') sent_vec = self.calculate_prelim_vec(cor_sent).tocsc() @@ -62,21 +77,26 @@ def encode_sentence(self, sent, maxlen, startsig=False, endsig=False): shape=(maxlen + startsig + endsig, sent_vec.shape[1]), dtype=np.float64) - def encode_sentences(self, sentences, maxlen, sparse=True, startsig=False, endsig=False): - """ Encode many sentences into a rank-3 tensor. - - :param sentences: sentences - :param maxlen: maximum length of one sentence - :param sparse: whether to return a sparse matrix (Default: True) - :param startsig: signal character at the beginning of the sentence (Default: False) - :param endsig: signal character at the end of the sentence (Default: False) - :return: rank-3 tensor of the sentences - :type sentences: list - :type maxlen: int - :type sparse: bool - :type startsig: bool - :type endsig: bool - :rtype: scipy.sparse.csc_matrix or numpy.array + def encode_sentences( + self, + sentences: list[str], + maxlen: int, + sparse: bool = True, + startsig: bool = False, + endsig: bool = False + ) -> list[npt.NDArray[np.float64]] | npt.NDArray[np.float64]: + """Encode multiple sentences into character vectors. + + Args: + sentences: List of sentences to encode. + maxlen: Maximum length for each encoded sentence. + sparse: Whether to return sparse matrices. Default: True. + startsig: Whether to prepend signal character. Default: False. + endsig: Whether to append signal character. Default: False. + + Returns: + If sparse=True: list of sparse matrices. + If sparse=False: numpy array of shape (n_sentences, maxlen, num_chars). 
""" encode_sent_func = partial(self.encode_sentence, startsig=startsig, endsig=endsig, maxlen=maxlen) list_encoded_sentences_map = map(encode_sent_func, sentences) @@ -85,19 +105,42 @@ def encode_sentences(self, sentences, maxlen, sparse=True, startsig=False, endsi else: return np.array([sparsevec.toarray() for sparsevec in list_encoded_sentences_map]) - def __len__(self): + def __len__(self) -> int: + """Return the number of unique characters in the dictionary.""" return len(self.dictionary) -def initSentenceToCharVecEncoder(textfile, encoding=None): - """ Instantiate a class of SentenceToCharVecEncoder from a text file. +def initialize_SentenceToCharVecEncoder( + textfile: str | PathLike, + encoding: bool=None +) -> SentenceToCharVecEncoder: + """Create a SentenceToCharVecEncoder from a text file. + + Builds a character dictionary from the given text file and returns + an encoder instance. - :param textfile: text file - :param encoding: encoding of the text file (Default: None) - :return: an instance of SentenceToCharVecEncoder - :type textfile: file - :type encoding: str - :rtype: SentenceToCharVecEncoder + Args: + textfile: Path to the text file for building the character dictionary. + encoding: Encoding of the text file. Default: None. + + Returns: + A SentenceToCharVecEncoder instance. """ - dictionary = Dictionary(map(lambda line: [c for c in line], textfile_generator(textfile, encoding=encoding))) + dictionary = Dictionary( + map( + lambda line: [c for c in line], + textfile_generator(textfile, encoding=encoding) + ) + ) return SentenceToCharVecEncoder(dictionary) + + +@deprecated(deprecated_in="4.0.0", removed_in="5.0.0") +def initSentenceToCharVecEncoder( + textfile: str | PathLike, + encoding: bool=None +) -> SentenceToCharVecEncoder: + """ + Deprecated. Use initialize_SentenceToCharVecEncoder instead. 
+ """ + return initialize_SentenceToCharVecEncoder(textfile, encoding=encoding) \ No newline at end of file diff --git a/src/shorttext/generators/seq2seq/charbaseS2S.py b/src/shorttext/generators/seq2seq/charbaseS2S.py index 604c9f54..ccaf26a0 100644 --- a/src/shorttext/generators/seq2seq/charbaseS2S.py +++ b/src/shorttext/generators/seq2seq/charbaseS2S.py @@ -1,41 +1,47 @@ -import json +from typing import Literal +from os import PathLike import numpy as np +import numpy.typing as npt import gensim +import orjson -from .s2skeras import Seq2SeqWithKeras, loadSeq2SeqWithKeras, kerasseq2seq_suffices +from .s2skeras import Seq2SeqWithKeras, load_seq2seq_model, kerasseq2seq_suffices from ..charbase.char2vec import SentenceToCharVecEncoder -from ...utils import compactmodel_io as cio +from ...utils.compactmodel_io import CompactIOMachine charbases2s_suffices = kerasseq2seq_suffices + ['_dictionary.dict', '_charbases2s.json'] -class CharBasedSeq2SeqGenerator(cio.CompactIOMachine): - """ Class implementing character-based sequence-to-sequence (seq2seq) learning model. +class CharBasedSeq2SeqGenerator(CompactIOMachine): + """Character-based sequence-to-sequence model. - This class implements the seq2seq model at the character level. This class calls - :class:`Seq2SeqWithKeras`. + Implements seq2seq at the character level. Uses Seq2SeqWithKeras internally. Reference: - - Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv - `_] + Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). + https://arxiv.org/abs/1506.05869 """ - def __init__(self, sent2charvec_encoder, latent_dim, maxlen): - """ Instantiate the class. - - If no one-hot encoder passed in, no compilation will be performed. 
- - :param sent2charvec_encoder: the one-hot encoder - :param latent_dim: number of latent dimension - :param maxlen: maximum length of a sentence - :type sent2charvec_encoder: SentenceToCharVecEncoder - :type latent_dim: int - :type maxlen: int + def __init__( + self, + sent2charvec_encoder: SentenceToCharVecEncoder, + latent_dim: int, + maxlen: int + ): + """Initialize the generator. + + Args: + sent2charvec_encoder: Character encoder. + latent_dim: Number of latent dimensions. + maxlen: Maximum length of a sentence. """ - cio.CompactIOMachine.__init__(self, {'classifier': 'charbases2s'}, 'charbases2s', charbases2s_suffices) + super().__init__( + {'classifier': 'charbases2s'}, + 'charbases2s', + charbases2s_suffices + ) self.compiled = False if sent2charvec_encoder != None: self.sent2charvec_encoder = sent2charvec_encoder @@ -45,59 +51,69 @@ def __init__(self, sent2charvec_encoder, latent_dim, maxlen): self.maxlen = maxlen self.s2sgenerator = Seq2SeqWithKeras(self.nbelem, self.latent_dim) - def compile(self, optimizer='rmsprop', loss='categorical_crossentropy'): - """ Compile the keras model. + def compile( + self, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'rmsprop', + loss: str = 'categorical_crossentropy' + ) -> None: + """Compile the Keras model. - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: rmsprop) - :param loss: loss function available from tensorflow.keras (Default: 'categorical_crossentropy`) - :return: None - :type optimizer: str - :type loss: str + Args: + optimizer: Optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: rmsprop. + loss: Loss function from tensorflow.keras. Default: 'categorical_crossentropy'. 
""" if not self.compiled: self.s2sgenerator.prepare_model() self.s2sgenerator.compile(optimizer=optimizer, loss=loss) self.compiled = True - def prepare_trainingdata(self, txtseq): - """ Transforming sentence to a sequence of numerical vectors. + def prepare_trainingdata( + self, + txtseq: str + ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]]: + """Transform text to numerical vector format. + + Args: + txtseq: Input text. - :param txtseq: text - :return: rank-3 tensors for encoder input, decoder input, and decoder output - :type txtseq: str - :rtype: (numpy.array, numpy.array, numpy.array) + Returns: + Tuple of (encoder_input, decoder_input, decoder_output) as rank-3 tensors. """ encoder_input = self.sent2charvec_encoder.encode_sentences(txtseq[:-1], startsig=True, maxlen=self.maxlen, sparse=False) decoder_input = self.sent2charvec_encoder.encode_sentences(txtseq[1:], startsig=True, maxlen=self.maxlen, sparse=False) decoder_output = self.sent2charvec_encoder.encode_sentences(txtseq[1:], endsig=True, maxlen=self.maxlen, sparse=False) return encoder_input, decoder_input, decoder_output - def train(self, txtseq, batch_size=64, epochs=100, optimizer='rmsprop', loss='categorical_crossentropy'): - """ Train the character-based seq2seq model. - - :param txtseq: text - :param batch_size: batch size (Default: 64) - :param epochs: number of epochs (Default: 100) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: rmsprop) - :param loss: loss function available from tensorflow.keras (Default: 'categorical_crossentropy`) - :return: None - :type txtseq: str - :type batch_size: int - :type epochs: int - :type optimizer: str - :type loss: str + def train( + self, + txtseq: str, + batch_size: int = 64, + epochs: int = 100, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'rmsprop', + loss: str = 'categorical_crossentropy' + ) -> None: + """Train the character-based seq2seq model. + + Args: + txtseq: Training text. + batch_size: Batch size. Default: 64. + epochs: Number of epochs. Default: 100. + optimizer: Optimizer for gradient descent. Default: rmsprop. + loss: Loss function from tensorflow.keras. Default: 'categorical_crossentropy'. """ encoder_input, decoder_input, decoder_output = self.prepare_trainingdata(txtseq) self.compile(optimizer=optimizer, loss=loss) self.s2sgenerator.fit(encoder_input, decoder_input, decoder_output, batch_size=batch_size, epochs=epochs) - def decode(self, txtseq, stochastic=True): - """ Given an input text, produce the output text. + def decode(self, txtseq: str, stochastic: bool=True) -> str: + """Generate output text from input text. - :param txtseq: input text - :return: output text - :type txtseq: str - :rtype: str + Args: + txtseq: Input text. + stochastic: Whether to use stochastic sampling. Default: True. + + Returns: + Generated output text. """ # Encode the input as state vectors. inputvec = np.array([self.sent2charvec_encoder.encode_sentence(txtseq, maxlen=self.maxlen, endsig=True).toarray()]) @@ -138,56 +154,59 @@ def decode(self, txtseq, stochastic=True): return decoded_txtseq - def savemodel(self, prefix, final=False): - """ Save the trained models into multiple files. - - To save it compactly, call :func:`~save_compact_model`. + def savemodel(self, prefix: str, final: bool=False) -> None: + """Save the trained model to files. 
- If `final` is set to `True`, the model cannot be further trained. + For compact save, use save_compact_model instead. - If there is no trained model, a `ModelNotTrainedException` will be thrown. + Args: + prefix: Prefix of the file path. + final: Whether the model is final (cannot be further trained). Default: False. - :param prefix: prefix of the file path - :param final: whether the model is final (that should not be trained further) (Default: False) - :return: None - :type prefix: str - :type final: bool - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If no trained model exists. """ self.s2sgenerator.savemodel(prefix, final=final) self.dictionary.save(prefix+'_dictionary.dict') - json.dump({'maxlen': self.maxlen, 'latent_dim': self.latent_dim}, open(prefix+'_charbases2s.json', 'w')) + open(prefix + '_charbases2s.json', 'wb').write( + orjson.dumps({ + 'maxlen': self.maxlen, 'latent_dim': self.latent_dim + }) + ) - def loadmodel(self, prefix): - """ Load a trained model from various files. + def loadmodel(self, prefix: str) -> None: + """Load a trained model from files. - To load a compact model, call :func:`~load_compact_model`. + For compact load, use load_compact_model instead. - :param prefix: prefix of the file path - :return: None - :type prefix: str + Args: + prefix: Prefix of the file path. 
""" self.dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') - self.s2sgenerator = loadSeq2SeqWithKeras(prefix, compact=False) + self.s2sgenerator = load_seq2seq_model(prefix, compact=False) self.sent2charvec_encoder = SentenceToCharVecEncoder(self.dictionary) self.nbelem = len(self.dictionary) - hyperparameters = json.load(open(prefix+'_charbases2s.json', 'r')) + hyperparameters = orjson.loads(open(prefix+'_charbases2s.json', 'rb').read()) self.latent_dim, self.maxlen = hyperparameters['latent_dim'], hyperparameters['maxlen'] self.compiled = True -def loadCharBasedSeq2SeqGenerator(path, compact=True): - """ Load a trained `CharBasedSeq2SeqGenerator` class from file. - :param path: path of the model file - :param compact: whether it is a compact model (Default: True) - :return: a `CharBasedSeq2SeqGenerator` class for sequence to sequence inference - :type path: str - :type compact: bool - :rtype: CharBasedSeq2SeqGenerator +def loadCharBasedSeq2SeqGenerator( + path: str | PathLike, + compact: bool = True +) -> CharBasedSeq2SeqGenerator: + """Load a trained CharBasedSeq2SeqGenerator from file. + + Args: + path: Path of the model file. + compact: Whether to load a compact model. Default: True. + + Returns: + CharBasedSeq2SeqGenerator instance for seq2seq inference. 
""" seq2seqer = CharBasedSeq2SeqGenerator(None, 0, 0) if compact: seq2seqer.load_compact_model(path) else: seq2seqer.loadmodel(path) - return seq2seqer \ No newline at end of file + return seq2seqer diff --git a/src/shorttext/generators/seq2seq/s2skeras.py b/src/shorttext/generators/seq2seq/s2skeras.py index 36a9c79e..9b8a452b 100644 --- a/src/shorttext/generators/seq2seq/s2skeras.py +++ b/src/shorttext/generators/seq2seq/s2skeras.py @@ -1,54 +1,59 @@ -import json +from typing import Literal +from os import PathLike +import numpy as np +import numpy.typing as npt +import orjson from tensorflow.keras.models import load_model from tensorflow.keras.models import Model from tensorflow.keras.layers import Input, LSTM, Dense +from deprecation import deprecated -from ...utils import compactmodel_io as cio -from ...utils import classification_exceptions as e +from ...utils.compactmodel_io import CompactIOMachine +from ...utils.classification_exceptions import ModelNotTrainedException # Reference: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html kerasseq2seq_suffices = ['.weights.h5', '.json', '_s2s_hyperparam.json', '_encoder.weights.h5', '_encoder.json', '_decoder.h5', '_decoder.weights.json'] -class Seq2SeqWithKeras(cio.CompactIOMachine): - """ Class implementing sequence-to-sequence (seq2seq) learning with keras. +class Seq2SeqWithKeras(CompactIOMachine): + """Sequence-to-sequence (seq2seq) model using Keras. - Reference: + Implements encoder-decoder architecture for sequence generation tasks. - Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto - `_] + Reference: + Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," + ICML (2011). https://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf - Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). 
[`arXiv - `_] + Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," + arXiv:1409.3215 (2014). https://arxiv.org/abs/1409.3215 - Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," *The Keras Blog*. [`Keras - `_] + Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," + The Keras Blog. https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html - Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly - `_] + Aurelien Geron, Hands-On Machine Learning with Scikit-Learn and TensorFlow (Sebastopol, CA: O'Reilly Media, 2017). """ - def __init__(self, vecsize, latent_dim): - """ Instantiate the class. + def __init__(self, vecsize: int, latent_dim: int): + """Initialize the model. - :param vecsize: vector size of the sequence - :param latent_dim: latent dimension in the RNN cell - :type vecsize: int - :type latent_dim: int + Args: + vecsize: Vector size of the sequence. + latent_dim: Latent dimension in the RNN cell. """ - cio.CompactIOMachine.__init__(self, {'classifier': 'kerasseq2seq'}, 'kerasseq2seq', kerasseq2seq_suffices) + super().__init__( + {'classifier': 'kerasseq2seq'}, + 'kerasseq2seq', + kerasseq2seq_suffices + ) self.vecsize = vecsize self.latent_dim = latent_dim self.compiled = False self.trained = False - def prepare_model(self): - """ Prepare the keras model. - - :return: None - """ + def prepare_model(self) -> None: + """Prepare the Keras model.""" # Define an input sequence and process it. 
encoder_inputs = Input(shape=(None, self.vecsize)) encoder = LSTM(self.latent_dim, return_state=True) @@ -88,59 +93,61 @@ def prepare_model(self): self.encoder_model = encoder_model self.decoder_model = decoder_model - def compile(self, optimizer='rmsprop', loss='categorical_crossentropy'): - """ Compile the keras model after preparation running :func:`~prepare_model`. + def compile( + self, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'rmsprop', + loss: str = 'categorical_crossentropy' + ) -> None: + """Compile the Keras model. - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: rmsprop) - :param loss: loss function available from tensorflow.keras (Default: 'categorical_crossentropy`) - :type optimizer: str - :type loss: str - :return: None + Args: + optimizer: Optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: rmsprop. + loss: Loss function from tensorflow.keras. Default: 'categorical_crossentropy'. """ self.model.compile(optimizer=optimizer, loss=loss) self.compiled = True - def fit(self, encoder_input, decoder_input, decoder_output, batch_size=64, epochs=100): - """ Fit the sequence to learn the sequence-to-sequence (seq2seq) model. - - :param encoder_input: encoder input, a rank-3 tensor - :param decoder_input: decoder input, a rank-3 tensor - :param decoder_output: decoder output, a rank-3 tensor - :param batch_size: batch size (Default: 64) - :param epochs: number of epochs (Default: 100) - :return: None - :type encoder_input: numpy.array - :type decoder_input: numpy.array - :type decoder_output: numpy.array - :type batch_size: int - :type epochs: int + def fit( + self, + encoder_input: npt.NDArray[np.float64], + decoder_input: npt.NDArray[np.float64], + decoder_output: npt.NDArray[np.float64], + batch_size: int = 64, + epochs: int = 100 + ) -> None: + """Fit the seq2seq model. 
+ + Args: + encoder_input: Encoder input, a rank-3 tensor. + decoder_input: Decoder input, a rank-3 tensor. + decoder_output: Decoder output, a rank-3 tensor. + batch_size: Batch size. Default: 64. + epochs: Number of epochs. Default: 100. """ self.model.fit([encoder_input, decoder_input], decoder_output, batch_size=batch_size, epochs=epochs) self.trained = True - def savemodel(self, prefix, final=False): - """ Save the trained models into multiple files. - - To save it compactly, call :func:`~save_compact_model`. + def savemodel(self, prefix: str, final: bool=False) -> None: + """Save the trained model to files. - If `final` is set to `True`, the model cannot be further trained. + For compact save, use save_compact_model instead. - If there is no trained model, a `ModelNotTrainedException` will be thrown. + Args: + prefix: Prefix of the file path. + final: Whether the model is final (cannot be further trained). Default: False. - :param prefix: prefix of the file path - :param final: whether the model is final (that should not be trained further) (Default: False) - :return: None - :type prefix: str - :type final: bool - :raise: ModelNotTrainedException + Raises: + ModelNotTrainedException: If no trained model exists. """ if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() # save hyperparameters - json.dump({'vecsize': self.vecsize, 'latent_dim': self.latent_dim}, open(prefix+'_s2s_hyperparam.json', 'w')) + open(prefix + '_s2s_hyperparam.json', 'wb').write( + orjson.dumps({'vecsize': self.vecsize, 'latent_dim': self.latent_dim}) + ) # save whole model if final: @@ -159,16 +166,15 @@ def savemodel(self, prefix, final=False): open(prefix+'_encoder.json', 'w').write(self.encoder_model.to_json()) open(prefix+'_decoder.json', 'w').write(self.decoder_model.to_json()) - def loadmodel(self, prefix): - """ Load a trained model from various files. + def loadmodel(self, prefix: str) -> None: + """Load a trained model from files. 
- To load a compact model, call :func:`~load_compact_model`. + For compact load, use load_compact_model instead. - :param prefix: prefix of the file path - :return: None - :type prefix: str + Args: + prefix: Prefix of the file path. """ - hyperparameters = json.load(open(prefix+'_s2s_hyperparam.json', 'r')) + hyperparameters = orjson.loads(open(prefix+'_s2s_hyperparam.json', 'rb').read()) self.vecsize, self.latent_dim = hyperparameters['vecsize'], hyperparameters['latent_dim'] self.model = load_model(prefix+'.weights.h5') self.encoder_model = load_model(prefix+'_encoder.weights.h5') @@ -176,15 +182,15 @@ def loadmodel(self, prefix): self.trained = True -def loadSeq2SeqWithKeras(path, compact=True): - """ Load a trained `Seq2SeqWithKeras` class from file. +def load_seq2seq_model(path: str | PathLike, compact: bool=True) -> Seq2SeqWithKeras: + """Load a trained Seq2SeqWithKeras model from file. - :param path: path of the model file - :param compact: whether it is a compact model (Default: True) - :return: a `Seq2SeqWithKeras` class for sequence to sequence inference - :type path: str - :type compact: bool - :rtype: Seq2SeqWithKeras + Args: + path: Path of the model file. + compact: Whether to load a compact model. Default: True. + + Returns: + Seq2SeqWithKeras instance for sequence-to-sequence inference. """ generator = Seq2SeqWithKeras(0, 0) if compact: @@ -192,4 +198,12 @@ def loadSeq2SeqWithKeras(path, compact=True): else: generator.loadmodel(path) generator.compiled = True - return generator \ No newline at end of file + return generator + + +@deprecated(deprecated_in="4.0.0", removed_in="5.0.0") +def loadSeq2SeqWithKeras(path: str | PathLike, compact: bool=True) -> Seq2SeqWithKeras: + """ + Deprecated. Call load_seq2seq_model instead. 
+ """ + return load_seq2seq_model(path, compact=compact) diff --git a/src/shorttext/metrics/dynprog/dldist.py b/src/shorttext/metrics/dynprog/dldist.py index d8ff2f2d..462f517f 100644 --- a/src/shorttext/metrics/dynprog/dldist.py +++ b/src/shorttext/metrics/dynprog/dldist.py @@ -5,14 +5,20 @@ @nb.njit def damerau_levenshtein(word1: str, word2: str) -> int: - """ Calculate the Demarau-Levenshtein (DL) distance between two words. - - :param word1: first word - :param word2: seccond word - :return: Damerau-Levenshtein (DL) distance - :type word1: str - :type word2: str - :rtype: int + """Calculate the Damerau-Levenshtein distance between two words. + + Computes the edit distance considering adjacent transpositions + (swapping two adjacent characters counts as one edit). + + Args: + word1: First word. + word2: Second word. + + Returns: + The Damerau-Levenshtein distance between the two words. + + Reference: + https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance """ len1 = len(word1) len2 = len(word2) @@ -36,6 +42,4 @@ def damerau_levenshtein(word1: str, word2: str) -> int: score = min(score, matrix[i-2, j-2] + cost) matrix[i, j] = score - print(matrix) - return matrix[len1, len2] diff --git a/src/shorttext/metrics/dynprog/jaccard.py b/src/shorttext/metrics/dynprog/jaccard.py index 64bc5877..e1091ac0 100644 --- a/src/shorttext/metrics/dynprog/jaccard.py +++ b/src/shorttext/metrics/dynprog/jaccard.py @@ -5,23 +5,25 @@ from .lcp import longest_common_prefix -def similarity(word1, word2): - """ Return the similarity between the two words. - - Return the similarity between the two words, between 0 and 1 inclusively. - The similarity is the maximum of the two values: - - 1 - Damerau-Levenshtein distance between two words / maximum length of the two words - - longest common prefix of the two words / maximum length of the two words - - Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. 
Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE - `_] - - :param word1: a word - :param word2: a word - :return: similarity, between 0 and 1 inclusively - :type word1: str - :type word2: str - :rtype: float +def similarity(word1: str, word2: str) -> float: + """Calculate similarity between two words. + + Computes similarity as the maximum of: + - 1 - Damerau-Levenshtein distance / max length + - Longest common prefix length / max length + + Args: + word1: First word. + word2: Second word. + + Returns: + Similarity score between 0 and 1. + + Reference: + Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, + "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," + IEEE CBMS 2014, pp. 347-350. + http://ieeexplore.ieee.org/abstract/document/6881904/ """ maxlen = max(len(word1), len(word2)) editdistance = damerau_levenshtein(word1, word2) @@ -29,15 +31,18 @@ def similarity(word1, word2): return max(1. - float(editdistance)/maxlen, float(lcp)/maxlen) -def soft_intersection_list(tokens1, tokens2): - """ Return the soft number of intersections between two lists of tokens. +def soft_intersection_list(tokens1: list[str], tokens2: list[str]) -> list[tuple[tuple[str, str], float]]: + """Compute soft intersection between two token lists. - :param tokens1: list of tokens. - :param tokens2: list of tokens. - :return: soft number of intersections. - :type tokens1: list - :type tokens2: list - :rtype: float + Finds the best matching pairs between tokens using similarity, + where each token can only match once. + + Args: + tokens1: First list of tokens. + tokens2: Second list of tokens. + + Returns: + List of ((token1, token2), similarity) tuples representing matches. 
""" intersected_list = [((token1, token2), similarity(token1, token2)) for token1, token2 in product(tokens1, tokens2)] intersected_list = sorted(intersected_list, key=lambda item: item[1], reverse=True) @@ -54,20 +59,25 @@ def soft_intersection_list(tokens1, tokens2): return included_list -def soft_jaccard_score(tokens1, tokens2): - """ Return the soft Jaccard score of the two lists of tokens, between 0 and 1 inclusively. +def soft_jaccard_score(tokens1: list[str], tokens2: list[str]) -> float: + """Compute soft Jaccard score between token lists. + + Uses fuzzy matching based on edit distance and longest common prefix. + + Args: + tokens1: First list of tokens. + tokens2: Second list of tokens. - Reference: Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE - `_] + Returns: + Soft Jaccard score between 0 and 1. - :param tokens1: list of tokens. - :param tokens2: list of tokens. - :return: soft Jaccard score, between 0 and 1 inclusively. - :type tokens1: list - :type tokens2: list - :rtype: float + Reference: + Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, + "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," + IEEE CBMS 2014, pp. 347-350. 
+ http://ieeexplore.ieee.org/abstract/document/6881904/ """ intersection_list = soft_intersection_list(tokens1, tokens2) num_intersections = sum([item[1] for item in intersection_list]) num_unions = len(tokens1) + len(tokens2) - num_intersections - return float(num_intersections)/float(num_unions) + return num_intersections / num_unions diff --git a/src/shorttext/metrics/dynprog/lcp.py b/src/shorttext/metrics/dynprog/lcp.py index acec9515..a91bee2e 100644 --- a/src/shorttext/metrics/dynprog/lcp.py +++ b/src/shorttext/metrics/dynprog/lcp.py @@ -4,14 +4,14 @@ @nb.njit def longest_common_prefix(word1: str, word2: str) -> int: - """ Calculate the longest common prefix (LCP) between two words. + """Calculate the longest common prefix length of two strings. - :param word1: first word - :param word2: seccond word - :return: longest common prefix (LCP) - :type word1: str - :type word2: str - :rtype: int + Args: + word1: First string. + word2: Second string. + + Returns: + Length of the longest common prefix. """ lcp = 0 for i in range(min(len(word1), len(word2))): diff --git a/src/shorttext/metrics/embedfuzzy/jaccard.py b/src/shorttext/metrics/embedfuzzy/jaccard.py index 042df113..7d3c073c 100644 --- a/src/shorttext/metrics/embedfuzzy/jaccard.py +++ b/src/shorttext/metrics/embedfuzzy/jaccard.py @@ -1,26 +1,37 @@ from itertools import product +from typing import Optional import numpy as np -from scipy.spatial.distance import cosine +from gensim.models.keyedvectors import KeyedVectors from ...utils import tokenize +from ...utils.compute import cosine_similarity -def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1-cosine(vec1, vec2)): - """ Compute the Jaccard score between sentences based on their word similarities. +def jaccardscore_sents( + sent1: str, + sent2: str, + wvmodel: KeyedVectors, + sim_words: Optional[callable] = None +) -> float: + """Compute Jaccard score between sentences using embeddings. 
- :param sent1: first sentence - :param sent2: second sentence - :param wvmodel: word-embeding model - :param sim_words: function for calculating the similarities between a pair of word vectors (default: cosine) - :return: soft Jaccard score - :type sent1: str - :type sent2: str - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type sim_words: function - :rtype: float + Uses word embeddings to compute a fuzzy Jaccard score where + word similarity is measured via embedding cosine similarity. + + Args: + sent1: First sentence. + sent2: Second sentence. + wvmodel: Word embedding model. + sim_words: Similarity function for word vectors. Default: cosine. + + Returns: + Fuzzy Jaccard score between 0 and 1. """ + if sim_words is None: + sim_words = cosine_similarity + tokens1 = tokenize(sent1) tokens2 = tokenize(sent2) tokens1 = list(filter(lambda w: w in wvmodel, tokens1)) @@ -28,7 +39,7 @@ def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1-cos allowable1 = [True] * len(tokens1) allowable2 = [True] * len(tokens2) - simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]]) + simdict = {(i, j): sim_words(wvmodel[tokens1[i]].astype(np.float64), wvmodel[tokens2[j]].astype(np.float64)) for i, j in product(range(len(tokens1)), range(len(tokens2)))} intersection = 0.0 diff --git a/src/shorttext/metrics/wasserstein/wordmoverdist.py b/src/shorttext/metrics/wasserstein/wordmoverdist.py index c127d841..75bb8caf 100644 --- a/src/shorttext/metrics/wasserstein/wordmoverdist.py +++ b/src/shorttext/metrics/wasserstein/wordmoverdist.py @@ -1,39 +1,49 @@ from itertools import product +from typing import Optional import warnings import numpy as np from scipy.spatial.distance import euclidean from scipy.sparse import csr_matrix -from scipy.optimize import linprog +from scipy.optimize import linprog, OptimizeResult +from gensim.models.keyedvectors import KeyedVectors from ...utils.gensim_corpora import tokens_to_fracdict -def 
word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean): - """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP problem class. +def word_mover_distance_linprog( + first_sent_tokens: list[str], + second_sent_tokens: list[str], + wvmodel: KeyedVectors, + distancefunc: Optional[callable] = None +) -> OptimizeResult: + """Compute Word Mover's distance via linear programming. - Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding - model has to be provided. The whole `scipy.optimize.Optimize` object is returned. + Uses scipy.optimize.linprog to compute the transport problem + for the Word Mover's Distance. - Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). + Args: + first_sent_tokens: First list of tokens. + second_sent_tokens: Second list of tokens. + wvmodel: Word embedding model. + distancefunc: Distance function for word vectors. Default: Euclidean. - :param first_sent_tokens: first list of tokens. - :param second_sent_tokens: second list of tokens. - :param wvmodel: word-embedding models. - :param distancefunc: distance function that takes two numpy ndarray. - :return: the whole result of the linear programming problem - :type first_sent_tokens: list - :type second_sent_tokens: list - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type distancefunc: function - :rtype: scipy.optimize.OptimizeResult + Returns: + scipy.optimize.OptimizeResult containing the optimization result. + + Reference: + Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, + "From Word Embeddings to Document Distances," ICML 2015. 
""" + if distancefunc is None: + distancefunc = euclidean + nb_tokens_first_sent = len(first_sent_tokens) nb_tokens_second_sent = len(second_sent_tokens) all_tokens = list(set(first_sent_tokens+second_sent_tokens)) - wordvecs = {token: wvmodel[token] for token in all_tokens} + wordvecs = {token: wvmodel[token].astype(np.float64) for token in all_tokens} first_sent_buckets = tokens_to_fracdict(first_sent_tokens) second_sent_buckets = tokens_to_fracdict(second_sent_tokens) @@ -64,31 +74,38 @@ def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, return linprog(T, A_eq=Aeq, b_eq=beq) -def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None): - """ Compute the Word Mover's distance (WMD) between the two given lists of tokens. +def word_mover_distance( + first_sent_tokens: list[str], + second_sent_tokens: list[str], + wvmodel: KeyedVectors, + distancefunc: Optional[callable] = None +) -> float: + """Compute Word Mover's distance between token lists. + + Uses word embeddings to compute the minimum transport cost + between words in two sentences. - Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding - model has to be provided. WMD is returned. + Args: + first_sent_tokens: First list of tokens. + second_sent_tokens: Second list of tokens. + wvmodel: Word embedding model. + distancefunc: Distance function for word vectors. Default: Euclidean. - Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015). + Returns: + The Word Mover's distance (lower is more similar). - :param first_sent_tokens: first list of tokens. - :param second_sent_tokens: second list of tokens. - :param wvmodel: word-embedding models. - :param distancefunc: distance function that takes two numpy ndarray. - :param lpFile: deprecated, kept for backward incompatibility. 
(default: None) - :return: Word Mover's distance (WMD) - :type first_sent_tokens: list - :type second_sent_tokens: list - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type distancefunc: function - :type lpFile: str - :rtype: float + Reference: + Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, + "From Word Embeddings to Document Distances," ICML 2015. """ - linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, - distancefunc=distancefunc) - if lpFile is not None: - warnings.warn('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as ' + \ - 'the package `pulp` is no longer used. Check your code if there is a dependency on ' + \ - 'this parameter.') + if distancefunc is None: + distancefunc = euclidean + + linprog_result = word_mover_distance_linprog( + first_sent_tokens, + second_sent_tokens, + wvmodel, + distancefunc=distancefunc + ) + return linprog_result['fun'] diff --git a/src/shorttext/schemas/__init__.py b/src/shorttext/schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/shorttext/schemas/models.py b/src/shorttext/schemas/models.py new file mode 100644 index 00000000..b85d3f13 --- /dev/null +++ b/src/shorttext/schemas/models.py @@ -0,0 +1,18 @@ + +from dataclasses import dataclass + +from tensorflow.keras import Model + + +@dataclass +class AutoEncoderPackage: + """Package containing autoencoder components. + + Attributes: + autoencoder: The full autoencoder model. + encoder: The encoder part of the autoencoder. + decoder: The decoder part of the autoencoder. 
+ """ + autoencoder: Model + encoder: Model + decoder: Model diff --git a/src/shorttext/smartload.py b/src/shorttext/smartload.py index 9171b4a1..61417cef 100644 --- a/src/shorttext/smartload.py +++ b/src/shorttext/smartload.py @@ -1,55 +1,71 @@ +from typing import Optional +from os import PathLike + +import gensim + from .utils import standard_text_preprocessor_1 from .utils import compactmodel_io as cio from .utils import classification_exceptions as e -from .utils import load_DocumentTermMatrix from .classifiers import load_varnnlibvec_classifier, load_sumword2vec_classifier from .generators import load_autoencoder_topicmodel, load_gensimtopicmodel -from .generators import loadSeq2SeqWithKeras, loadCharBasedSeq2SeqGenerator +from .generators import load_seq2seq_model, loadCharBasedSeq2SeqGenerator from .classifiers import load_autoencoder_topic_sklearnclassifier, load_gensim_topicvec_sklearnclassifier from .classifiers import load_maxent_classifier +from .utils.dtm import load_numpy_documentmatrixmatrix + +def smartload_compact_model( + filename: str | PathLike, + wvmodel: Optional[gensim.models.keyedvectors.KeyedVectors], + preprocessor: Optional[callable] = None, + vecsize: Optional[int] = None +): + """Load a classifier or model from a compact file. -def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1(), vecsize=None): - """ Load appropriate classifier or model from the binary model. + Automatically detects the model type and loads the appropriate classifier. + Set wvmodel to None if no word embedding model is needed. - The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed. + Args: + filename: Path to the compact model file. + wvmodel: Word embedding model. Can be None for non-embedding models. + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + vecsize: Vector size. Default: None (extracted from model). 
- :param filename: path of the compact model file - :param wvmodel: Word2Vec model - :param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model) - :return: appropriate classifier or model - :raise: AlgorithmNotExistException - :type filename: str - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type preprocessor: function - :type vecsize: int + Returns: + Appropriate classifier or model instance. + + Raises: + AlgorithmNotExistException: If model type is unknown. """ + if preprocessor is None: + preprocessor = standard_text_preprocessor_1() + classifier_name = cio.get_model_classifier_name(filename) - if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']: - return load_gensimtopicmodel(filename, preprocessor=preprocessor, compact=True) - elif classifier_name in ['kerasautoencoder']: - return load_autoencoder_topicmodel(filename, preprocessor=preprocessor, compact=True) - elif classifier_name in ['topic_sklearn']: - topicmodel = cio.get_model_config_field(filename, 'topicmodel') - if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: - return load_gensim_topicvec_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) - elif topicmodel in ['kerasautoencoder']: - return load_autoencoder_topic_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) - else: - raise e.AlgorithmNotExistException(topicmodel) - elif classifier_name in ['nnlibvec']: - return load_varnnlibvec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) - elif classifier_name in ['sumvec']: - return load_sumword2vec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) - elif classifier_name in ['maxent']: - return load_maxent_classifier(filename, compact=True) - elif classifier_name in ['dtm']: - return load_DocumentTermMatrix(filename, compact=True) - elif 
classifier_name in ['kerasseq2seq']: - return loadSeq2SeqWithKeras(filename, compact=True) - elif classifier_name in ['charbases2s']: - return loadCharBasedSeq2SeqGenerator(filename, compact=True) - else: - raise e.AlgorithmNotExistException(classifier_name) \ No newline at end of file + match classifier_name: + case 'ldatopic' | 'lsitopic' | 'rptopic': + return load_gensimtopicmodel(filename, preprocessor=preprocessor, compact=True) + case 'kerasautoencoder': + return load_autoencoder_topicmodel(filename, preprocessor=preprocessor, compact=True) + case 'topic_sklearn': + topicmodel = cio.get_model_config_field(filename, 'topicmodel') + if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: + return load_gensim_topicvec_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) + elif topicmodel in ['kerasautoencoder']: + return load_autoencoder_topic_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) + else: + raise e.AlgorithmNotExistException(topicmodel) + case 'nnlibvec': + return load_varnnlibvec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) + case 'sumvec': + return load_sumword2vec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) + case 'maxent': + return load_maxent_classifier(filename, compact=True) + case 'kerasseq2seq': + return load_seq2seq_model(filename, compact=True) + case 'charbases2s': + return loadCharBasedSeq2SeqGenerator(filename, compact=True) + case "npdtm": + return load_numpy_documentmatrixmatrix(filename) + case _: + raise e.AlgorithmNotExistException(classifier_name) diff --git a/src/shorttext/spell/basespellcorrector.py b/src/shorttext/spell/basespellcorrector.py index 4201beac..c9632c04 100644 --- a/src/shorttext/spell/basespellcorrector.py +++ b/src/shorttext/spell/basespellcorrector.py @@ -1,31 +1,30 @@ from abc import ABC, abstractmethod -from ..utils.classification_exceptions import NotImplementedException - class SpellCorrector(ABC): - """ Base class for all spell corrector. 
- - This class is not implemented; this is an "abstract class." + """Abstract base class for spell correctors. + Defines the interface for spelling correction algorithms. """ + @abstractmethod - def train(self, text): - """ Train the spell corrector with the given corpus. + def train(self, text: str) -> None: + """Train the spell corrector on a corpus. - :param text: training corpus - :type text: str + Args: + text: Training text corpus. """ - raise NotImplementedException() + raise NotImplementedError() @abstractmethod - def correct(self, word): - """ Recommend a spell correction to given the word. + def correct(self, word: str) -> str: + """Recommend a spelling correction for a word. + + Args: + word: Word to correct. - :param word: word to be checked - :return: recommended correction - :type word: str - :rtype: str + Returns: + The corrected word. """ return word diff --git a/src/shorttext/spell/editor.py b/src/shorttext/spell/editor.py index 0a501204..bdd9e151 100644 --- a/src/shorttext/spell/editor.py +++ b/src/shorttext/spell/editor.py @@ -1,9 +1,22 @@ +from typing import Generator + import numba as nb @nb.njit -def compute_set_edits1(word): +def compute_set_edits1(word: str) -> set[str]: + """Generate all single-edit distance words. + + Creates all possible words that are one edit (insert, delete, + transpose, replace) away from the input word. + + Args: + word: Input word. + + Returns: + Set of all possible single-edit variations. + """ letters = 'abcdefghijklmnopqrstuvwxyz' splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] @@ -18,5 +31,16 @@ def compute_set_edits1(word): @nb.njit -def compute_set_edits2(word): +def compute_set_edits2(word: str) -> Generator[str, None, None]: + """Generate all double-edit distance words. + + Creates all possible words that are two edits away from the + input word by applying compute_set_edits1 to each result. + + Args: + word: Input word. + + Yields: + All possible double-edit variations. 
+ """ return (e2 for e1 in compute_set_edits1(word) for e2 in compute_set_edits1(e1)) diff --git a/src/shorttext/spell/norvig.py b/src/shorttext/spell/norvig.py index f24be4ed..48bc74c2 100644 --- a/src/shorttext/spell/norvig.py +++ b/src/shorttext/spell/norvig.py @@ -3,68 +3,81 @@ import re from collections import Counter +from typing import Generator from . import SpellCorrector from .editor import compute_set_edits1, compute_set_edits2 class NorvigSpellCorrector(SpellCorrector): - """ Spell corrector described by Peter Norvig in his blog. (https://norvig.com/spell-correct.html) + """Spell corrector based on Peter Norvig's algorithm. + Uses word frequency counts to suggest corrections for misspelled + words by finding edits that exist in the vocabulary. + + Reference: + https://norvig.com/spell-correct.html """ - def __init__(self): - """ Instantiate the class - """ + def __init__(self): + """Initialize the spell corrector.""" self.train('') - def train(self, text): - """ Given the text, train the spell corrector. + def train(self, text: str) -> None: + """Train on a text corpus. + + Builds a word frequency dictionary from the input text. - :param text: training corpus - :type text: str + Args: + text: Training text corpus. """ self.words = re.findall('\\w+', text.lower()) self.WORDS = Counter(self.words) self.N = sum(self.WORDS.values()) - def P(self, word): - """ Compute the probability of the words randomly sampled from the training corpus. + def P(self, word: str) -> float: + """Compute word probability from the training corpus. - :param word: a word - :return: probability of the word sampled randomly in the corpus - :type word: str - :rtype: float + Args: + word: Word to get probability for. + + Returns: + Probability of the word appearing in the corpus. 
""" return self.WORDS[word] / float(self.N) - def correct(self, word): - """ Recommend a spelling correction to the given word + def correct(self, word: str) -> str: + """Recommend spelling correction for a word. + + Args: + word: Word to correct. - :param word: a word - :return: recommended correction - :type word: str - :rtype: str + Returns: + Most likely correction, or the original word if no better option. """ return max(self.candidates(word), key=self.P) - def known(self, words): - """ Filter away the words that are not found in the training corpus. + def known(self, words: list[str]) -> set[str]: + """Filter words found in the training vocabulary. + + Args: + words: List of words to check. - :param words: list of words - :return: list of words that can be found in the training corpus - :type words: list - :rtype: list + Returns: + Subset of words that appear in the training corpus. """ return set(w for w in words if w in self.WORDS) - def candidates(self, word): - """ List potential candidates for corrected spelling to the given words. + def candidates(self, word: str) -> set[str] | list[str]: + """Generate spelling correction candidates. + + Checks exact match, then edits of distance 1 and 2. + + Args: + word: Word to find candidates for. - :param word: a word - :return: list of recommended corrections - :type word: str - :rtype: list + Returns: + Viable correction candidates. 
""" return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word]) diff --git a/src/shorttext/stack/stacking.py b/src/shorttext/stack/stacking.py index 7bbb6c09..2a5c256b 100644 --- a/src/shorttext/stack/stacking.py +++ b/src/shorttext/stack/stacking.py @@ -1,48 +1,53 @@ import pickle from abc import ABC, abstractmethod +from typing import Optional, Annotated, Generator, Literal import numpy as np +import numpy.typing as npt from tensorflow.keras.layers import Dense, Reshape from tensorflow.keras.models import Sequential from tensorflow.keras.regularizers import l2 -from ..utils import classification_exceptions as e +from ..utils.classification_exceptions import ModelNotTrainedException from ..utils import kerasmodel_io as kerasio from ..utils.compactmodel_io import CompactIOMachine +from ..classifiers.base import AbstractScorer # abstract class class StackedGeneralization(ABC): - """ - This is an abstract class for any stacked generalization method. It is an intermediate model - that takes the results of other classifiers as the input features, and perform another classification. + """Abstract base class for stacked generalization. - The classifiers must have the :func:`~score` method that takes a string as an input argument. + An intermediate model that takes output from other classifiers as input + features and performs another level of classification. - More references: + The classifiers must have the :meth:`~score` method that takes a string as input. - David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992). + Reference: + David H. Wolpert, "Stacked Generalization," Neural Netw 5: 241-259 (1992). - M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," - *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015). + M. 
Paz Sesmero et al., "Generating ensembles of heterogeneous classifiers + using Stacked Generalization," WIREs Data Mining and Knowledge Discovery 5: 21-34 (2015). """ - def __init__(self, intermediate_classifiers=None): - """ Initialize the stacking class instance. - :param intermediate_classifiers: dictionary, with key being a string, and the values intermediate classifiers, that have the method :func:`~score`, which takes a string as the input argument. - :type intermediate_classifiers: dict + def __init__( + self, + intermediate_classifiers: Optional[dict[str, AbstractScorer]] = None + ): + """Initialize the stacking class. + + Args: + intermediate_classifiers: Dictionary mapping names to classifier instances. """ self.classifiers = intermediate_classifiers if intermediate_classifiers is not None else {} self.classlabels = [] self.trained = False - def register_classifiers(self): - """ Register the intermediate classifiers. - - It must be run before any training. + def register_classifiers(self) -> None: + """Register the intermediate classifiers. - :return: None + Must be called before training. """ self.classifier2idx = {} self.idx2classifier = {} @@ -50,167 +55,168 @@ def register_classifiers(self): self.classifier2idx[key] = idx self.idx2classifier[idx] = key - def register_classlabels(self, labels): - """ Register output labels. + def register_classlabels(self, labels: list[str]) -> None: + """Register output labels. - Given the labels, it gives an integer as the index for each label. - It is essential for the output model to place. + Args: + labels: List of output class labels. - It must be run before any training. - - :param labels: list of output labels - :return: None - :type labels: list + Must be called before training. """ - self.classlabels = list(labels) + self.classlabels = list(labels) self.labels2idx = {classlabel: idx for idx, classlabel in enumerate(self.classlabels)} - def add_classifier(self, name, classifier): - """ Add a classifier. 
- - Add a classifier to the class. The classifier must have the method :func:`~score` which - takes a string as an input argument. + def add_classifier(self, name: str, classifier: AbstractScorer) -> None: + """Add a classifier to the stack. - :param name: name of the classifier, without spaces and any special characters - :param classifier: instance of a classifier, which has a method :func:`~score` which takes a string as an input argument - :return: None - :type name: str - :type classifier: any class with a method :func:`~score` + Args: + name: Name for the classifier (no spaces or special characters). + classifier: Classifier instance with a :meth:`~score` method. """ self.classifiers[name] = classifier self.register_classifiers() - def delete_classifier(self, name): - """ Delete a classifier. + def delete_classifier(self, name: str) -> None: + """Delete a classifier from the stack. - :param name: name of the classifier to be deleted - :return: None - :type name: str - :raise: KeyError + Args: + name: Name of the classifier to delete. + + Raises: + KeyError: If classifier name not found. """ del self.classifiers[name] self.register_classifiers() - def translate_shorttext_intfeature_matrix(self, shorttext): - """ Represent the given short text as the input matrix of the stacking class. + def translate_shorttext_intfeature_matrix( + self, + shorttext: str + ) -> Annotated[npt.NDArray[np.float64], "2D Array"]: + """Convert short text to feature matrix for stacking. + + Args: + shorttext: Input text. - :param shorttext: short text - :return: input matrix of the stacking class - :type shorttext: str - :rtype: numpy.ndarray + Returns: + Feature matrix of shape (n_classifiers, n_labels). 
""" feature_matrix = np.zeros((len(self.classifier2idx), len(self.labels2idx))) - for key in self.classifier2idx: - scoredict = self.classifiers[key].score(shorttext) + for key, idx in self.classifier2idx.items(): + classifier = self.classifiers[key] + scoredict = classifier.score(shorttext) for label in scoredict: - feature_matrix[self.classifier2idx[key], self.labels2idx[label]] = scoredict[label] + feature_matrix[idx, self.labels2idx[label]] = scoredict[label] return feature_matrix - def convert_label_to_buckets(self, label): - """ Convert the label into an array of bucket. + def convert_label_to_buckets( + self, + label: str + ) -> Annotated[npt.NDArray[np.int64], "1D Array"]: + """Convert label to one-hot bucket representation. - Some classification algorithms, especially those of neural networks, have the output - as a serious of buckets with the correct answer being 1 in the correct label, with other being 0. - This method convert the label into the corresponding buckets. + Args: + label: Class label. - :param label: label - :return: array of buckets - :type label: str - :rtype: numpy.ndarray + Returns: + One-hot array with 1 at the label's position. """ - buckets = np.zeros(len(self.labels2idx), dtype=np.int_) + buckets = np.zeros(len(self.labels2idx), dtype=np.int64) buckets[self.labels2idx[label]] = 1 return buckets - def convert_traindata_matrix(self, classdict, tobucket=True): - """ Returns a generator that returns the input matrix and the output labels for training. + def convert_traindata_matrix( + self, + classdict: dict[str, list[str]], + tobucket: bool = True + ) -> Generator[tuple[Annotated[npt.NDArray[np.float64], "2D Array"], Annotated[npt.NDArray[np.int64], "1D Array"]], None, None]: + """Yield training data matrices. + + Args: + classdict: Training data dictionary. + tobucket: Whether to convert labels to buckets. Default: True. 
- :param classdict: dictionary of the training data - :param tobucket: whether to convert the label into buckets (Default: True) - :return: array of input matrix, and output labels - :type classdict: dict - :type tobucket: bool - :rtype: tuple + Yields: + Tuples of (feature_matrix, label_array). """ - for label in classdict: + for label, texts in classdict.items(): y = self.convert_label_to_buckets(label) if tobucket else self.labels2idx[label] - for shorttext in classdict[label]: - X = self.translate_shorttext_intfeature_matrix(shorttext) - yield X, y + for shorttext in texts: + x = self.translate_shorttext_intfeature_matrix(shorttext) + yield x, y @abstractmethod - def train(self, classdict, *args, **kwargs): - """ Train the stacked generalization. - - Not implemented. `NotImplemntedException` raised. - - :param classdict: training data - :param args: arguments to be parsed - :param kwargs: arguments to be parsed - :return: None - :type classdict: dict - :type args: dict - :type kwargs: dict - :raise: NotImplementedException + def train(self, classdict: dict[str, list[str]], *args, **kwargs) -> None: + """Train the stacked generalization model. + + Args: + classdict: Training data. + *args: Additional arguments. + **kwargs: Additional keyword arguments. + + Raises: + NotImplementedError: Abstract method. """ - raise e.NotImplementedException() + raise NotImplementedError() @abstractmethod - def score(self, shorttext, *args, **kwargs): - """ Calculate the scores for each class labels. - - Not implemented. `NotImplemntedException` raised. - - :param shorttext: short text to be scored - :param args: arguments to be parsed - :param kwargs: arguments to be parsed - :return: dictionary of scores for all class labels - :type shorttext: str - :type args: dict - :type kwargs: dict - :rtype: dict - :raise: NotImplementedException + def score(self, shorttext: str, *args, **kwargs) -> dict[str, float]: + """Calculate classification scores for all labels. 
+ + Args: + shorttext: Input text. + *args: Additional arguments. + **kwargs: Additional keyword arguments. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + NotImplementedError: Abstract method. """ - raise e.NotImplementedException() + raise NotImplementedError() class LogisticStackedGeneralization(StackedGeneralization, CompactIOMachine): - """ - This class implements logistic regression as the stacked generalizer. - - It is an intermediate model - that takes the results of other classifiers as the input features, and perform another classification. + """Stacked generalization using logistic regression. - This class saves the stacked logistic model, but not the information of the primary model. + Uses neural network with sigmoid output to combine predictions from + intermediate classifiers. - The classifiers must have the :func:`~score` method that takes a string as an input argument. + Note: + Saves the stacked model but not the intermediate classifiers. """ - def __init__(self, intermediate_classifiers={}): + + def __init__( + self, + intermediate_classifiers: Optional[dict[str, AbstractScorer]] = None, + ): CompactIOMachine.__init__(self, {'classifier': 'stacked_logistics'}, 'stacked_logistics', ['_stackedlogistics.pkl', '_stackedlogistics.weights.h5', '_stackedlogistics.json']) StackedGeneralization.__init__(self, intermediate_classifiers=intermediate_classifiers) - def train(self, classdict, optimizer='adam', l2reg=0.01, bias_l2reg=0.01, nb_epoch=1000): - """ Train the stacked generalization. - - :param classdict: training data - :param optimizer: optimizer to use Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: 'adam', for adam optimizer) - :param l2reg: coefficients for L2-regularization (Default: 0.01) - :param bias_l2reg: coefficients for L2-regularization for bias (Default: 0.01) - :param nb_epoch: number of epochs for training (Default: 1000) - :return: None - :type classdict: dict - :type optimizer: str - :type l2reg: float - :type bias_l2reg: float - :type nb_epoch: int + def train( + self, + classdict: dict[str, list[str]], + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam", + l2reg: float = 0.01, + bias_l2reg: float = 0.01, + nb_epoch: int = 1000 + ) -> None: + """Train the stacked generalization model. + + Args: + classdict: Training data. + optimizer: Optimizer for training. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: adam. + l2reg: L2 regularization coefficient. Default: 0.01. + bias_l2reg: L2 regularization for bias. Default: 0.01. + nb_epoch: Number of training epochs. Default: 1000. """ # register self.register_classifiers() - self.register_classlabels(classdict.keys()) + self.register_classlabels(sorted(classdict.keys())) # sorted the keys kmodel = Sequential() kmodel.add(Reshape((len(self.classifier2idx) * len(self.labels2idx),), @@ -231,22 +237,20 @@ def train(self, classdict, optimizer='adam', l2reg=0.01, bias_l2reg=0.01, nb_epo self.model = kmodel self.trained = True - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all labels. - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. + Args: + shorttext: Input text. 
- If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Returns: + Dictionary mapping class labels to scores. - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() input_matrix = self.translate_shorttext_intfeature_matrix(shorttext) prediction = self.model.predict(np.array([input_matrix])) @@ -255,36 +259,32 @@ def score(self, shorttext): return scoredict - def savemodel(self, nameprefix): - """ Save the logistic stacked model into files. + def savemodel(self, nameprefix: str) -> None: + """Save the stacked model to files. - Save the stacked model into files. Note that the intermediate classifiers - are not saved. Users are advised to save those classifiers separately. + Note: Intermediate classifiers are not saved. Save them separately. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. + Args: + nameprefix: Prefix for output files. - :param nameprefix: prefix of the files - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str + Raises: + ModelNotTrainedException: If model not trained. """ if not self.trained: - raise e.ModelNotTrainedException() + raise ModelNotTrainedException() stackedmodeldict = {'classifiers': self.classifier2idx, 'classlabels': self.classlabels} pickle.dump(stackedmodeldict, open(nameprefix+'_stackedlogistics.pkl', 'wb')) kerasio.save_model(nameprefix+'_stackedlogistics', self.model) - def loadmodel(self, nameprefix): - """ Load the model with the given prefix. + def loadmodel(self, nameprefix: str) -> None: + """Load the stacked model from files. - Load the model with the given prefix of their paths. 
Note that the intermediate - classifiers are not loaded, and users are required to load them separately. + Note: Intermediate classifiers are not loaded. Load them separately. - :param nameprefix: prefix of the model files - :return: None - :type nameprefix: str + Args: + nameprefix: Prefix for input files. """ stackedmodeldict = pickle.load(open(nameprefix+'_stackedlogistics.pkl', 'rb')) self.register_classlabels(stackedmodeldict['classlabels']) @@ -293,7 +293,3 @@ def loadmodel(self, nameprefix): self.model = kerasio.load_model(nameprefix+'_stackedlogistics') self.trained = True - - - - diff --git a/src/shorttext/utils/__init__.py b/src/shorttext/utils/__init__.py index a74f960e..c73208a7 100644 --- a/src/shorttext/utils/__init__.py +++ b/src/shorttext/utils/__init__.py @@ -12,8 +12,6 @@ from .wordembed import load_word2vec_model, load_fasttext_model, load_poincare_model, shorttext_to_avgvec from .wordembed import RESTfulKeyedVectors -from .dtm import load_DocumentTermMatrix - -from .dtm import DocumentTermMatrix, load_DocumentTermMatrix +from .dtm import NumpyDocumentTermMatrix diff --git a/src/shorttext/utils/classification_exceptions.py b/src/shorttext/utils/classification_exceptions.py index 97184a93..44a82010 100644 --- a/src/shorttext/utils/classification_exceptions.py +++ b/src/shorttext/utils/classification_exceptions.py @@ -1,37 +1,49 @@ +from os import PathLike +from pathlib import Path +from deprecation import deprecated +import numpy as np class ModelNotTrainedException(Exception): + """Exception raised when attempting to use an untrained model.""" def __init__(self): self.message = 'Model not trained.' class AlgorithmNotExistException(Exception): - def __init__(self, algoname): - self.message = 'Algorithm '+algoname+' not exist.' + """Exception raised when a requested algorithm is not available.""" + def __init__(self, algoname: str): + self.message = f"Algorithm {algoname} not exist." 
class WordEmbeddingModelNotExistException(Exception): - def __init__(self, path): - self.message = 'Given path of the word-embedding model not exist: '+path + """Exception raised when the word embedding model file is not found.""" + def __init__(self, path: str | PathLike): + self.message = f"Given path of the word-embedding model not exist: {path.as_posix() if isinstance(path, Path) else path}" class UnequalArrayLengthsException(Exception): - def __init__(self, arr1, arr2): - self.message = 'Unequal lengths: '+str(len(arr1))+" and "+str(len(arr2)) + """Exception raised when two arrays have unequal lengths.""" + def __init__(self, arr1: np.ndarray | list, arr2: np.ndarray | list): + self.message = f"Unequal lengths: {len(arr1)} and {len(arr2)}" class NotImplementedException(Exception): + """Exception raised when a method is not implemented.""" + @deprecated(deprecated_in="4.0.0", removed_in="5.0.0") def __init__(self): self.message = 'Method not implemented.' class IncorrectClassificationModelFileException(Exception): - def __init__(self, expectedname, actualname): - self.message = 'Incorrect model (expected: '+expectedname+' ; actual: '+actualname+')' + """Exception raised when model file doesn't match expected type.""" + def __init__(self, expectedname: str, actualname: str): + self.message = f"Incorrect model (expected: {expectedname} ; actual: {actualname})" class OperationNotDefinedException(Exception): - def __init__(self, opname): - self.message = 'Operation '+opname+' not defined' + """Exception raised when an operation is not defined.""" + def __init__(self, opname: str): + self.message = f"Operation {opname} not defined" diff --git a/src/shorttext/utils/compactmodel_io.py b/src/shorttext/utils/compactmodel_io.py index ee775479..346c9030 100644 --- a/src/shorttext/utils/compactmodel_io.py +++ b/src/shorttext/utils/compactmodel_io.py @@ -1,26 +1,29 @@ """ -This module contains general routines to zip all model files into one compact file. 
The model can be copied -or transferred with handiness. +This module contains general routines to zip all model files into one compact file. +The model can be copied or transferred easily. -The methods and decorators in this module are called by other codes. It is not recommended for developers -to call them directly. +The methods and decorators in this module are called by other codes. It is not +recommended for developers to call them directly. """ +from abc import ABC, abstractmethod from tempfile import mkdtemp import zipfile import json import os -from functools import partial +from os import PathLike +from typing import Any, Self + +import orjson from . import classification_exceptions as e -from deprecation import deprecated -def removedir(dir: str): - """ Remove all subdirectories and files under the specified path. +def removedir(dir: str) -> None: + """Remove all subdirectories and files under the specified path. - :param dir: path of the directory to be clean - :return: None + Args: + dir: Path of the directory to clean. """ for filename in os.listdir(dir): if os.path.isdir(filename): @@ -31,29 +34,30 @@ def removedir(dir: str): os.rmdir(dir) -def save_compact_model(filename, savefunc, prefix, suffices, infodict): - """ Save the model in one compact file by zipping all the related files. - - :param filename: name of the model file - :param savefunc: method or function that performs the saving action. Only one argument (str), the prefix of the model files, to be passed. - :param prefix: prefix of the names of the files related to the model - :param suffices: list of suffices - :param infodict: dictionary that holds information about the model. Must contain the key 'classifier'. 
- :return: None - :type filename: str - :type savefunc: function - :type prefix: str - :type suffices: list - :type infodict: dict +def save_compact_model( + filename: str, + savefunc: callable, + prefix: str, + suffices: list[str], + infodict: dict[str, Any] +) -> None: + """Save the model in one compact file by zipping all related files. + + Args: + filename: Name of the output model file. + savefunc: Function that performs the saving action. Takes one argument (str) - the prefix. + prefix: Prefix of the names of the files related to the model. + suffices: List of file suffixes. + infodict: Dictionary with model information. Must contain the key 'classifier'. """ # create temporary directory tempdir = mkdtemp() - savefunc(tempdir+'/'+prefix) + savefunc(os.path.join(tempdir, prefix)) # zipping outputfile = zipfile.ZipFile(filename, mode='w', allowZip64 = True) for suffix in suffices: - outputfile.write(tempdir+'/'+prefix+suffix, prefix+suffix) + outputfile.write(os.path.join(tempdir, prefix+suffix), prefix+suffix) outputfile.writestr('modelconfig.json', json.dumps(infodict)) outputfile.close() @@ -61,18 +65,22 @@ def save_compact_model(filename, savefunc, prefix, suffices, infodict): -def load_compact_model(filename, loadfunc, prefix, infodict): - """ Load a model from a compact file that contains multiple files related to the model. +def load_compact_model( + filename: str, + loadfunc: callable, + prefix: str, + infodict: dict[str, Any] +) -> Any: + """Load a model from a compact file. + + Args: + filename: Name of the model file. + loadfunc: Function that performs the loading action. Takes one argument (str) - the prefix. + prefix: Prefix of the names of the files. + infodict: Dictionary with model information. Must contain the key 'classifier'. - :param filename: name of the model file - :param loadfunc: method or function that performs the loading action. Only one argument (str), the prefix of the model files, to be passed. 
- :param prefix: prefix of the names of the files - :param infodict: dictionary that holds information about the model. Must contain the key 'classifier'. - :return: instance of the model - :type filename: str - :type loadfunc: function - :type prefix: str - :type infodict: dict + Returns: + The loaded model instance. """ # create temporary directory tempdir = mkdtemp() @@ -83,13 +91,15 @@ def load_compact_model(filename, loadfunc, prefix, infodict): inputfile.close() # check model config - readinfodict = json.load(open(tempdir+'/modelconfig.json', 'r')) + readinfodict = json.load(open(os.path.join(tempdir, 'modelconfig.json'), 'r')) if readinfodict['classifier'] != infodict['classifier']: - raise e.IncorrectClassificationModelFileException(infodict['classifier'], - readinfodict['classifier']) + raise e.IncorrectClassificationModelFileException( + infodict['classifier'], + readinfodict['classifier'] + ) # load the model - returnobj = loadfunc(tempdir+'/'+prefix) + returnobj = loadfunc(os.path.join(tempdir, prefix)) # delete temporary files removedir(tempdir) @@ -97,167 +107,100 @@ def load_compact_model(filename, loadfunc, prefix, infodict): return returnobj -class CompactIOMachine: - """ Base class that implements compact model I/O. - - This is to replace the original :func:`compactio` decorator. +class CompactIOMachine(ABC): + """Base class that implements compact model I/O. + Replaces the original compactio decorator. """ - def __init__(self, infodict, prefix, suffices): - """ - :param infodict: information about the model. Must contain the key 'classifier'. - :param prefix: prefix of names of the model file - :param suffices: suffices of the names of the model file - :type infodict: dict - :type prefix: str - :type suffices: list + def __init__( + self, + infodict: dict[str, Any], + prefix: str, + suffices: list[str] + ): + """Initialize the compact I/O machine. + + Args: + infodict: Dictionary with model information. Must contain 'classifier'. 
+ prefix: Prefix for model file names. + suffices: List of file suffixes for the model files. """ self.infodict = infodict self.prefix = prefix self.suffices = suffices - def savemodel(self, nameprefix): - """ Abstract method for `savemodel`. + @abstractmethod + def savemodel(self, nameprefix: str) -> None: + """Save the model to files. - :param nameprefix: prefix of the model path - :type nameprefix: str + Args: + nameprefix: Prefix for model file paths. """ - raise e.OperationNotDefinedException() + raise NotImplementedError() - def loadmodel(self, nameprefix): - """ Abstract method for `loadmodel`. + @abstractmethod + def loadmodel(self, nameprefix: str) -> Self: + """Load the model from files. - :param nameprefix: prefix of the model path - :type nameprefix: str + Args: + nameprefix: Prefix for model file paths. """ - raise e.OperationNotDefinedException() + raise NotImplementedError() - def save_compact_model(self, filename, *args, **kwargs): - """ Save the model in a compressed binary format. + def save_compact_model(self, filename: str, *args, **kwargs) -> None: + """Save the model in a compressed binary format. - :param filename: name of the model file - :param args: arguments - :param kwargs: arguments - :type filename: str - :type args: dict - :type kwargs: dict + Args: + filename: Name of the model file. + *args: Additional arguments. + **kwargs: Additional keyword arguments. """ save_compact_model(filename, self.savemodel, self.prefix, self.suffices, self.infodict, *args, **kwargs) - def load_compact_model(self, filename, *args, **kwargs): - """ Load the model in a compressed binary format. + def load_compact_model(self, filename: str, *args, **kwargs) -> Self: + """Load the model from a compressed binary format. - :param filename: name of the model file - :param args: arguments - :param kwargs: arguments - :type filename: str - :type args: dict - :type kwargs: dict + Args: + filename: Name of the model file. + *args: Additional arguments. 
+ **kwargs: Additional keyword arguments. """ return load_compact_model(filename, self.loadmodel, self.prefix, self.infodict, *args, **kwargs) - def get_info(self): - """ Getting information for the dressed machine. + def get_info(self) -> dict[str, Any]: + """Get model metadata. - :return: dictionary of the information for the dressed machine. - :rtype: dict + Returns: + Dictionary with classifier, prefix, and suffices. """ return {'classifier': self.infodict['classifier'], 'prefix': self.prefix, 'suffices': self.suffices} -# decorator that adds compact model methods to classifier dynamically (deprecated) -@deprecated(deprecated_in="3.0.1", removed_in="4.0.0", - details="Use `CompactIOMachine` instead") -def CompactIOClassifier(Classifier, infodict, prefix, suffices): - """ Returns a decorated class object with additional methods for compact model I/O. - - The class itself must have methods :func:`loadmodel` and :func:`savemodel` that - takes the prefix of the model files as the argument. - - :param Classifier: class to be decorated - :param infodict: information about the model. Must contain the key 'classifier'. 
- :param prefix: prefix of names of the model file - :param suffices: suffices of the names of the model file - :return: the decorated class - :type Classifier: classobj - :type infodict: dict - :type prefix: str - :type suffices: list - :rtype: classobj - """ - # define the inherit class - class DressedClassifier(Classifier): - def save_compact_model(self, filename, *args, **kwargs): - save_compact_model(filename, self.savemodel, prefix, suffices, infodict, *args, **kwargs) - - def load_compact_model(self, filename, *args, **kwargs): - return load_compact_model(filename, self.loadmodel, prefix, infodict, *args, **kwargs) - - def get_info(self): - return {'classifier': infodict['classifier'], - 'prefix': prefix, - 'suffices': suffices} - - DressedClassifier.__name__ = Classifier.__name__ - DressedClassifier.__doc__ = Classifier.__doc__ - - # return decorated classifier - return DressedClassifier - - -# decorator for use (deprecated) -@deprecated(deprecated_in="3.0.1", removed_in="4.0.0", - details="Use `CompactIOMachine` instead") -def compactio(infodict, prefix, suffices): - """ Returns a decorator that performs the decoration by :func:`CompactIOClassifier`. - - :param infodict: information about the model. Must contain the key 'classifier'. - :param prefix: prefix of names of the model file - :param suffices: suffices of the names of the model file - :return: the decorator - :type infodict: dict - :type prefix: str - :type suffices: list - :rtype: function - """ - return partial(CompactIOClassifier, infodict=infodict, prefix=prefix, suffices=suffices) - - -def get_model_config_field(filename, parameter): - """ Return the configuration parameter of a model file. +def get_model_config_field(filename: str | PathLike, parameter: str) -> str: + """Get a configuration parameter from a compact model file. - Read the file `modelconfig.json` in the compact model file, and return - the value of a particular parameter. + Args: + filename: Path to the model file. 
+ parameter: Parameter name to retrieve. - :param filename: path of the model file - :param parameter: parameter to look in - :return: value of the parameter of this model - :type filename: str - :type parameter: str - :rtype: str + Returns: + The parameter value. """ inputfile = zipfile.ZipFile(filename, mode='r') - modelconfig_file = inputfile.open('modelconfig.json', 'r') - modelconfig_json = modelconfig_file.read() - modelconfig_file.close() - if type(modelconfig_json)==bytes: - modelconfig_json = modelconfig_json.decode('utf-8') - readinfodict = json.loads(modelconfig_json) + readinfodict = json.load(inputfile.open("modelconfig.json", "r")) return readinfodict[parameter] -def get_model_classifier_name(filename): - """ Return the name of the classifier from a model file. +def get_model_classifier_name(filename: str| PathLike) -> str: + """Get the classifier name from a compact model file. - Read the file `modelconfig.json` in the compact model file, and return - the name of the classifier. + Args: + filename: Path to the model file. - :param filename: path of the model file - :return: name of the classifier - :type filename: str - :rtype: str + Returns: + The classifier name. """ return get_model_config_field(filename, 'classifier') diff --git a/src/shorttext/utils/compute.py b/src/shorttext/utils/compute.py new file mode 100644 index 00000000..1365cd14 --- /dev/null +++ b/src/shorttext/utils/compute.py @@ -0,0 +1,23 @@ + +from typing import Annotated + +import numpy as np +import numpy.typing as npt +import numba as nb + + +@nb.njit(nb.float64(nb.float64[::1], nb.float64[::1])) +def cosine_similarity( + vec1: Annotated[npt.NDArray[np.float64], "1D array"], + vec2: Annotated[npt.NDArray[np.float64], "1D array"] +) -> float: + """Compute cosine similarity between two vectors. + + Args: + vec1: First vector. + vec2: Second vector. + + Returns: + Cosine similarity score between 0 and 1. 
+ """ + return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) diff --git a/src/shorttext/utils/dtm.py b/src/shorttext/utils/dtm.py index 723f83b2..4e5162ea 100644 --- a/src/shorttext/utils/dtm.py +++ b/src/shorttext/utils/dtm.py @@ -1,31 +1,79 @@ -import pickle -from typing import Optional, Any -from types import FunctionType +from collections import Counter +from typing import Optional, Any, Self, Annotated import numpy as np import numpy.typing as npt import npdict -from gensim.corpora import Dictionary -from gensim.models import TfidfModel -from npdict import SparseArrayWrappedDict -from scipy.sparse import dok_matrix -from deprecation import deprecated +from os import PathLike +import sparse + +from .classification_exceptions import UnequalArrayLengthsException from .compactmodel_io import CompactIOMachine -from .classification_exceptions import NotImplementedException from .textpreprocessing import advanced_text_tokenizer_1 +npdtm_suffices = ["_npdict.npy"] + + +def _construct_sparse_coo_dtm_matrix( + sorted_token_list: list[str], + tokens_counters: list[list[tuple[str, int]]] +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.float64]]: + """Construct sparse COO matrix for document-term matrix. -dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl'] -npdtm_suffices = [] + Args: + sorted_token_list: Sorted list of tokens. + tokens_counters: List of token counters for each document. + + Returns: + Tuple of (x_coords, y_coords, data) for sparse COO matrix. 
+ """ + token_index_map = {token: idx for idx, token in enumerate(sorted_token_list)} + ids_counters = [ + {token_index_map[token]: counts for token, counts in counter} + for counter in tokens_counters + ] + docs_nbtokens = [len(counter) for counter in ids_counters] + nb_coo_data = sum(docs_nbtokens) + coordx_array = np.empty(nb_coo_data, dtype=np.int64) + coordy_array = np.empty(nb_coo_data, dtype=np.int64) + val_array = np.empty(nb_coo_data) + + i = 0 + for doc_id, counter in enumerate(ids_counters): + for tokenid, counts in counter.items(): + coordx_array[i] = doc_id + coordy_array[i] = tokenid + val_array[i] = counts + i += 1 + + return coordx_array, coordy_array, val_array def generate_npdict_document_term_matrix( corpus: list[str], doc_ids: list[Any], - tokenize_func: FunctionType + tokenize_func: callable ) -> npdict.NumpyNDArrayWrappedDict: + """Generate document-term matrix as numpy dict. + + Args: + corpus: List of documents. + doc_ids: List of document IDs. + tokenize_func: Tokenization function. + + Returns: + NumpyNDArrayWrappedDict containing the document-term matrix. + + Raises: + UnequalArrayLengthsException: If corpus and doc_ids have different lengths. 
+ """ + try: + assert len(corpus) == len(doc_ids) + except AssertionError: + raise UnequalArrayLengthsException(corpus, doc_ids) + # grabbing tokens from each document in the corpus doc_tokens = [tokenize_func(document) for document in corpus] tokens_set = set([ @@ -33,19 +81,97 @@ def generate_npdict_document_term_matrix( for document in doc_tokens for token in document ]) - npdtm = npdict.SparseArrayWrappedDict( - [doc_ids, sorted(list(tokens_set))], - default_initial_value=0.0 + sorted_tokens_list = sorted(list(tokens_set)) + tokens_counters = [dict(Counter(tokens)) for tokens in doc_tokens] + tokens_counters_tuples = [[(token, counts) for token, counts in counter.items()] for counter in tokens_counters] + coord_x, coord_y, data = _construct_sparse_coo_dtm_matrix( + sorted_tokens_list, tokens_counters_tuples + ) + npdtm = npdict.SparseArrayWrappedDict.from_sparsearray_given_keywords( + [doc_ids, sorted_tokens_list], + sparse.COO([coord_x, coord_y], data=data, shape=(len(doc_tokens), len(sorted_tokens_list))) ) - for doc_id, document in zip(doc_ids, doc_tokens): - for token in document: - npdtm[doc_id, token] += 1 return npdtm +def convert_classdict_to_corpus( + classdict: dict[str, list[str]], + preprocess_func: callable +) -> tuple[list[str], list[str]]: + """Convert class dictionary to corpus and document IDs. + + Args: + classdict: Training data with class labels as keys and texts as values. + preprocess_func: Text preprocessing function. + + Returns: + Tuple of (corpus, doc_ids). 
+ """ + corpus = [ + preprocess_func(datum) + for doc_under_class in classdict.values() + for datum in doc_under_class + ] + docids = [ + f"{label}-{i}" + for label, doc_under_class in classdict.items() + for i in range(len(doc_under_class)) + ] + return corpus, docids + + +def convert_classdict_to_xy( + classdict: dict[str, list[str]], + labels2idx: dict[str, int], + preprocess_func: callable, + tokenize_func: callable +) -> tuple[npdict.NumpyNDArrayWrappedDict, Annotated[sparse.SparseArray, "2D Array"]]: + """Convert class dictionary to feature matrix and labels. + + Args: + classdict: Training data. + labels2idx: Mapping from labels to indices. + preprocess_func: Text preprocessing function. + tokenize_func: Tokenization function. + + Returns: + Tuple of (document-term matrix, label matrix). + """ + nbdata = sum(len(data) for data in classdict.values()) + nblabels = len(labels2idx) + + # making x + corpus, docids = convert_classdict_to_corpus(classdict, preprocess_func=preprocess_func) + dtm_npdict_matrix = generate_npdict_document_term_matrix(corpus, docids, tokenize_func) + + # making y + y = sparse.COO( + [ + list(range(nbdata)), + [ + labels2idx[label] + for label, doc_under_class in classdict.items() + for _ in doc_under_class + ] + ], + [1.]*nbdata, + shape=(nbdata, nblabels) + ) + + return dtm_npdict_matrix, y + + def compute_document_frequency( npdtm: npdict.NumpyNDArrayWrappedDict ) -> npt.NDArray[np.int32]: + """Compute document frequency for each token. + + Args: + npdtm: Document-term matrix. + + Returns: + Array of document frequencies for each token. + """ if isinstance(npdtm, npdict.SparseArrayWrappedDict): return np.sum(npdtm.to_coo() > 0, axis=0).todense() else: @@ -56,33 +182,55 @@ def compute_tfidf_document_term_matrix( npdtm: npdict.NumpyNDArrayWrappedDict, sparse: bool=True ) -> npdict.NumpyNDArrayWrappedDict: + """Compute TF-IDF weighted document-term matrix. + + Args: + npdtm: Document-term matrix. 
+ sparse: Whether to return sparse format. Default: True. + + Returns: + TF-IDF weighted document-term matrix. + """ doc_frequencies = compute_document_frequency(npdtm) nbdocs = npdtm.dimension_sizes[0] if isinstance(npdtm, npdict.SparseArrayWrappedDict): new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies) - return npdict.SparseArrayWrappedDict.generate_dict(new_dtm_sparray, dense=not sparse) + return npdtm.generate_dict(new_dtm_sparray, dense=not sparse) + + new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies) + new_npdtm = npdtm.generate_dict(new_dtm_nparray) + if sparse: + return npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict( + new_npdtm, default_initial_value=0.0 + ) else: - new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies) - new_npdtm = npdict.NumpyNDArrayWrappedDict.generate_dict(new_dtm_nparray) - if sparse: - new_sparse_dtm = npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict( - new_npdtm, default_initial_value=0.0 - ) - return new_sparse_dtm - else: - return new_npdtm + return new_npdtm class NumpyDocumentTermMatrix(CompactIOMachine): + """Document-term matrix using numpy dict. + + Provides an interface for working with document-term matrices + with compact model I/O support. + """ + def __init__( self, corpus: Optional[list[str]]=None, docids: Optional[list[Any]]=None, tfidf: bool=False, - tokenize_func: Optional[FunctionType]=None + tokenize_func: Optional[callable]=None ): - CompactIOMachine.__init__(self, {'classifier': 'npdtm'}, 'dtm', dtm_suffices) - self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1 + """Initialize the document-term matrix. + + Args: + corpus: List of documents. + docids: List of document IDs. + tfidf: Whether to apply TF-IDF weighting. Default: False. + tokenize_func: Tokenization function. Default: advanced_text_tokenizer_1. 
+ """ + super().__init__({'classifier': 'npdtm'}, 'npdtm', npdtm_suffices) + self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1() # generate DTM if corpus is not None: @@ -93,7 +241,14 @@ def generate_dtm( corpus: list[str], docids: Optional[list[Any]]=None, tfidf: bool=False - ): + ) -> None: + """Generate document-term matrix from corpus. + + Args: + corpus: List of documents. + docids: List of document IDs. + tfidf: Whether to apply TF-IDF weighting. Default: False. + """ # wrangling document IDs if docids is None: doc_ids = [f"doc{i}" for i in range(len(corpus))] @@ -111,229 +266,124 @@ def generate_dtm( self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True) def get_termfreq(self, docid: str, token: str) -> float: + """Get term frequency for a document and token. + + Args: + docid: Document ID. + token: Token. + + Returns: + Term frequency. + """ return self.npdtm[docid, token] def get_total_termfreq(self, token: str) -> float: + """Get total frequency of a token across all documents. + + Args: + token: Token. + + Returns: + Total term frequency. + """ token_index = self.npdtm._keystrings_to_indices[1][token] - if isinstance(self.npdtm, SparseArrayWrappedDict): + if isinstance(self.npdtm, npdict.SparseArrayWrappedDict): matrix = self.npdtm.to_coo() else: matrix = self.npdtm.to_numpy() return np.sum(matrix[:, token_index]) def get_doc_frequency(self, token) -> int: + """Get document frequency of a token. + + Args: + token: Token. + + Returns: + Number of documents containing the token. 
+ """ token_index = self.npdtm._keystrings_to_indices[1][token] if isinstance(self.npdtm, npdict.SparseArrayWrappedDict): freq_array = self.npdtm.to_coo()[:, token_index] - return np.sum(freq_array > 0, axis=0).todense() else: freq_array = self.npdtm.to_numpy()[:, token_index] - return np.sum(freq_array > 0, axis=0) + return np.sum(freq_array > 0, axis=0) def get_token_occurences(self, token: str) -> dict[str, float]: + """Get token occurrences across all documents. + + Args: + token: Token. + + Returns: + Dictionary mapping document IDs to term frequencies. + """ return { docid: self.npdtm[docid, token] for docid in self.npdtm._lists_keystrings[0] } def get_doc_tokens(self, docid: str) -> dict[str, float]: + """Get tokens for a specific document. + + Args: + docid: Document ID. + + Returns: + Dictionary mapping tokens to frequencies. + """ return { token: self.npdtm[docid, token] for token in self.npdtm._lists_keystrings[1] } + def savemodel(self, nameprefix: str) -> None: + """Save the document-term matrix. -@deprecated(deprecated_in="3.0.1", removed_in="4.0.0", - details="Use `NumpyDocumentTermMatrix` instead") -class DocumentTermMatrix(CompactIOMachine): - """ Document-term matrix for corpus. - - This is a class that handles the document-term matrix (DTM). With a given corpus, users can - retrieve term frequency, document frequency, and total term frequency. Weighing using tf-idf - can be applied. - """ - def __init__(self, corpus, docids=None, tfidf=False): - """ Initialize the document-term matrix (DTM) class with a given corpus. - - If document IDs (docids) are given, it will be stored and output as approrpriate. - If not, the documents are indexed by numbers. - - Users can choose to weigh by tf-idf. The default is not to weigh. - - The corpus has to be a list of lists, with each of the inside list contains all the tokens - in each document. - - :param corpus: corpus. - :param docids: list of designated document IDs. 
(Default: None) - :param tfidf: whether to weigh using tf-idf. (Default: False) - :type corpus: list - :type docids: list - :type tfidf: bool + Args: + nameprefix: Prefix for output file. """ - CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices) - if docids is None: - self.docid_dict = {i: i for i in range(len(corpus))} - self.docids = [i for i in range(len(corpus))] - else: - if len(docids) == len(corpus): - self.docid_dict = {docid: i for i, docid in enumerate(docids)} - self.docids = docids - elif len(docids) > len(corpus): - self.docid_dict = {docid: i for i, docid in zip(range(len(corpus)), docids[:len(corpus)])} - self.docids = docids[:len(corpus)] - else: - self.docid_dict = {docid: i for i, docid in enumerate(docids)} - self.docid_dict = {i: i for i in range(len(docids), len(corpus))} - self.docids = docids + [i for i in range(len(docids), len(corpus))] - # generate DTM - self.generate_dtm(corpus, tfidf=tfidf) + self.npdtm.save(nameprefix+"_npdict.npy") - def generate_dtm(self, corpus, tfidf=False): - """ Generate the inside document-term matrix and other peripherical information - objects. This is run when the class is instantiated. + def loadmodel(self, nameprefix: str) -> Self: + """Load the document-term matrix. - :param corpus: corpus. - :param tfidf: whether to weigh using tf-idf. (Default: False) - :return: None - :type corpus: list - :type tfidf: bool - """ - self.dictionary = Dictionary(corpus) - self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float64) - bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus] - if tfidf: - weighted_model = TfidfModel(bow_corpus) - bow_corpus = weighted_model[bow_corpus] - for docid in self.docids: - for tokenid, count in bow_corpus[self.docid_dict[docid]]: - self.dtm[self.docid_dict[docid], tokenid] = count - - def get_termfreq(self, docid, token): - """ Retrieve the term frequency of a given token in a particular document. 
- - Given a token and a particular document ID, compute the term frequency for this - token. If `tfidf` is set to `True` while instantiating the class, it returns the weighted - term frequency. - - :param docid: document ID - :param token: term or token - :return: term frequency or weighted term frequency of the given token in this document (designated by docid) - :type docid: any - :type token: str - :rtype: numpy.float + Args: + nameprefix: Prefix for input file. """ - return self.dtm[self.docid_dict[docid], self.dictionary.token2id[token]] + self.npdtm = npdict.SparseArrayWrappedDict.load(nameprefix+"_npdict.npy") - def get_total_termfreq(self, token): - """ Retrieve the total occurrences of the given token. + @property + def docids(self) -> list[str]: + """List of document IDs.""" + return self.npdtm._lists_keystrings[0] - Compute the total occurrences of the term in all documents. If `tfidf` is set to `True` - while instantiating the class, it returns the sum of weighted term frequency. + @property + def tokens(self) -> list[str]: + """List of tokens.""" + return self.npdtm._lists_keystrings[1] - :param token: term or token - :return: total occurrences of the given token - :type token: str - :rtype: numpy.float - """ - return sum(self.dtm[:, self.dictionary.token2id[token]].values()) + @property + def nbdocs(self) -> int: + """Number of documents.""" + return len(self.docids) - def get_doc_frequency(self, token): - """ Retrieve the document frequency of the given token. + @property + def nbtokens(self) -> int: + """Number of unique tokens.""" + return len(self.tokens) - Compute the document frequency of the given token, i.e., the number of documents - that this token can be found. 
- :param token: term or token - :return: document frequency of the given token - :type token: str - :rtype: int - """ - return len(self.dtm[:, self.dictionary.token2id[token]].values()) +def load_numpy_documentmatrixmatrix(filepath: str | PathLike) -> NumpyDocumentTermMatrix: + """Load a document-term matrix from a compact file. - def get_token_occurences(self, token): - """ Retrieve the term frequencies of a given token in all documents. + Args: + filepath: Path to the compact model file. - Compute the term frequencies of the given token for all the documents. If `tfidf` is - set to be `True` while instantiating the class, it returns the weighted term frequencies. - - This method returns a dictionary of term frequencies with the corresponding document IDs - as the keys. - - :param token: term or token - :return: a dictionary of term frequencies with the corresponding document IDs as the keys - :type token: str - :rtype: dict - """ - return {self.docids[docidx]: count for (docidx, _), count in self.dtm[:, self.dictionary.token2id[token]].items()} - - def get_doc_tokens(self, docid): - """ Retrieve the term frequencies of all tokens in the given document. - - Compute the term frequencies of all tokens for the given document. If `tfidf` is - set to be `True` while instantiating the class, it returns the weighted term frequencies. - - This method returns a dictionary of term frequencies with the tokens as the keys. - - :param docid: document ID - :return: a dictionary of term frequencies with the tokens as the keys - :type docid: any - :rtype: dict - """ - return {self.dictionary[tokenid]: count for (_, tokenid), count in self.dtm[self.docid_dict[docid], :].items()} - - def generate_dtm_dataframe(self): - """ Generate the data frame of the document-term matrix. (shorttext <= 1.0.3) - - Now it raises exception. 
- - :return: data frame of the document-term matrix - :rtype: pandas.DataFrame - :raise: NotImplementedException - """ - raise NotImplementedException() - - def savemodel(self, prefix): - """ Save the model. - - :param prefix: prefix of the files - :return: None - :type prefix: str - """ - pickle.dump(self.docids, open(prefix+'_docids.pkl', 'wb')) - self.dictionary.save(prefix+'_dictionary.dict') - pickle.dump(self.dtm, open(prefix+'_dtm.pkl', 'wb')) - - def loadmodel(self, prefix): - """ Load the model. - - :param prefix: prefix of the files - :return: None - :type prefix: str - """ - self.docids = pickle.load(open(prefix+'_docids.pkl', 'rb')) - self.docid_dict = {docid: i for i, docid in enumerate(self.docids)} - self.dictionary = Dictionary.load(prefix+'_dictionary.dict') - self.dtm = pickle.load(open(prefix+'_dtm.pkl', 'rb')) - - -@deprecated(deprecated_in="3.0.1", removed_in="4.0.0", - details="Use `npdict` instead") -def load_DocumentTermMatrix(filename, compact=True): - """ Load presaved Document-Term Matrix (DTM). - - Given the file name (if `compact` is `True`) or the prefix (if `compact` is `False`), - return the document-term matrix. - - :param filename: file name or prefix - :param compact: whether it is a compact model. (Default: `True`) - :return: document-term matrix - :type filename: str - :type compact: bool - :rtype: DocumentTermMatrix + Returns: + NumpyDocumentTermMatrix instance. 
""" - dtm = DocumentTermMatrix([[]]) - if compact: - dtm.load_compact_model(filename) - else: - dtm.loadmodel(filename) - return dtm \ No newline at end of file + npdtm = NumpyDocumentTermMatrix() + npdtm.load_compact_model(filepath) + return npdtm + diff --git a/src/shorttext/utils/gensim_corpora.py b/src/shorttext/utils/gensim_corpora.py index 9d0de0d6..9d09471e 100644 --- a/src/shorttext/utils/gensim_corpora.py +++ b/src/shorttext/utils/gensim_corpora.py @@ -1,25 +1,29 @@ -from collections import defaultdict +from collections import Counter +from typing import Optional import gensim +from deprecation import deprecated from .textpreprocessing import tokenize -def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize): - """ Generate gensim bag-of-words dictionary and corpus. +def generate_gensim_corpora( + classdict: dict[str, list[str]], + preprocess_and_tokenize: Optional[callable] = None +) -> tuple[gensim.corpora.Dictionary, list[list[tuple[int, int]]], list[str]]: + """Generate gensim dictionary and corpus from training data. - Given a text data, a dict with keys being the class labels, and the values - being the list of short texts, in the same format output by `shorttext.data.data_retrieval`, - return a gensim dictionary and corpus. + Args: + classdict: Training data with class labels as keys and lists of texts as values. + preprocess_and_tokenize: Function to preprocess and tokenize text. Default: tokenize. - :param classdict: text data, a dict with keys being the class labels, and each value is a list of short texts - :param proprocess_and_tokenize: preprocessor function, that takes a short sentence, and return a list of tokens (Default: `shorttext.utils.tokenize`) - :return: a tuple, consisting of a gensim dictionary, a corpus, and a list of class labels - :type classdict: dict - :type proprocess_and_tokenize: function - :rtype: (gensim.corpora.Dictionary, list, list) + Returns: + Tuple of (dictionary, corpus, class_labels). 
""" + if preprocess_and_tokenize is None: + preprocess_and_tokenize = tokenize + classlabels = sorted(classdict.keys()) doc = [preprocess_and_tokenize(' '.join(classdict[classlabel])) for classlabel in classlabels] dictionary = gensim.corpora.Dictionary(doc) @@ -27,50 +31,63 @@ def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize): return dictionary, corpus, classlabels -def save_corpus(dictionary, corpus, prefix): - """ Save gensim corpus and dictionary. +@deprecated(deprecated_in="5.0.0", removed_in="6.0.0") +def save_corpus( + dictionary: gensim.corpora.Dictionary, + corpus: list[list[tuple[int, int]]], + prefix: str +) -> None: + """Save gensim corpus and dictionary to files. + + Args: + dictionary: Dictionary to save. + corpus: Corpus to save. + prefix: Prefix for output files. - :param dictionary: dictionary to save - :param corpus: corpus to save - :param prefix: prefix of the files to save - :return: None - :type dictionary: gensim.corpora.Dictionary - :type corpus: list - :type prefix: str + Note: + Deprecated since 5.0.0, will be removed in 6.0.0. """ dictionary.save(prefix+'_dictionary.dict') gensim.corpora.MmCorpus.serialize(prefix+'_corpus.mm', corpus) -def load_corpus(prefix): - """ Load gensim corpus and dictionary. +@deprecated(deprecated_in="5.0.0", removed_in="6.0.0") +def load_corpus(prefix: str) -> tuple[gensim.corpora.MmCorpus, gensim.corpora.Dictionary]: + """Load gensim corpus and dictionary from files. + + Args: + prefix: Prefix of files to load. + + Returns: + Tuple of (corpus, dictionary). - :param prefix: prefix of the file to load - :return: corpus and dictionary - :type prefix: str - :rtype: tuple + Note: + Deprecated since 5.0.0, will be removed in 6.0.0. 
""" corpus = gensim.corpora.MmCorpus(prefix+'_corpus.mm') dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') return corpus, dictionary -def update_corpus_labels(dictionary, corpus, newclassdict, preprocess_and_tokenize=tokenize): - """ Update corpus with additional training data. - - With the additional training data, the dictionary and corpus are updated. - - :param dictionary: original dictionary - :param corpus: original corpus - :param newclassdict: additional training data - :param preprocess_and_tokenize: preprocessor function, that takes a short sentence, and return a list of tokens (Default: `shorttext.utils.tokenize`) - :return: a tuple, an updated corpus, and the new corpus (for updating model) - :type dictionary: gensim.corpora.Dictionary - :type corpus: list - :type newclassdict: dict - :type preprocess_and_tokenize: function - :rtype: tuple +def update_corpus_labels( + dictionary: gensim.corpora.Dictionary, + corpus: list[list[tuple[int, int]]], + newclassdict: dict[str, list[str]], + preprocess_and_tokenize: Optional[callable] = None +) -> tuple[list[list[tuple[int, int]]], list[list[tuple[int, int]]]]: + """Update corpus with additional training data. + + Args: + dictionary: Existing dictionary. + corpus: Existing corpus. + newclassdict: Additional training data. + preprocess_and_tokenize: Function to preprocess text. Default: tokenize. + + Returns: + Tuple of (updated_corpus, new_corpus). """ + if preprocess_and_tokenize is None: + preprocess_and_tokenize = tokenize newdoc = [preprocess_and_tokenize(' '.join(newclassdict[classlabel])) for classlabel in sorted(newclassdict.keys())] newcorpus = [dictionary.doc2bow(doctokens) for doctokens in newdoc] @@ -79,16 +96,15 @@ def update_corpus_labels(dictionary, corpus, newclassdict, preprocess_and_tokeni return corpus, newcorpus -def tokens_to_fracdict(tokens): - """ Return normalized bag-of-words (BOW) vectors. 
+def tokens_to_fracdict(tokens: list[str]) -> dict[str, float]: + """Convert tokens to normalized frequency dictionary. + + Args: + tokens: List of tokens. - :param tokens: list of tokens. - :type tokens: list - :return: normalized vectors of counts of tokens as a `dict` - :rtype: dict + Returns: + Dictionary with tokens as keys and normalized frequencies as values. """ - cntdict = defaultdict(lambda : 0) - for token in tokens: - cntdict[token] += 1 + cntdict = Counter(tokens) totalcnt = sum(cntdict.values()) - return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()} \ No newline at end of file + return {token: cnt / totalcnt for token, cnt in cntdict.items()} diff --git a/src/shorttext/utils/kerasmodel_io.py b/src/shorttext/utils/kerasmodel_io.py index d772de7c..cb523d4e 100644 --- a/src/shorttext/utils/kerasmodel_io.py +++ b/src/shorttext/utils/kerasmodel_io.py @@ -1,35 +1,28 @@ - +import tensorflow from tensorflow.keras.models import model_from_json -def save_model(nameprefix, model): - """ Save a keras sequential model into files. - - Given a keras sequential model, save the model with the given file path prefix. - It saves the model into a JSON file, and an HDF5 file (.h5). +def save_model(nameprefix: str, model: tensorflow.keras.models.Model) -> None: + """Save a Keras model to files. - :param nameprefix: Prefix of the paths of the model files - :param model: keras sequential model to be saved - :return: None - :type nameprefix: str - :type model: keras.models.Model + Args: + nameprefix: Prefix for output files. + model: Keras model to save. """ model_json = model.to_json() open(nameprefix+'.json', 'w').write(model_json) model.save_weights(nameprefix+'.weights.h5') -def load_model(nameprefix): - """ Load a keras sequential model from files. +def load_model(nameprefix: str) -> tensorflow.keras.models.Model: + """Load a Keras model from files. - Given the prefix of the file paths, load a keras sequential model from - a JSON file and an HDF5 file. 
+ Args: + nameprefix: Prefix for input files. - :param nameprefix: Prefix of the paths of the model files - :return: keras sequential model - :type nameprefix: str - :rtype: keras.models.Model + Returns: + Loaded Keras model. """ model = model_from_json(open(nameprefix+'.json', 'r').read()) model.load_weights(nameprefix+'.weights.h5') - return model \ No newline at end of file + return model diff --git a/src/shorttext/utils/misc.py b/src/shorttext/utils/misc.py index b1b6ad5e..ab95a82f 100644 --- a/src/shorttext/utils/misc.py +++ b/src/shorttext/utils/misc.py @@ -1,16 +1,23 @@ +from typing import Generator +from io import TextIOWrapper -def textfile_generator(textfile, linebreak=True, encoding=None): - """ Return a generator that reads lines in a text file. - - :param textfile: file object of a text file - :param linebreak: whether to return a line break at the end of each line (Default: True) - :param encoding: encoding of the text file (Default: None) - :return: a generator that reads lines in a text file - :type textfile: file - :type linebreak: bool - :type encoding: str - :rtype: generator + + +def textfile_generator( + textfile: TextIOWrapper, + linebreak: bool=True, + encoding: bool=None +) -> Generator[str, None, None]: + """Generator that yields lines from a text file. + + Args: + textfile: File object to read lines from. + linebreak: Whether to include line break at end of each line. Default: True. + encoding: Encoding of the text file. Default: None. + + Yields: + Lines from the text file, stripped of whitespace. """ for t in textfile: if len(t) > 0: @@ -21,17 +28,21 @@ def textfile_generator(textfile, linebreak=True, encoding=None): class SinglePoolExecutor: - """ It is a wrapper for Python `map` functions. + """Wrapper for Python map function. + Provides an interface similar to concurrent.futures.Executor.map + but using a synchronous map implementation. """ + def map(self, func, *iterables): - """ Refer to Python `map` documentation. 
- - :param func: function - :param iterables: iterables to loop - :return: generator for the map - :type func: function - :type iterables: iterables - :rtype: map + """Apply function to iterables element-wise. + + Args: + func: Function to apply to each element. + iterables: One or more iterables to process. + + Returns: + An iterator yielding the results. """ return map(func, *iterables) + return map(func, *iterables) diff --git a/src/shorttext/utils/textpreprocessing.py b/src/shorttext/utils/textpreprocessing.py index 910ac0ef..d3b79e9d 100644 --- a/src/shorttext/utils/textpreprocessing.py +++ b/src/shorttext/utils/textpreprocessing.py @@ -2,8 +2,7 @@ import re import os import codecs -from io import TextIOWrapper -from types import FunctionType +from typing import TextIO from functools import partial import snowballstemmer @@ -11,11 +10,24 @@ # tokenizer def tokenize(s: str) -> list[str]: + """Tokenize a string by splitting on whitespace. + + Args: + s: Input string to tokenize. + + Returns: + List of tokens split by whitespace. + """ return s.split(' ') # stemmer class StemmerSingleton: + """Singleton class for Porter stemmer. + + Provides a singleton instance of the snowball stemmer for English. + """ + def __new__(cls): if not hasattr(cls, 'instance'): cls.instance = super(StemmerSingleton, cls).__new__(cls) @@ -23,36 +35,68 @@ def __new__(cls): return cls.instance def __call__(cls, s: str) -> str: + """Stem a word using Porter stemmer. + + Args: + s: Word to stem. + + Returns: + Stemmed word. + """ return cls.stemmer.stemWord(s) + def stemword(s: str) -> str: + """Stem a word using Porter stemmer. + + Args: + s: Word to stem. + + Returns: + Stemmed word. + """ return StemmerSingleton()(s) -def preprocess_text(text: str, pipeline: list[FunctionType]) -> str: - """ Preprocess the text according to the given pipeline. +def preprocess_text(text: str, pipeline: list[callable]) -> str: + """Preprocess text according to a given pipeline. 
+ + Applies a sequence of preprocessing functions to the input text. + Each function in the pipeline transforms the text (e.g., stemming, + lemmatizing, removing punctuation). - Given the pipeline, which is a list of functions that process an - input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), - preprocess the text. + Args: + text: Input text to preprocess. + pipeline: List of functions that each transform a text string to another text string. - :param text: text to be preprocessed - :param pipeline: a list of functions that convert a text to another text - :return: preprocessed text - :type text: str - :type pipeline: list - :rtype: str + Returns: + The preprocessed text after applying all pipeline functions. """ return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:]) def tokenize_text( text: str, - presplit_pipeline: list[FunctionType], - primitize_tokenizer: FunctionType, - prosplit_pipeline: list[FunctionType], - stopwordsfile: TextIOWrapper + presplit_pipeline: list[callable], + primitize_tokenizer: callable, + postsplit_pipeline: list[callable], + stopwordsfile: TextIO ) -> list[str]: + """Tokenize text with preprocessing pipelines. + + Applies pre-split and post-split pipelines to tokenize text, + filtering out stopwords. + + Args: + text: Input text to tokenize. + presplit_pipeline: List of functions to apply before tokenization. + primitize_tokenizer: Tokenizer function to split text into tokens. + postsplit_pipeline: List of functions to apply to each token after tokenization. + stopwordsfile: File containing stopwords to filter out. + + Returns: + List of tokens after preprocessing and stopword filtering. 
+ """ # load stop words file stopwordset = set([stopword.strip() for stopword in stopwordsfile]) @@ -61,7 +105,7 @@ def tokenize_text( for func in presplit_pipeline: presplit_text = func(presplit_text) postsplit_tokens = primitize_tokenizer(presplit_text) - for func in prosplit_pipeline: + for func in postsplit_pipeline: for i, token in enumerate(postsplit_tokens): postsplit_tokens[i] = func(token) postsplit_tokens = [ @@ -71,47 +115,45 @@ def tokenize_text( return postsplit_tokens -def text_preprocessor(pipeline: list[FunctionType]) -> FunctionType: - """ Return the function that preprocesses text according to the pipeline. +def text_preprocessor(pipeline: list[callable]) -> callable: + """Create a text preprocessor function from a pipeline. - Given the pipeline, which is a list of functions that process an - input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), - return a function that preprocesses an input text outlined by the pipeline, essentially - a function that runs :func:`~preprocess_text` with the specified pipeline. + Returns a function that applies the given pipeline to preprocess text. + This is a convenience function that wraps preprocess_text with + a fixed pipeline. - :param pipeline: a list of functions that convert a text to another text - :return: a function that preprocesses text according to the pipeline - :type pipeline: list - :rtype: function + Args: + pipeline: List of functions that transform text to text. + + Returns: + A callable that takes text and returns preprocessed text. """ return partial(preprocess_text, pipeline=pipeline) -def oldschool_standard_text_preprocessor(stopwordsfile: TextIOWrapper) -> FunctionType: - """ Return a commonly used text preprocessor. - - Return a text preprocessor that is commonly used, with the following steps: +def oldschool_standard_text_preprocessor(stopwordsfile: TextIO) -> callable: + """Create a standard text preprocessor. 
- - removing special characters, - - removing numerals, - - converting all alphabets to lower cases, - - removing stop words, and - - stemming the words (using Porter stemmer). + Returns a text preprocessor with the following steps: + - Remove special characters + - Remove numerals + - Convert to lowercase + - Remove stop words + - Stem words using Porter stemmer - This function calls :func:`~text_preprocessor`. + Args: + stopwordsfile: File object containing stopwords to filter. - :param stopwordsfile: file object of the list of stop words - :type stopwordsfile: file - :return: a function that preprocesses text according to the pipeline - :rtype: function + Returns: + A callable that takes text and returns preprocessed text. """ # load stop words file stopwordset = set([stopword.strip() for stopword in stopwordsfile]) stopwordsfile.close() # the pipeline - pipeline = [lambda s: re.sub('[^\w\s]', '', s), - lambda s: re.sub('[\d]', '', s), + pipeline = [lambda s: re.sub(r'[^\w\s]', '', s), + lambda s: re.sub(r'[0-9]', '', s), lambda s: s.lower(), lambda s: ' '.join(filter(lambda s: not (s in stopwordset), tokenize(s))), lambda s: ' '.join([stemword(stemmed_token) for stemmed_token in tokenize(s)]) @@ -119,21 +161,18 @@ def oldschool_standard_text_preprocessor(stopwordsfile: TextIOWrapper) -> Functi return text_preprocessor(pipeline) -def standard_text_preprocessor_1() -> FunctionType: - """ Return a commonly used text preprocessor. - - Return a text preprocessor that is commonly used, with the following steps: +def standard_text_preprocessor_1() -> callable: + """Create a standard text preprocessor using NLTK stopwords. - - removing special characters, - - removing numerals, - - converting all alphabets to lower cases, - - removing stop words (NLTK list), and - - stemming the words (using Porter stemmer). 
+ Returns a text preprocessor with the following steps: + - Remove special characters + - Remove numerals + - Convert to lowercase + - Remove stop words (NLTK list) + - Stem words using Porter stemmer - This function calls :func:`~oldschool_standard_text_preprocessor`. - - :return: a function that preprocesses text according to the pipeline - :rtype: function + Returns: + A callable that takes text and returns preprocessed text. """ # load stop words this_dir, _ = os.path.split(__file__) @@ -142,21 +181,18 @@ def standard_text_preprocessor_1() -> FunctionType: return oldschool_standard_text_preprocessor(stopwordsfile) -def standard_text_preprocessor_2() -> FunctionType: - """ Return a commonly used text preprocessor. - - Return a text preprocessor that is commonly used, with the following steps: - - - removing special characters, - - removing numerals, - - converting all alphabets to lower cases, - - removing stop words (NLTK list minus negation terms), and - - stemming the words (using Porter stemmer). +def standard_text_preprocessor_2() -> callable: + """Create a standard text preprocessor with negation-aware stopwords. - This function calls :func:`~oldschool_standard_text_preprocessor`. + Returns a text preprocessor with the following steps: + - Remove special characters + - Remove numerals + - Convert to lowercase + - Remove stop words (NLTK list minus negation terms) + - Stem words using Porter stemmer - :return: a function that preprocesses text according to the pipeline - :rtype: function + Returns: + A callable that takes text and returns preprocessed text. """ # load stop words this_dir, _ = os.path.split(__file__) @@ -165,10 +201,22 @@ def standard_text_preprocessor_2() -> FunctionType: return oldschool_standard_text_preprocessor(stopwordsfile) -def advanced_text_tokenizer_1() -> FunctionType: +def advanced_text_tokenizer_1() -> callable: + """Create an advanced text tokenizer. 
+ + Returns a tokenizer function that applies preprocessing steps: + - Remove special characters + - Remove numerals + - Convert to lowercase + - Stem tokens using Porter stemmer + - Filter out negation-aware stopwords + + Returns: + A callable that takes text and returns a list of tokens. + """ presplit_pipeline = [ - lambda s: re.sub('[^\w\s]', '', s), - lambda s: re.sub('[\d]', '', s), + lambda s: re.sub(r'[^\w\s]', '', s), + lambda s: re.sub(r'[0-9]', '', s), lambda s: s.lower() ] tokenizer = tokenize @@ -179,7 +227,7 @@ def advanced_text_tokenizer_1() -> FunctionType: return partial( tokenize_text, presplit_pipeline=presplit_pipeline, - tokenizer=tokenizer, + primitize_tokenizer=tokenizer, postsplit_pipeline=postsplit_pipeline, stopwordsfile=codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8') ) diff --git a/src/shorttext/utils/wordembed.py b/src/shorttext/utils/wordembed.py index 7e153431..8cf77c5e 100644 --- a/src/shorttext/utils/wordembed.py +++ b/src/shorttext/utils/wordembed.py @@ -1,49 +1,64 @@ +from os import PathLike +from typing import Any, Annotated, Optional, TextIO + import numpy as np +import numpy.typing as npt import gensim -from gensim.models import KeyedVectors from gensim.models.keyedvectors import KeyedVectors +from gensim.models.fasttext import FastTextKeyedVectors from gensim.models.poincare import PoincareModel, PoincareKeyedVectors import requests from .textpreprocessing import tokenize -def load_word2vec_model(path, binary=True): - """ Load a pre-trained Word2Vec model. +def load_word2vec_model( + path: str | PathLike, + binary: bool = True +) -> KeyedVectors: + """Load a pre-trained Word2Vec model. + + Args: + path: Path to the Word2Vec model file. + binary: Whether the file is in binary format. Default: True. 
-    :param path: path of the file of the pre-trained Word2Vec model
-    :param binary: whether the file is in binary format (Default: True)
-    :return: a pre-trained Word2Vec model
-    :type path: str
-    :type binary: bool
-    :rtype: gensim.models.keyedvectors.KeyedVectors
+    Returns:
+        A KeyedVectors model containing word embeddings.
     """
     return KeyedVectors.load_word2vec_format(path, binary=binary)
 
 
-def load_fasttext_model(path, encoding='utf-8'):
-    """ Load a pre-trained FastText model.
+def load_fasttext_model(
+    path: str | PathLike,
+    encoding: str = 'utf-8'
+) -> FastTextKeyedVectors:
+    """Load a pre-trained FastText model.
 
-    :param path: path of the file of the pre-trained FastText model
-    :return: a pre-trained FastText model
-    :type path: str
-    :rtype: gensim.models.keyedvectors.FastTextKeyedVectors
+    Args:
+        path: Path to the FastText model file.
+        encoding: File encoding. Default: 'utf-8'.
+
+    Returns:
+        A FastTextKeyedVectors model.
     """
     return gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding)
 
 
-def load_poincare_model(path, word2vec_format=True, binary=False):
-    """ Load a Poincare embedding model.
+def load_poincare_model(
+    path: str | PathLike,
+    word2vec_format: bool = True,
+    binary: bool = False
+) -> PoincareKeyedVectors:
+    """Load a Poincaré embedding model.
+
+    Args:
+        path: Path to the Poincaré model file.
+        word2vec_format: Whether to load from word2vec format. Default: True.
+        binary: Whether file is binary. Default: False.
 
-    :param path: path of the file of the pre-trained Poincare embedding model
-    :param word2vec_format: whether to load from word2vec format (default: True)
-    :param binary: binary format (default: False)
-    :return: a pre-trained Poincare embedding model
-    :type path: str
-    :type word2vec_format: bool
-    :type binary: bool
-    :rtype: gensim.models.poincare.PoincareKeyedVectors
+    Returns:
+        A PoincareKeyedVectors model.
""" if word2vec_format: return PoincareKeyedVectors.load_word2vec_format(path, binary=binary) @@ -51,24 +66,25 @@ def load_poincare_model(path, word2vec_format=True, binary=False): return PoincareModel.load(path).kv -def shorttext_to_avgvec(shorttext, wvmodel): - """ Convert the short text into an averaged embedded vector representation. +def shorttext_to_avgvec( + shorttext: str, + wvmodel: KeyedVectors +) -> Annotated[npt.NDArray[np.float64], "1D array"]: + """Convert short text to averaged embedding vector. - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, sums - them up, and normalize the resulting vector. It returns the resulting vector - that represents this short sentence. + Converts each token to its word embedding, averages them, + and normalizes the result. - :param shorttext: a short sentence - :param wvmodel: word-embedding model - :return: an embedded vector that represents the short sentence - :type shorttext: str - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :rtype: numpy.ndarray + Args: + shorttext: Input text. + wvmodel: Word embedding model. + + Returns: + A normalized vector representation of the text. """ vec = np.sum( [ - wvmodel[token] + wvmodel[token].astype(np.float64) if token in wvmodel else np.array([1.]*wvmodel.vector_size) / np.sqrt(wvmodel.vector_size) for token in tokenize(shorttext) @@ -85,135 +101,151 @@ def shorttext_to_avgvec(shorttext, wvmodel): class RESTfulKeyedVectors(KeyedVectors): - """ RESTfulKeyedVectors, for connecting to the API of the preloaded word-embedding vectors loaded - by `WordEmbedAPI`. + """Remote word vector client via REST API. - This class inherits from :class:`gensim.models.keyedvectors.KeyedVectors`. + Connects to a remote WordEmbedAPI service to access word + embeddings via HTTP requests. + Attributes: + url: Base URL of the API. + port: Port number for the API. 
""" - def __init__(self, url, port='5000'): - """ Initialize the class. - :param url: URL of the API, usually `http://localhost` - :param port: Port number - :type url: str - :type port: str + def __init__(self, url: str, port: str | int='5000'): + """Initialize the client. + + Args: + url: Base URL of the API (e.g., 'http://localhost'). + port: Port number. Default: '5000'. """ self.url = url self.port = port - def closer_than(self, entity1, entity2): - """ + def closer_than(self, entity1: str, entity2: str) -> list | dict: + """Find words closer to entity1 than entity2 is. - :param entity1: word 1 - :param entity2: word 2 - :type entity1: str - :type entity2: str - :return: list of words - :rtype: list + Args: + entity1: First word. + entity2: Reference word. + + Returns: + List of words closer to entity1 than entity2. """ r = requests.post(self.url + ':' + self.port + '/closerthan', json={'entity1': entity1, 'entity2': entity2}) return r.json() - def distance(self, entity1, entity2): - """ + def distance(self, entity1: str, entity2: str) -> float: + """Compute distance between two words. + + Args: + entity1: First word. + entity2: Second word. - :param entity1: word 1 - :param entity2: word 2 - :type entity1: str - :type entity2: str - :return: distance between two words - :rtype: float + Returns: + Distance between the word vectors. """ r = requests.post(self.url + ':' + self.port + '/distance', json={'entity1': entity1, 'entity2': entity2}) return r.json()['distance'] - def distances(self, entity1, other_entities=()): - """ + def distances( + self, + entity1: str, + other_entities: Optional[list[str]] = None + ) -> Annotated[npt.NDArray[np.float64], "1D array"]: + """Compute distances from one word to multiple words. + + Args: + entity1: First word. + other_entities: List of words to compare against. 
- :param entity1: word - :param other_entities: list of words - :type entity1: str - :type other_entities: list - :return: list of distances between `entity1` and each word in `other_entities` - :rtype: list + Returns: + Array of distances. """ + if other_entities is None: + other_entities = [] + r = requests.post(self.url + ':' + self.port + '/distances', json={'entity1': entity1, 'other_entities': other_entities}) return np.array(r.json()['distances'], dtype=np.float32) - def get_vector(self, entity): - """ + def get_vector(self, entity: str) -> Annotated[npt.NDArray[np.float64], "1D array"]: + """Get word vector for a word. + + Args: + entity: Word to get vector for. + + Returns: + Word embedding vector. - :param entity: word - :type: str - :return: word vectors of the given word - :rtype: numpy.ndarray + Raises: + KeyError: If word not in vocabulary. """ r = requests.post(self.url + ':' + self.port + '/get_vector', json={'token': entity}) returned_dict = r.json() if 'vector' in returned_dict: return np.array(returned_dict['vector']) else: - raise KeyError('The token {} does not exist in the model.'.format(entity)) + raise KeyError(f'The token {entity} does not exist in the model.') - def most_similar(self, **kwargs): - """ + def most_similar(self, **kwargs) -> list[tuple[str, float]]: + """Find most similar words. + + Args: + **kwargs: Arguments passed to the API (e.g., positive, negative). - :param kwargs: - :return: + Returns: + List of (word, similarity) tuples. """ r = requests.post(self.url + ':' + self.port + '/most_similar', json=kwargs) return [tuple(pair) for pair in r.json()] - def most_similar_to_given(self, entity1, entities_list): - """ + def most_similar_to_given(self, entity1: str, entities_list: list[str]) -> list[str]: + """Find most similar word from a list to a given word. 
- :param entity1: word - :param entities_list: list of words - :type entity1: str - :type entities_list: list - :return: list of similarities between the given word and each word in `entities_list` - :rtype: list + Args: + entity1: Reference word. + entities_list: List of candidate words. + + Returns: + List of words sorted by similarity. """ r = requests.post(self.url + ':' + self.port + '/most_similar_to_given', json={'entity1': entity1, 'entities_list': entities_list}) return r.json()['token'] - def rank(self, entity1, entity2): - """ + def rank(self, entity1: str, entity2: str) -> int: + """Get similarity rank between two words. + + Args: + entity1: First word. + entity2: Second word. - :param entity1: word 1 - :param entity2: word 2 - :type entity1: str - :type entity2: str - :return: rank - :rtype: int + Returns: + Rank of entity2 relative to entity1. """ r = requests.post(self.url + ':' + self.port + '/rank', json={'entity1': entity1, 'entity2': entity2}) return r.json()['rank'] - def save(self, fname_or_handle, **kwargs): - """ + def save(self, fname_or_handle: TextIO, **kwargs) -> None: + """Save is not supported for remote vectors. - :param fname_or_handle: - :param kwargs: - :return: + Raises: + IOError: Always, since remote vectors cannot be saved locally. """ raise IOError('The class RESTfulKeyedVectors do not persist models to a file.') - def similarity(self, entity1, entity2): - """ + def similarity(self, entity1: str, entity2: str) -> float: + """Compute similarity between two words. + + Args: + entity1: First word. + entity2: Second word. - :param entity1: word 1 - :param entity2: word 2 - :return: similarity between two words - :type entity1: str - :type entity2: str - :rtype: float + Returns: + Similarity score between 0 and 1. 
""" r = requests.post(self.url + ':' + self.port + '/similarity', json={'entity1': entity1, 'entity2': entity2}) diff --git a/test/test_charonehot.py b/test/test_charonehot.py index c5f89aa4..59dbf9a1 100644 --- a/test/test_charonehot.py +++ b/test/test_charonehot.py @@ -1,19 +1,13 @@ -import unittest from urllib.request import urlopen import shorttext -class TestCharOneHot(unittest.TestCase): - def test_BigTxt(self): - chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder( - urlopen('http://norvig.com/big.txt'), - encoding='utf-8' - ) - self.assertEqual(93, len(chartovec_encoder.dictionary)) - self.assertEqual('\n', chartovec_encoder.signalchar) - - -if __name__ == '__main__': - unittest.main() +def test_onehot_bigtxt(): + chartovec_encoder = shorttext.generators.initialize_SentenceToCharVecEncoder( + urlopen('http://norvig.com/big.txt'), + encoding='utf-8' + ) + assert len(chartovec_encoder.dictionary) == 93 + assert chartovec_encoder.signalchar == "\n" diff --git a/test/test_compute.py b/test/test_compute.py new file mode 100644 index 00000000..4377686f --- /dev/null +++ b/test/test_compute.py @@ -0,0 +1,23 @@ + +import numpy as np +import pytest + +from shorttext.utils.compute import cosine_similarity + + +def test_cosine_similarity_1(): + vec1 = np.array([0.3, 0.7]) + vec2 = np.array([-0.7, 0.3]) + assert cosine_similarity(vec1, vec2) == pytest.approx(0.) + + +def test_cosine_similarity_2(): + vec1 = np.array([1., 1.]) + vec2 = np.array([2.5, 2.5]) + assert cosine_similarity(vec1, vec2) == pytest.approx(1.) 
+ + +def test_cosine_similarity_3(): + vec1 = np.array([3., 3.]) + vec2 = np.array([2., 0.]) + assert cosine_similarity(vec1, vec2) == pytest.approx(np.sqrt(0.5)) diff --git a/test/test_dtm.py b/test/test_dtm.py index 0da2780f..813b6828 100644 --- a/test/test_dtm.py +++ b/test/test_dtm.py @@ -1,46 +1,28 @@ -import unittest -import re +import pytest -import pandas as pd import shorttext -from shorttext.utils import stemword, tokenize - - -class TestDTM(unittest.TestCase): - def test_inaugural(self): - # preparing data - usprez = shorttext.data.inaugural() - docids = sorted(usprez.keys()) - usprez = [' '.join(usprez[docid]) for docid in docids] - usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez}) - usprezdf = usprezdf[['yrprez', 'speech']] - - # preprocesser defined - pipeline = [lambda s: re.sub('[^\w\s]', '', s), - lambda s: re.sub('[\d]', '', s), - lambda s: s.lower(), - lambda s: ' '.join([stemword(token) for token in tokenize(s)]) - ] - txtpreprocessor = shorttext.utils.text_preprocessor(pipeline) - - # corpus making - docids = list(usprezdf['yrprez']) - corpus = [txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']] - - # making DTM - dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True) - - # check results - self.assertEqual(len(dtm.dictionary), 5256) - self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.0138, - places=3) - numdocs, numtokens = dtm.dtm.shape - self.assertEqual(numdocs, 56) - self.assertEqual(numtokens, 5256) - self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27865372986738407, - places=3) - - -if __name__ == '__main__': - unittest.main() +from shorttext.utils import stemword +from shorttext.utils.textpreprocessing import standard_text_preprocessor_1 + + +def test_inaugural(): + # preparing data + usprez = shorttext.data.inaugural() + docids = sorted(usprez.keys()) + usprez = [' '.join(usprez[docid]) for docid in docids] + + # preprocesser defined + 
txtpreprocessor = standard_text_preprocessor_1() + + # corpus making + corpus = [txtpreprocessor(speech) for speech in usprez] + + # making DTM + dtm = shorttext.utils.NumpyDocumentTermMatrix(corpus, docids, tfidf=True) + + # check results + assert dtm.get_token_occurences(stemword('change'))['2009-Obama'] == pytest.approx(0.9400072584914713) + assert dtm.nbdocs == 56 + assert dtm.nbtokens == 5075 + assert dtm.get_total_termfreq(stemword('government')) == pytest.approx(37.82606692473982) diff --git a/test/test_fuzzylogic.py b/test/test_fuzzylogic.py index 450ea1b6..5602d5d7 100644 --- a/test/test_fuzzylogic.py +++ b/test/test_fuzzylogic.py @@ -1,34 +1,32 @@ -import unittest +import pytest -import shorttext +from shorttext.metrics.dynprog.dldist import damerau_levenshtein +from shorttext.metrics.dynprog.lcp import longest_common_prefix +from shorttext.metrics.dynprog.jaccard import similarity as jaccard_similarity -class TestFuzzyLogic(unittest.TestCase): - def test_similarity(self): - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('debug', 'deubg'), 1) - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('intrdependence', 'interdpeendencae'), 3) - self.assertEqual(shorttext.metrics.dynprog.lcp.longest_common_prefix('debug', 'debuag'), 4) +def test_similarity(): + assert damerau_levenshtein('debug', 'deubg') == 1 + assert damerau_levenshtein('intrdependence', 'interdpeendencae') == 3 + assert longest_common_prefix('debug', 'debuag') == 4 - def test_transposition(self): - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('independent', 'indeepndent'), 1) - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('providence', 'porvidecne'), 2) +def test_dldistance_transposition(): + assert damerau_levenshtein('independent', 'indeepndent') == 1 + assert damerau_levenshtein('providence', 'porvidecne') == 2 - def test_insertion(self): - 
self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorithms'), 1) - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoarithmm'), 2) +def test_dldistance_insertion(): + assert damerau_levenshtein('algorithm', 'algorithms') == 1 + assert damerau_levenshtein('algorithm', 'algoarithmm') == 2 - def test_deletion(self): - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algoithm'), 1) - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algorith'), 1) - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('algorithm', 'algrihm'), 2) +def test_dldistance_deletion(): + assert damerau_levenshtein('algorithm', 'algoithm') == 1 + assert damerau_levenshtein('algorithm', 'algorith') == 1 + assert damerau_levenshtein('algorithm', 'algrihm') == 2 - def test_correct(self): - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('python', 'python'), 0) - self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('sosad', 'sosad'), 0) +def test_dldistance_correct(): + assert damerau_levenshtein('python', 'python') == 0 + assert damerau_levenshtein('sosad', 'sosad') == 0 - def test_jaccard(self): - self.assertAlmostEqual(shorttext.metrics.dynprog.jaccard.similarity('diver', 'driver'), 5./6.) 
- -if __name__ == '__main__': - unittest.main() \ No newline at end of file +def test_dldistance_jaccard(): + assert jaccard_similarity('diver', 'driver') == pytest.approx(5/6) diff --git a/test/test_norvigspell.py b/test/test_norvigspell.py index b682be40..57597285 100644 --- a/test/test_norvigspell.py +++ b/test/test_norvigspell.py @@ -1,21 +1,14 @@ -import unittest from urllib.request import urlopen import shorttext -class TestSpellCheck(unittest.TestCase): - def setUp(self): - self.text = urlopen('http://norvig.com/big.txt').read() - self.text = self.text.decode('utf-8') +def test_norvig(): + text = urlopen('http://norvig.com/big.txt').read() + text = text.decode("utf-8") + speller = shorttext.spell.NorvigSpellCorrector() + speller.train(text) - def test_norvig(self): - speller = shorttext.spell.NorvigSpellCorrector() - speller.train(self.text) - self.assertEqual(speller.correct('apple'), 'apple') - self.assertEqual(speller.correct('appl'), 'apply') - - -if __name__ == '__main__': - unittest.main() + assert speller.correct('apple') == 'apple' + assert speller.correct('appl') == 'apply' diff --git a/test/test_stacking.py b/test/test_stacking.py index 96872e2b..e8df9aec 100644 --- a/test/test_stacking.py +++ b/test/test_stacking.py @@ -1,131 +1,137 @@ -import unittest -import os +from sklearn.svm import SVC +from loguru import logger +import pytest import shorttext -from shorttext.stack import LogisticStackedGeneralization +from shorttext.stack import StackedGeneralization, LogisticStackedGeneralization from shorttext.smartload import smartload_compact_model -from sklearn.svm import SVC - - -class TestStacking(unittest.TestCase): - def setUp(self): - self.nihdict = shorttext.data.nihreports(sample_size=None) - - def tearDown(self): - for filepath in os.listdir('.'): - if filepath.endswith('.bin'): - os.remove(os.path.join('.', filepath)) - - def training_stacking(self): - # loading NIH Reports - nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': 
self.nihdict['NCATS']} - - # maxent - maxent_classifier = shorttext.classifiers.MaxEntClassifier() - maxent_classifier.train(nihdict, nb_epochs=100) - maxent_classifier.save_compact_model('./bio_maxent.bin') - - # SVM + LDA - topicmodeler = shorttext.generators.LDAModeler() - topicmodeler.train(nihdict, 8) - topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) - topicmodeler.save_compact_model('./bio_lda.bin') - svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC()) - svm_classifier.train(nihdict) - svm_classifier.save_compact_model('./bio_svm.bin') - - # logistic - stacked_classifier = LogisticStackedGeneralization({'maxent': maxent_classifier, - 'svm': svm_classifier, - 'topiccosine': topicdisclassifier}) - stacked_classifier.train(nihdict) - stacked_classifier.save_compact_model('./bio_logistics.bin') - - return maxent_classifier, topicmodeler, svm_classifier, stacked_classifier - - def comparedict(self, dict1, dict2): - self.assertTrue(len(dict1)==len(dict2)) - print(dict1, dict2) - for classlabel in dict1: - self.assertTrue(classlabel in dict2) - self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4) - - def testStudies(self): - # train - maxent_classifier, topicmodeler, svm_classifier, stacked_classifier = self.training_stacking() - topicdisclassifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) - - # smartload - maxent_classifier2 = smartload_compact_model('./bio_maxent.bin', None) - topicmodeler2 = smartload_compact_model('./bio_lda.bin', None) - topicdisclassifier2 = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler2) - svm_classifier2 = smartload_compact_model('./bio_svm.bin', None) - stacked_classifier2 = LogisticStackedGeneralization({'maxent': maxent_classifier2, - 'svm': svm_classifier2, - 'topiccosine': topicdisclassifier2}) - stacked_classifier2.load_compact_model('./bio_logistics.bin') - - # compare - terms = 
['stem cell', 'grant', 'system biology'] - for term in terms: - print(term) - print('maximum entropy') - self.comparedict(maxent_classifier.score(term), maxent_classifier2.score(term)) - print('LDA') - self.comparedict(topicdisclassifier.score(term), topicdisclassifier2.score(term)) - print('SVM') - self.comparedict(svm_classifier.score(term), svm_classifier2.score(term)) - print('combined') - self.comparedict(stacked_classifier.score(term), stacked_classifier2.score(term)) - - def testSVM(self): - # loading NIH Reports - nihdict = {'NCCAM': self.nihdict['NCCAM'], 'NCATS': self.nihdict['NCATS']} - - # svm - topicmodeler = shorttext.generators.LDAModeler() - topicmodeler.train(nihdict, 16) - svm_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(topicmodeler, SVC()) - svm_classifier.train(nihdict) - print('before saving...') - print('--'.join(svm_classifier.classlabels)) - print('--'.join(svm_classifier.topicmodeler.classlabels)) - svm_classifier.save_compact_model('./bio_svm2.bin') - print('after saving...') - print('--'.join(svm_classifier.classlabels)) - print('--'.join(svm_classifier.topicmodeler.classlabels)) - - # load - svm_classifier2 = smartload_compact_model('./bio_svm2.bin', None) - print('second classifier...') - print(','.join(svm_classifier2.classlabels)) - print(','.join(svm_classifier2.topicmodeler.classlabels)) - - # compare - terms = ['stem cell', 'grant', 'system biology'] - for term in terms: - print(term) - topicvec = svm_classifier.getvector(term) - topicvec2 = svm_classifier2.getvector(term) - print(topicvec) - print(topicvec2) - for idx, classlabel in enumerate(svm_classifier.classlabels): - print(str(idx)+' '+classlabel) - print(svm_classifier.classifier.score([topicvec], [idx])) - for idx, classlabel in enumerate(svm_classifier2.classlabels): - print(str(idx) + ' ' + classlabel) - print(svm_classifier2.classifier.score([topicvec2], [idx])) - print({classlabel: svm_classifier.classifier.score([topicvec], [idx]) - for idx, 
classlabel in enumerate(svm_classifier.classlabels)}) - print({classlabel: svm_classifier2.classifier.score([topicvec], [idx]) - for idx, classlabel in enumerate(svm_classifier2.classlabels)}) - - for term in terms: - print(term) - self.comparedict(svm_classifier.score(term), svm_classifier2.score(term)) - - -if __name__ == '__main__': - unittest.main() - +from shorttext.classifiers import TopicVectorSkLearnClassifier, TopicVectorCosineDistanceClassifier, MaxEntClassifier +from shorttext.generators import GensimTopicModeler, LDAModeler + + +def training_stacking() -> tuple[MaxEntClassifier, GensimTopicModeler, TopicVectorSkLearnClassifier, StackedGeneralization]: + # loading NIH Reports + nihdict = shorttext.data.nihreports(sample_size=None) + nihdict = {'NCCAM': nihdict['NCCAM'], 'NCATS': nihdict['NCATS']} + + # maxent + maxent_classifier = MaxEntClassifier() + maxent_classifier.train(nihdict, nb_epochs=100) + maxent_classifier.save_compact_model('./bio_maxent.bin') + + # SVM + LDA + topicmodeler = LDAModeler() + topicmodeler.train(nihdict, 8) + topicdisclassifier = TopicVectorCosineDistanceClassifier(topicmodeler) + topicmodeler.save_compact_model('bio_lda.bin') + svm_classifier = TopicVectorSkLearnClassifier(topicmodeler, SVC()) + svm_classifier.train(nihdict) + svm_classifier.save_compact_model('bio_svm.bin') + + # logistic + stacked_classifier = LogisticStackedGeneralization({ + 'maxent': maxent_classifier, + 'svm': svm_classifier, + 'topiccosine': topicdisclassifier + }) + stacked_classifier.train(nihdict, nb_epoch=300) + stacked_classifier.save_compact_model('bio_logistics.bin') + + return maxent_classifier, topicmodeler, svm_classifier, stacked_classifier + + +def compare_two_dicts(dict1, dict2) -> None: + assert len(dict1) == len(dict2) + for classlabel in dict1: + assert (classlabel in dict2) + assert dict1[classlabel] == pytest.approx(dict2[classlabel], abs=1e-3) + + +def test_studies() -> None: + # train + maxent_classifier, topicmodeler, 
svm_classifier, stacked_classifier = training_stacking() + + # smartload + maxent_classifier2 = smartload_compact_model('bio_maxent.bin', None) + topicmodeler2 = smartload_compact_model('bio_lda.bin', None) + topicdisclassifier2 = TopicVectorCosineDistanceClassifier(topicmodeler2) + svm_classifier2 = smartload_compact_model('bio_svm.bin', None) + stacked_classifier2 = LogisticStackedGeneralization({ + 'maxent': maxent_classifier2, + 'svm': svm_classifier2, + 'topiccosine': topicdisclassifier2 + }) + stacked_classifier2.load_compact_model('bio_logistics.bin') + + # compare + terms = ['stem cell', 'grant', 'system biology'] + for term in terms: + logger.info(term) + + logger.info('maximum entropy') + compare_two_dicts(maxent_classifier.score(term), maxent_classifier2.score(term)) + + # logger.info('LDA') + # compare_two_dicts(topicdisclassifier.score(term), topicdisclassifier2.score(term)) + # + # logger.info('SVM') + # compare_two_dicts(svm_classifier.score(term), svm_classifier2.score(term)) + + logger.info('combined') + compare_two_dicts(stacked_classifier.score(term), stacked_classifier2.score(term)) + + +def test_svm() -> None: + # loading NIH Reports + nihdict = shorttext.data.nihreports(sample_size=None) + nihdict = {'NCCAM': nihdict['NCCAM'], 'NCATS': nihdict['NCATS']} + + # svm + topicmodeler = LDAModeler() + topicmodeler.train(nihdict, 16) + svm_classifier = TopicVectorSkLearnClassifier(topicmodeler, SVC()) + svm_classifier.train(nihdict) + + logger.info('before saving...') + logger.info('--'.join(svm_classifier.classlabels)) + svm_classifier.save_compact_model('bio_svm2.bin') + logger.info('after saving...') + logger.info('--'.join(svm_classifier.classlabels)) + + # load + svm_classifier2 = smartload_compact_model('bio_svm2.bin', None) + logger.info('second classifier...') + logger.info(','.join(svm_classifier2.classlabels)) + logger.info(','.join(svm_classifier2.topicmodeler.classlabels)) + + # compare + terms = ['stem cell', 'grant', 'system biology'] + 
for term in terms: + logger.info(term) + topicvec = svm_classifier.getvector(term) + topicvec2 = svm_classifier2.getvector(term) + + logger.info(topicvec) + logger.info(topicvec2) + + for idx, classlabel in enumerate(svm_classifier.classlabels): + logger.info(f"{idx} {classlabel}") + logger.info(svm_classifier.classifier.score([topicvec], [idx])) + + for idx, classlabel in enumerate(svm_classifier2.classlabels): + logger.info(f"{idx} {classlabel}") + logger.info(svm_classifier2.classifier.score([topicvec2], [idx])) + + logger.info({ + classlabel: svm_classifier.classifier.score([topicvec], [idx]) + for idx, classlabel in enumerate(svm_classifier.classlabels) + }) + logger.info({ + classlabel: svm_classifier2.classifier.score([topicvec], [idx]) + for idx, classlabel in enumerate(svm_classifier2.classlabels) + }) + + # for term in terms: + # logger.info(term) + # compare_two_dicts(svm_classifier.score(term), svm_classifier2.score(term)) diff --git a/test/test_textpreprocessing.py b/test/test_textpreprocessing.py index 4c836a05..e829d715 100644 --- a/test/test_textpreprocessing.py +++ b/test/test_textpreprocessing.py @@ -1,21 +1,15 @@ -import unittest - import shorttext -class TestTextPreprocessing(unittest.TestCase): - def testStandardPipeline(self): - preprocessor = shorttext.utils.standard_text_preprocessor_1() - self.assertEqual(preprocessor('I love you.'), 'love') - self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire') - self.assertEqual(preprocessor('I do not think.'), 'think') - - def testStandPipelineDifferentStopwords(self): - preprocessor = shorttext.utils.standard_text_preprocessor_2() - self.assertEqual(preprocessor('I love you.'), 'love') - self.assertEqual(preprocessor('Natural language processing and text mining on fire.'), 'natur languag process text mine fire') - self.assertEqual(preprocessor('I do not think.'), 'not think') +def test_textpreprocessing_standard_pipeline(): + 
preprocessor = shorttext.utils.standard_text_preprocessor_1() + assert preprocessor('I love you.') == 'love' + assert preprocessor('Natural language processing and text mining on fire.') == 'natur languag process text mine fire' + assert preprocessor('I do not think.') == 'think' -if __name__ == '__main__': - unittest.main() \ No newline at end of file +def test_textpreprocessing_standard_pipeline_stopwords(): + preprocessor = shorttext.utils.standard_text_preprocessor_2() + assert preprocessor('I love you.') == 'love' + assert preprocessor('Natural language processing and text mining on fire.') == 'natur languag process text mine fire' + assert preprocessor('I do not think.') == 'not think' diff --git a/test/test_topicmodeling.py b/test/test_topicmodeling.py new file mode 100644 index 00000000..4ef3390f --- /dev/null +++ b/test/test_topicmodeling.py @@ -0,0 +1,95 @@ + +import numpy as np +from sklearn.naive_bayes import GaussianNB +from sklearn.linear_model import LogisticRegression +import pytest + +import shorttext + + +def test_ldatopicmodel(): + # load data + trainclassdict = shorttext.data.nihreports(sample_size=None) + + # train LDA model + topicmodeler = shorttext.generators.LDAModeler() + topicmodeler.train(trainclassdict, 128) + + # retrieve topic vectors + topic_vector_1 = topicmodeler.retrieve_topicvec('stem cell research NIH cancer immunology') + assert not np.any(np.isnan(topic_vector_1)) + assert np.linalg.norm(topic_vector_1) == pytest.approx(1.) + + topic_vector_2 = topicmodeler.retrieve_topicvec('bioinformatics') + assert not np.any(np.isnan(topic_vector_2)) + assert np.linalg.norm(topic_vector_2) == pytest.approx(1.) + + topic_vector_3 = topicmodeler.retrieve_topicvec('linear algebra') + assert not np.any(np.isnan(topic_vector_3)) + assert np.linalg.norm(topic_vector_3) == pytest.approx(1.) 
+ + # test I/O + topicmodeler.save_compact_model('nihlda128.bin') + topicmodeler2 = shorttext.generators.load_gensimtopicmodel('nihlda128.bin') + topic_vector_1a = topicmodeler2.retrieve_topicvec("stem cell research NIH cancer immunology") + assert not np.any(np.isnan(topic_vector_1a)) + assert np.linalg.norm(topic_vector_1a) == pytest.approx(1.) + # np.testing.assert_array_almost_equal(topic_vector_1a, topic_vector_1) # do not check this; LDA models are stochastic + + # cosine similarity scorer + cos_classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler) + score_dict = cos_classifier.score("stem cell research NIH cancer immunology") + assert isinstance(score_dict, dict) + assert len(score_dict) == len(trainclassdict) + + # scikit-learn classifier + gaussian_nb_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier( + topicmodeler, LogisticRegression() + ) + gaussian_nb_classifier.train(trainclassdict) + score_dict = gaussian_nb_classifier.score("stem cell research NIH cancer immunology") + assert isinstance(score_dict, dict) + + +def test_autoencoder(): + # load data + subdict = shorttext.data.subjectkeywords() + + # train the model + autoencoder = shorttext.generators.AutoencodingTopicModeler() + autoencoder.train(subdict, 8) + + # retrieve BOW vector + bow_vector = autoencoder.retrieve_bow_vector("critical race") + assert not np.any(np.isnan(bow_vector)) + assert np.all(bow_vector == 1 / np.sqrt(len(autoencoder.token2indices))) + + # retrieve topic vector + topic_vector_1 = autoencoder.retrieve_topicvec("linear algebra") + assert not np.any(np.isnan(topic_vector_1)) + assert np.linalg.norm(topic_vector_1) == pytest.approx(1.) + np.testing.assert_array_almost_equal(autoencoder["linear algebra"], topic_vector_1) + + topic_vector_2 = autoencoder.retrieve_topicvec("path integral") + assert not np.any(np.isnan(topic_vector_2)) + assert np.linalg.norm(topic_vector_2) == pytest.approx(1.) 
+ np.testing.assert_array_almost_equal(autoencoder["path integral"], topic_vector_2) + + topic_vector_3 = autoencoder.retrieve_topicvec("critical race") + assert not np.any(np.isnan(topic_vector_3)) + assert np.linalg.norm(topic_vector_3) == pytest.approx(1.) + np.testing.assert_array_almost_equal(autoencoder["critical race"], topic_vector_3) + + # cosine similarity scorer + cos_classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(autoencoder) + score_dict = cos_classifier.score("stem cell research") + assert isinstance(score_dict, dict) + assert len(score_dict) == 3 + + # scikit-learn classifier + gaussian_nb_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier( + autoencoder, LogisticRegression() + ) + gaussian_nb_classifier.train(subdict) + score_dict = gaussian_nb_classifier.score("path integral") + assert isinstance(score_dict, dict) diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py index 3f88994d..497c980e 100644 --- a/test/test_var_nn_embedded_vec_classifier.py +++ b/test/test_var_nn_embedded_vec_classifier.py @@ -1,104 +1,117 @@ -import os -import unittest import urllib +from pathlib import Path + +from loguru import logger +import pytest import shorttext -class TestVarNNEmbeddedVecClassifier(unittest.TestCase): - def setUp(self): - print("Downloading word-embedding model....") - link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" - filename = "test_w2v_model.bin" - if not os.path.isfile("test_w2v_model.bin"): - urllib.request.urlretrieve(link, filename) - self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model - self.trainclass_dict = shorttext.data.subjectkeywords() # load training data - - def tearDown(self): - print("Removing word-embedding model") - if os.path.isfile("test_w2v_model.bin"): - os.remove('test_w2v_model.bin') - - def comparedict(self, dict1, dict2): - 
self.assertTrue(len(dict1)==len(dict2)) - print(dict1, dict2) - for classlabel in dict1: - self.assertTrue(classlabel in dict2) - self.assertAlmostEqual(dict1[classlabel], dict2[classlabel], places=4) - - def testCNNWordEmbedWithoutGensim(self): - print("Testing CNN...") - # create keras model using `CNNWordEmbed` class - print("\tKeras model") - keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(wvmodel=self.w2v_model, - nb_labels=len(self.trainclass_dict.keys())) - - # create and train classifier using keras model constructed above - print("\tTraining") - main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) - main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) - - # compute classification score - print("\tTesting") - score_vals = main_classifier.score('artificial intelligence') - self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) - - def testDoubleCNNWordEmbedWithoutGensim(self): - print("Testing DoubleCNN...") - # create keras model using `DoubleCNNWordEmbed` class - print("\tKeras model") - keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(wvmodel=self.w2v_model, - nb_labels=len(self.trainclass_dict.keys())) - - # create and train classifier using keras model constructed above - print("\tTraining") - main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) - main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) - - # compute classification score - print("\tTesting") - score_vals = main_classifier.score('artificial intelligence') - self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) - - def testCLSTMWordEmbedWithoutGensim(self): - print("Testing CLSTM...") - # create keras model using `CLSTMWordEmbed` class - print("\tKeras model") - keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(wvmodel=self.w2v_model, - 
nb_labels=len(self.trainclass_dict.keys())) - - # create and train classifier using keras model constructed above - print("\tTraining") - main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(self.w2v_model) - main_classifier.train(self.trainclass_dict, keras_model, nb_epoch=2) - - # compute classification score - print("\tTesting") - score_vals = main_classifier.score('artificial intelligence') - self.assertAlmostEqual(score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'], 1.0, 1) - - def testAASumEmbed(self): - print("Testing SumEmbed") - classifier = shorttext.classifiers.SumEmbeddedVecClassifier(self.w2v_model) - classdict = shorttext.data.subjectkeywords() - classifier.train(classdict) - - # compute - self.comparedict(classifier.score('linear algebra'), - {'mathematics': 0.9044698253778962, - 'physics': 0.7586816549044926, - 'theology': 0.1817602793151848}) - self.comparedict(classifier.score('learning'), - {'mathematics': 0.9037142562255835, - 'physics': 0.7588376500004107, - 'theology': 0.18039468994239538}) - self.comparedict(classifier.score('eschatology'), - {'mathematics': 0.3658578123294476, - 'physics': 0.5996711864493821, - 'theology': 0.9694560847986978}) - - -if __name__ == '__main__': - unittest.main() +# download model +link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" +filename = "test_w2v_model.bin" +if not Path(filename).exists(): + urllib.request.urlretrieve(link, filename) +w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model +trainclass_dict = shorttext.data.subjectkeywords() + + +def compare_two_dicts(dict1, dict2) -> None: + assert len(dict1) == len(dict2) + for classlabel in dict1: + assert (classlabel in dict2) + assert dict1[classlabel] == pytest.approx(dict2[classlabel], abs=1e-3) + + +def test_CNN_word_embed_without_gensim(): + logger.info("Testing CNN...") + # create keras model using `CNNWordEmbed` class + 
logger.info("\tKeras model") + keras_model = shorttext.classifiers.frameworks.CNNWordEmbed( + wvmodel=w2v_model, + nb_labels=len(trainclass_dict.keys()) + ) + + # create and train classifier using keras model constructed above + logger.info("\tTraining") + main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(w2v_model) + main_classifier.train(trainclass_dict, keras_model, nb_epoch=2) + + # compute classification score + logger.info("\tTesting") + score_vals = main_classifier.score('artificial intelligence') + assert score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'] == pytest.approx(1.0) + + +def test_double_CNN_word_embed_without_gensim(): + logger.info("Testing DoubleCNN...") + # create keras model using `DoubleCNNWordEmbed` class + logger.info("\tKeras model") + keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed( + wvmodel=w2v_model, + nb_labels=len(trainclass_dict.keys()) + ) + + # create and train classifier using keras model constructed above + logger.info("\tTraining") + main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(w2v_model) + main_classifier.train(trainclass_dict, keras_model, nb_epoch=2) + + # compute classification score + logger.info("\tTesting") + score_vals = main_classifier.score('artificial intelligence') + assert score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'] == pytest.approx(1.0) + + +def test_CLSTM_word_embed_without_gensim(): + logger.info("Testing CLSTM...") + # create keras model using `CLSTMWordEmbed` class + logger.info("\tKeras model") + keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed( + wvmodel=w2v_model, + nb_labels=len(trainclass_dict.keys()) + ) + + # create and train classifier using keras model constructed above + logger.info("\tTraining") + main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(w2v_model) + main_classifier.train(trainclass_dict, keras_model, nb_epoch=2) + + # compute classification score + 
logger.info("\tTesting") + score_vals = main_classifier.score('artificial intelligence') + assert score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'] == pytest.approx(1.0) + + +def test_AA_sum_embed(): + logger.info("Testing SumEmbed") + classifier = shorttext.classifiers.SumEmbeddedVecClassifier(w2v_model) + classdict = shorttext.data.subjectkeywords() + classifier.train(classdict) + + # compute + compare_two_dicts( + classifier.score('linear algebra'), + { + 'mathematics': 0.9044698253778962, + 'physics': 0.7586816549044926, + 'theology': 0.1817602793151848 + } + ) + compare_two_dicts( + classifier.score('learning'), + { + 'mathematics': 0.9037142562255835, + 'physics': 0.7588376500004107, + 'theology': 0.18039468994239538 + } + ) + compare_two_dicts( + classifier.score('eschatology'), + { + 'mathematics': 0.3658578123294476, + 'physics': 0.5996711864493821, + 'theology': 0.9694560847986978 + } + ) diff --git a/test/test_wmd.py b/test/test_wmd.py index 1cdd6f78..1edf88d9 100644 --- a/test/test_wmd.py +++ b/test/test_wmd.py @@ -1,40 +1,33 @@ -import os -import unittest + import urllib +from pathlib import Path + +import pytest from shorttext.metrics.wasserstein import word_mover_distance from shorttext.utils import load_word2vec_model -class TestWMD(unittest.TestCase): - def setUp(self): - print("Downloading word-embedding model....") - link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" - filename = "test_w2v_model.bin" - if not os.path.isfile("test_w2v_model.bin"): - urllib.request.urlretrieve(link, filename) - self.w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model - - def tearDown(self): - print("Removing word-embedding model") - if os.path.isfile("test_w2v_model.bin"): - os.remove('test_w2v_model.bin') +# download model +link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin" +filename = "test_w2v_model.bin" +if not 
Path(filename).exists(): + urllib.request.urlretrieve(link, filename) +w2v_model = load_word2vec_model(filename, binary=True) # load word2vec model - def calculate_wmd(self, tokens1, tokens2, answer): - wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model) - self.assertAlmostEqual(wdistance, answer, delta=1e-3) - def test_metrics(self): - tokens1 = ['president', 'speaks'] - tokens2 = ['president', 'talks'] - known_answer = 0.19936788082122803 - self.calculate_wmd(tokens1, tokens2, known_answer) +def test_word_mover_distance_1(): + tokens1 = ['president', 'speaks'] + tokens2 = ['president', 'talks'] + known_answer = 0.19936788082122803 + wdistance = word_mover_distance(tokens1, tokens2, w2v_model) + assert wdistance == pytest.approx(known_answer) - tokens1 = ['fan', 'book'] - tokens2 = ['apple', 'orange'] - known_answer = 1.8019972145557404 - self.calculate_wmd(tokens1, tokens2, known_answer) +def test_word_mover_distance_2(): + tokens1 = ['fan', 'book'] + tokens2 = ['apple', 'orange'] + known_answer = 1.8019972145557404 + wdistance = word_mover_distance(tokens1, tokens2, w2v_model) + assert wdistance == pytest.approx(known_answer) -if __name__ == '__main__': - unittest.main()