diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 00000000..1674f36c
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,55 @@
+version: 2
+
+
+shared: &shared
+  working_directory: ~/shorttext
+
+  steps:
+    - checkout
+
+    - run:
+        name: Apt Install
+        command: |
+          sudo apt-get update
+          sudo apt-get install libc6
+          sudo apt-get install python3-dev
+          sudo apt-get install -y g++
+
+    - run:
+        name: Installing Packages
+        command: |
+          pip install --upgrade --user pip
+          pip install --upgrade --user google-compute-engine
+          pip install --user .
+
+    - run:
+        name: Run Unit Tests
+        command: |
+          pip install --user .[test]
+          pytest
+
+
+jobs:
+  py311:
+    <<: *shared
+    docker:
+      - image: cimg/python:3.11
+
+  py312:
+    <<: *shared
+    docker:
+      - image: cimg/python:3.12
+
+  py313:
+    <<: *shared
+    docker:
+      - image: cimg/python:3.13
+
+
+workflows:
+  version: 2
+  build:
+    jobs:
+      - py311
+      - py312
+      - py313
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
new file mode 100644
index 00000000..36a8f562
--- /dev/null
+++ b/.github/workflows/publish-to-pypi.yml
@@ -0,0 +1,33 @@
+name: Publish to PyPI
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  publish-to-pypi:
+    name: Publish to PyPI
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.10"
+
+      - name: Install build dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+
+      - name: Build package
+        run: python -m build
+
+      - name: Publish package to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          skip-existing: true
diff --git a/.gitignore b/.gitignore
index 2b50e0a7..e0b5b946 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,60 +1,5 @@
-
-# Created by https://www.gitignore.io/api/python,pycharm
-
-### PyCharm ###
-# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
-# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
-
-# User-specific stuff:
-.idea/**/workspace.xml
-.idea/**/tasks.xml
-
-# Sensitive or high-churn files:
-.idea/**/dataSources/
-.idea/**/dataSources.ids
-.idea/**/dataSources.xml
-.idea/**/dataSources.local.xml
-.idea/**/sqlDataSources.xml
-.idea/**/dynamic.xml
-.idea/**/uiDesigner.xml
-.idea/inspectionProfiles/*.xml
-
-# Gradle:
-.idea/**/gradle.xml
-.idea/**/libraries
-
-# Mongo Explorer plugin:
-.idea/**/mongoSettings.xml
-
-## File-based project format:
-*.iws
-
-## Plugin-specific files:
-
-# IntelliJ
-/out/
-
-# mpeltonen/sbt-idea plugin
-.idea_modules/
-
-# JIRA plugin
-atlassian-ide-plugin.xml
-
-# Crashlytics plugin (for Android Studio and IntelliJ)
-com_crashlytics_export_strings.xml
-crashlytics.properties
-crashlytics-build.properties
-fabric.properties
-
-### PyCharm Patch ###
-# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
-
-# *.iml
-# modules.xml
-# .idea/misc.xml
-# *.ipr
-
-### Python ###
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -65,7 +10,6 @@ __pycache__/
 
 # Distribution / packaging
 .Python
-env/
 build/
 develop-eggs/
 dist/
@@ -78,9 +22,12 @@ parts/
 sdist/
 var/
 wheels/
+pip-wheel-metadata/
+share/python-wheels/
*.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -95,13 +42,17 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml -*,cover +*.cover +*.py,cover .hypothesis/ +.pytest_cache/ +cover/ # Translations *.mo @@ -110,6 +61,8 @@ coverage.xml # Django stuff: *.log local_settings.py +db.sqlite3 +db.sqlite3-journal # Flask stuff: instance/ @@ -122,29 +75,840 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints +# IPython +profile_default/ +ipython_config.py + # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ -# celery beat schedule file +# Celery stuff celerybeat-schedule +celerybeat.pid -# dotenv -.env +# SageMath parsed files +*.sage.py -# virtualenv +# Environments +.env .venv +env/ venv/ ENV/ +env.bak/ +venv.bak/ # Spyder project settings .spyderproject +.spyproject # Rope project settings .ropeproject -# End of https://www.gitignore.io/api/python,pycharm \ No newline at end of file +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +### Emacs template +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +### C template +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. 
Windows DLLs) +*.dll +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +### JupyterNotebooks template +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +*/.ipynb_checkpoints/* + +# IPython + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +.idea + +### C++ template +# Prerequisites + +# Compiled Object files +*.slo + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai + +# Executables + +### Linux template + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### ArchLinuxPackages template +*.tar +*.tar.* +*.jar +*.msi +*.zip +*.tgz +*.log.* +*.sig + +pkg/ + +### Fortran template +# Prerequisites + +# Compiled Object files + +# Precompiled Headers + +# Compiled Dynamic libraries + +# Fortran module files + +# Compiled Static libraries + +# Executables + +### macOS template +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### VisualStudio template +## Ignore Visual Studio temporary files, build results, and +## files generated by popular 
Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.meta +*.iobj +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*[.json, .xml, .info] + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +### CUDA template +*.i +*.ii +*.gpu +*.ptx +*.cubin +*.fatbin + +### Eclipse template +.metadata +bin/ +tmp/ +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. 
+# Typically, this file would be tracked if it contains build/dependency configurations: +#.project + +### Windows template +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +### VisualStudioCode template +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + +### Xcode template +# Xcode +# +# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +## User settings +xcuserdata/ + +## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9) +*.xcscmblueprint +*.xccheckout + +## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4) +DerivedData/ +*.moved-aside +*.pbxuser +!default.pbxuser +*.mode1v3 +!default.mode1v3 +*.mode2v3 +!default.mode2v3 +*.perspectivev3 +!default.perspectivev3 + +## Gcc Patch +/*.gcno diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..da10187e --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,31 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +# Build documentation with MkDocs +#mkdocs: +# configuration: mkdocs.yml + +# Optionally build your docs in additional formats such as PDF and ePub +formats: all + +# Optionally set the version of Python and requirements required to build your docs +python: + install: + - requirements: docs/requirements_minimal.txt + +# conda environment +#conda: +# environment: environment.yml \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 31ed28ef..00000000 --- a/.travis.yml +++ /dev/null @@ -1,16 +0,0 @@ -language: python -python: - - "2.7" -before_install: - - wget 'http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh' -O miniconda.sh - - chmod +x miniconda.sh - - ./miniconda.sh -b - - export PATH=/home/travis/miniconda2/bin:$PATH - - conda update --yes conda -install: - - conda create --yes -n shorttext-test python=$TRAVIS_PYTHON_VERSION pip numpy scipy - - source activate shorttext-test - - pip install unittest2 - - pip install pytest - - pip install -U . 
-script: python shorttext_tests.py
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..e2a3f566
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2016 Kwan Yuet Stephen Ho
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
index 17bc2ad2..4a6b9900 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,6 @@
 include README.md
-include shorttext/data/shorttext_exampledata.csv
-include shorttext/utils/stopwordset.pkl
\ No newline at end of file
+include LICENSE
+include pyproject.toml
+include src/shorttext/data/shorttext_exampledata.csv
+include src/shorttext/utils/stopwords.txt
+include src/shorttext/utils/nonneg_stopwords.txt
diff --git a/README.md b/README.md
index 5a9d8661..e761efb7 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,156 @@
-# Short Text Categorization in Python
+# Short Text Mining in Python
 
-This repository is a collection of algorithms for multi-class classification to short texts using Python. Modules are backward compatible unless otherwise specified. Feel free to give suggestions.
+[![CircleCI](https://circleci.com/gh/stephenhky/PyShortTextCategorization.svg?style=svg)](https://circleci.com/gh/stephenhky/PyShortTextCategorization)
+[![GitHub release](https://img.shields.io/github/release/stephenhky/PyShortTextCategorization.svg?maxAge=3600)](https://github.com/stephenhky/PyShortTextCategorization/releases)
+[![Documentation Status](https://readthedocs.org/projects/shorttext/badge/?version=latest)](https://shorttext.readthedocs.io/en/latest/?badge=latest)
+[![Updates](https://pyup.io/repos/github/stephenhky/PyShortTextCategorization/shield.svg)](https://pyup.io/repos/github/stephenhky/PyShortTextCategorization/)
+[![Python 3](https://pyup.io/repos/github/stephenhky/PyShortTextCategorization/python-3-shield.svg)](https://pyup.io/repos/github/stephenhky/PyShortTextCategorization/)
+[![pypi](https://img.shields.io/pypi/v/shorttext.svg?maxAge=3600)](https://pypi.org/project/shorttext/)
+[![download](https://img.shields.io/pypi/dm/shorttext.svg?maxAge=2592000&label=installs&color=%2327B1FF)](https://pypi.org/project/shorttext/)
+[![stars](https://img.shields.io/github/stars/stephenhky/PyShortTextCategorization.svg?style=social&label=Star&maxAge=60)](https://github.com/stephenhky/PyShortTextCategorization)
+
+## Introduction
+
+The Python package `shorttext` facilitates supervised and unsupervised
+learning for short text categorization. Due to the sparseness of words and
+the lack of information carried in the short texts themselves, an intermediate
+representation of the texts and documents is needed before they are put into
+any classification algorithm. This package provides various types
+of these representations, including topic modeling and word-embedding algorithms.
+
+The package `shorttext` runs on Python 3.9, 3.10, 3.11, 3.12, and 3.13.
+Characteristics:
+
+- example data provided (including subject keywords and NIH RePORT);
+- text preprocessing;
+- pre-trained word-embedding support;
+- `gensim` topic models (LDA, LSI, Random Projections) and autoencoder;
+- topic model representation supported for supervised learning using `scikit-learn`;
+- cosine distance classification;
+- neural network classification (including ConvNet and C-LSTM);
+- maximum entropy classification;
+- metrics of phrase differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD);
+- character-level sequence-to-sequence (seq2seq) learning; and
+- spell correction.
+
+## Documentation
+
+Documentation and tutorials for `shorttext` can be found here: [http://shorttext.rtfd.io/](http://shorttext.rtfd.io/).
+
+See [tutorial](http://shorttext.readthedocs.io/en/latest/tutorial.html) for how to use the package, and [FAQ](https://shorttext.readthedocs.io/en/latest/faq.html).
+
+## Installation
 
 To install it, in a console, use `pip`.
 
 ```
->>> pip install -U shorttext
+>>> pip install shorttext
 ```
 
-or, if you want the most updated code that is not released on PyPI yet, type
+or, if you want the most recent development version on Github, type
 
 ```
->>> pip install -U git+https://github.com/stephenhky/PyShortTextCategorization@master
+>>> pip install git+https://github.com/stephenhky/PyShortTextCategorization@master
 ```
 
-Developers are advised to make sure `Keras` >=2 be installed. Users are advised to install the backend `Tensorflow` (preferred) or `Theano` in advance.
+See [installation guide](https://shorttext.readthedocs.io/en/latest/install.html) for more details.
 
-See [tutorial](http://pythonhosted.org/shorttext/tutorial.html) for how to use the package.
 
-# Issues
+## Issues
 
 To report any issues, go to the [Issues](https://github.com/stephenhky/PyShortTextCategorization/issues) tab of the Github page and start a thread. It is welcome for developers to submit pull requests on their own
 to fix any errors.
 
-# Useful Links
+## Contributors
+
+If you would like to contribute, feel free to submit the pull requests to the `develop` branch.
+You can talk to me in advance through e-mails or the [Issues](https://github.com/stephenhky/PyShortTextCategorization/issues) page.
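+
+As a quick orientation, a minimal usage sketch follows. The example-data loader `subjectkeywords()` and the `MaxEntClassifier` class appear in this package's tutorials, but treat the exact signatures as assumptions that may vary between releases.
+
+```python
+import shorttext
+
+# load the example training data shipped with the package
+# (a dictionary mapping each class label to a list of short texts)
+trainclassdict = shorttext.data.subjectkeywords()
+
+# train a maximum-entropy classifier on the labeled short texts
+# (training keyword arguments are assumed; consult the tutorial)
+classifier = shorttext.classifiers.MaxEntClassifier()
+classifier.train(trainclassdict)
+
+# score a new short text against every class label
+scoredict = classifier.score('artificial intelligence')
+for label, score in sorted(scoredict.items(), key=lambda s: s[1], reverse=True):
+    print(label, ':', score)
+```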
-* Documentation : [https://pythonhosted.org/shorttext/](https://pythonhosted.org/shorttext/) +## Useful Links + +* Documentation: [http://shorttext.readthedocs.io](http://shorttext.readthedocs.io/) * Github: [https://github.com/stephenhky/PyShortTextCategorization](https://github.com/stephenhky/PyShortTextCategorization) -* PyPI: [https://pypi.python.org/pypi/shorttext](https://pypi.python.org/pypi/shorttext) +* PyPI: [https://pypi.org/project/shorttext/](https://pypi.org/project/shorttext/) +* "Package shorttext 1.0.0 released," [Medium](https://medium.com/@stephenhky/package-shorttext-1-0-0-released-ca3cb24d0ff3) * "Python Package for Short Text Mining", [WordPress](https://datawarrior.wordpress.com/2016/12/22/python-package-for-short-text-mining/) +* "Document-Term Matrix: Text Mining in R and Python," [WordPress](https://datawarrior.wordpress.com/2018/01/22/document-term-matrix-text-mining-in-r-and-python/) * An [earlier version](https://github.com/stephenhky/PyShortTextCategorization/tree/b298d3ce7d06a9b4e0f7d32f27bab66064ba7afa) of this repository is a demonstration of the following blog post: [Short Text Categorization using Deep Neural Networks and Word-Embedding Models](https://datawarrior.wordpress.com/2016/10/12/short-text-categorization-using-deep-neural-networks-and-word-embedding-models/) -# News +## News + +* 04/19/2026: `shorttext` 4.0.0 released. +* 03/22/2026: `shorttext` 3.1.1 released. +* 03/02/2026: `shorttext` 3.1.0 reelased. +* 10/27/2025: `shorttext` 3.0.1 released. +* 08/10/2025: `shorttext` 3.0.0 released. +* 06/02/2025: `shorttext` 2.2.1 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/)) +* 05/29/2025: `shorttext` 2.2.0 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/)) +* 05/08/2025: `shorttext` 2.1.1 released. +* 12/14/2024: `shorttext` 2.1.0 released. +* 07/12/2024: `shorttext` 2.0.0 released. +* 12/21/2023: `shorttext` 1.6.1 released. +* 08/26/2023: `shorttext` 1.6.0 released. +* 06/19/2023: `shorttext` 1.5.9 released. +* 09/23/2022: `shorttext` 1.5.8 released. +* 09/22/2022: `shorttext` 1.5.7 released. +* 08/29/2022: `shorttext` 1.5.6 released. +* 05/28/2022: `shorttext` 1.5.5 released. +* 12/15/2021: `shorttext` 1.5.4 released. +* 07/11/2021: `shorttext` 1.5.3 released. +* 07/06/2021: `shorttext` 1.5.2 released. +* 04/10/2021: `shorttext` 1.5.1 released. +* 04/09/2021: `shorttext` 1.5.0 released. +* 02/11/2021: `shorttext` 1.4.8 released. +* 01/11/2021: `shorttext` 1.4.7 released. +* 01/03/2021: `shorttext` 1.4.6 released. +* 12/28/2020: `shorttext` 1.4.5 released. +* 12/24/2020: `shorttext` 1.4.4 released. +* 11/10/2020: `shorttext` 1.4.3 released. +* 10/18/2020: `shorttext` 1.4.2 released. +* 09/23/2020: `shorttext` 1.4.1 released. +* 09/02/2020: `shorttext` 1.4.0 released. +* 07/23/2020: `shorttext` 1.3.0 released. +* 06/05/2020: `shorttext` 1.2.6 released. +* 05/20/2020: `shorttext` 1.2.5 released. +* 05/13/2020: `shorttext` 1.2.4 released. +* 04/28/2020: `shorttext` 1.2.3 released. +* 04/07/2020: `shorttext` 1.2.2 released. +* 03/23/2020: `shorttext` 1.2.1 released. +* 03/21/2020: `shorttext` 1.2.0 released. +* 12/01/2019: `shorttext` 1.1.6 released. +* 09/24/2019: `shorttext` 1.1.5 released. +* 07/20/2019: `shorttext` 1.1.4 released. +* 07/07/2019: `shorttext` 1.1.3 released. +* 06/05/2019: `shorttext` 1.1.2 released. +* 04/23/2019: `shorttext` 1.1.1 released. +* 03/03/2019: `shorttext` 1.1.0 released. +* 02/14/2019: `shorttext` 1.0.8 released. +* 01/30/2019: `shorttext` 1.0.7 released. 
+* 01/29/2019: `shorttext` 1.0.6 released. +* 01/13/2019: `shorttext` 1.0.5 released. +* 10/03/2018: `shorttext` 1.0.4 released. +* 08/06/2018: `shorttext` 1.0.3 released. +* 07/24/2018: `shorttext` 1.0.2 released. +* 07/17/2018: `shorttext` 1.0.1 released. +* 07/14/2018: `shorttext` 1.0.0 released. +* 06/18/2018: `shorttext` 0.7.2 released. +* 05/30/2018: `shorttext` 0.7.1 released. +* 05/17/2018: `shorttext` 0.7.0 released. +* 02/27/2018: `shorttext` 0.6.0 released. +* 01/19/2018: `shorttext` 0.5.11 released. +* 01/15/2018: `shorttext` 0.5.10 released. +* 12/14/2017: `shorttext` 0.5.9 released. +* 11/08/2017: `shorttext` 0.5.8 released. +* 10/27/2017: `shorttext` 0.5.7 released. +* 10/17/2017: `shorttext` 0.5.6 released. +* 09/28/2017: `shorttext` 0.5.5 released. +* 09/08/2017: `shorttext` 0.5.4 released. +* 09/02/2017: end of GSoC project. ([Report](https://rare-technologies.com/chinmayas-gsoc-2017-summary-integration-with-sklearn-keras-and-implementing-fasttext/)) +* 08/22/2017: `shorttext` 0.5.1 released. +* 07/28/2017: `shorttext` 0.4.1 released. +* 07/26/2017: `shorttext` 0.4.0 released. * 06/16/2017: `shorttext` 0.3.8 released. * 06/12/2017: `shorttext` 0.3.7 released. * 06/02/2017: `shorttext` 0.3.6 released. @@ -48,6 +165,7 @@ to fix any errors. * 11/25/2016: `shorttext` 0.1.2 released. * 11/21/2016: `shorttext` 0.1.1 released. -# Possible Future Updates +# Acknowledgements -Refer to [UPCOMING.md](UPCOMING.md). +* [Chinmaya Pancholi](https://www.linkedin.com/in/cpancholi/) +* [Minseo Kim](https://kmingseo.github.io/) diff --git a/UPCOMING.md b/UPCOMING.md deleted file mode 100644 index 1e115263..00000000 --- a/UPCOMING.md +++ /dev/null @@ -1,22 +0,0 @@ -Upcoming Updates to `shorttext` -=============================== - -Confirmed Updates ------------------ - -* Maximum entropy models; -* Use of `gensim` Word2Vec `keras` layers. - -Expected Updates ----------------- - -* Incorporating new features from `gensim`; -* Implementation of author-topic model; -* Python 3 compatibility; -* More neural networks; -* More available corpus; -* Generative models; -* Support of seq2seq models; -* Gradual fading-out dependence on `Theano`, and lesser `keras` but more fundamental `Tensorflow`; -* Spelling corrections and fuzzy logic; -* Other word-embedding models. \ No newline at end of file diff --git a/bin/ShortTextCategorizerConsole b/bin/ShortTextCategorizerConsole deleted file mode 100644 index 93157b81..00000000 --- a/bin/ShortTextCategorizerConsole +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python - -# argument parsing -import argparse - -def get_argparser(): - argparser = argparse.ArgumentParser(description='Perform prediction on short text with a given trained model.') - argparser.add_argument('model_filepath', help='Path of the trained (compact) model.') - argparser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model. (None if not needed)') - argparser.add_argument('--topn', type=int, default=10, help='Number of top-scored results displayed. 
(Default: 10)') - return argparser - -argparser = get_argparser() -args = argparser.parse_args() - -allowed_classifiers = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', 'topic_sklearn', - 'nnlibvec', 'sumvec', 'maxent'] -needembedded_classifiers = ['nnlibvec', 'sumvec'] -topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder'] - -# library loading -import os - -import shorttext -from shorttext.utils.classification_exceptions import Word2VecModelNotExistException, AlgorithmNotExistException - -# main block -if __name__ == '__main__': - # check if the model file is given - if not os.path.exists(args.model_filepath): - raise IOError('Model file '+args.model_filepath+' not found!') - - # get the name of the classifier - print 'Retrieving classifier name...' - classifier_name = shorttext.utils.compactmodel_io.get_model_classifier_name(args.model_filepath) - if not (classifier_name in allowed_classifiers): - raise AlgorithmNotExistException(classifier_name) - - # load the Word2Vec model if necessary - wvmodel = None - if classifier_name in needembedded_classifiers: - # check if thw word embedding model is available - if not os.path.exists(args.wv): - raise Word2VecModelNotExistException(args.wv) - # if there, load it - print 'Loading word-embedding model...', args.wv - wvmodel = shorttext.utils.load_word2vec_model(args.wv) - - # load the classifier - print 'Initializing the classifier...' - classifier = None - if classifier_name in topicmodels: - topicmodel = shorttext.smartload_compact_model(args.model_filepath, wvmodel) - classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodel) - else: - classifier = shorttext.smartload_compact_model(args.model_filepath, wvmodel) - - # Initializing the SpaCy kernel - shorttext.utils.textpreprocessing.spaCyNLPHolder.getNLPInstance() - - # Console - run = True - while run: - shorttext = raw_input('text> ') - if len(shorttext) > 0: - scoredict = classifier.score(shorttext) - for label, score in sorted(scoredict.items(), key=lambda s: s[1], reverse=True)[:args.topn]: - print label, ' : ', score - else: - run = False - - print 'Done.' 
\ No newline at end of file diff --git a/bin/ShortTextWord2VecSimilarity b/bin/ShortTextWord2VecSimilarity deleted file mode 100644 index 4ba6a80f..00000000 --- a/bin/ShortTextWord2VecSimilarity +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python - -# argument parsing -import argparse - -def getargparser(): - parser = argparse.ArgumentParser(description='Find the similarity between two short sentences using Word2Vec.') - parser.add_argument('word2vec_modelpath', help='Path of the Word2Vec model') - return parser - -parser = getargparser() -args = parser.parse_args() - -import shorttext -from shorttext.utils import tokenize -from scipy.spatial import distance -from itertools import product -import numpy as np - -class ShortSentenceWord2VecSimilarity: - def __init__(self, modelpath): - self.model = shorttext.utils.load_word2vec_model(modelpath) if modelpath!=None else None - - def sim_words(self, word1, word2): - return 1-distance.cosine(self.model[word1], self.model[word2]) - - def jaccardscore_sents(self, sent1, sent2): - tokens1 = tokenize(sent1) - tokens2 = tokenize(sent2) - tokens1 = filter(lambda w: w in self.model, tokens1) - tokens2 = filter(lambda w: w in self.model, tokens2) - allowable1 = [True]*len(tokens1) - allowable2 = [True]*len(tokens2) - - simdict = dict() - for i, j in product(range(len(tokens1)), range(len(tokens2))): - simdict[(i, j)] = self.sim_words(tokens1[i], tokens2[j]) - - intersection = 0.0 - simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True) - for idxtuple, sim in simdictitems: - i, j = idxtuple - if allowable1[i] and allowable2[j]: - intersection += sim - allowable1[i] = False - allowable2[j] = False - - union = len(tokens1) + len(tokens2) - intersection - - if union > 0: - return intersection / union - elif intersection == 0: - return 1. - else: - return np.inf - -if __name__ == '__main__': - calculator = ShortSentenceWord2VecSimilarity(args.word2vec_modelpath) - end = False - # preload tokenizer - tokenize('Mogu is cute.') - while not end: - sent1 = raw_input('sent1> ') - if len(sent1)==0: - end = True - else: - sent2 = raw_input('sent2> ') - print "Word2Vec Jaccard Score Similarity = ", calculator.jaccardscore_sents(sent1, sent2) \ No newline at end of file diff --git a/bin/switch_kerasbackend b/bin/switch_kerasbackend deleted file mode 100644 index 9372986c..00000000 --- a/bin/switch_kerasbackend +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -# Secret code. Welcome for those who find this code. 
- -# argument parsing -import argparse - -def getargparser(): - parser = argparse.ArgumentParser(description='Switch Keras backend') - parser.add_argument('backend', help="Backend ('theano' or 'tensorflow')") - return parser - -parser = getargparser() -args = parser.parse_args() - -import os -import json - -homedir = os.path.expanduser('~') -kerasconfigfile = os.path.join(homedir, '.keras/keras.json') - -if __name__ == '__main__': - kerasconfig = json.load(open(kerasconfigfile, 'r')) - kerasconfig['backend'] = args.backend - json.dump(kerasconfig, open(kerasconfigfile, 'w')) - print 'Keras backend set to', args.backend \ No newline at end of file diff --git a/data/USInaugural.zip b/data/USInaugural.zip deleted file mode 100644 index eb19e79f..00000000 Binary files a/data/USInaugural.zip and /dev/null differ diff --git a/data/nih_full.csv.zip b/data/nih_full.csv.zip deleted file mode 100644 index 96158d7c..00000000 Binary files a/data/nih_full.csv.zip and /dev/null differ diff --git a/docs/codes.rst b/docs/codes.rst index e57e5921..53c151cb 100644 --- a/docs/codes.rst +++ b/docs/codes.rst @@ -1,137 +1,339 @@ -Documentation -============= +API +=== -Training Data Retrieval ------------------------ +Complete API reference for the shorttext library. -Module `shorttext.data.data_retrieval` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. contents:: + :local: + :backlinks: none -.. automodule:: shorttext.data.data_retrieval +Top-Level Modules +----------------- + +.. automodule:: shorttext :members: + :undoc-members: + :show-inheritance: -Text Preprocessing ------------------- +.. automodule:: shorttext.smartload + :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.textpreprocessing` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Classifiers +----------- -.. automodule:: shorttext.utils.textpreprocessing +.. automodule:: shorttext.classifiers :members: + :undoc-members: + :show-inheritance: -Topic Models ------------- +Base Classifier +^^^^^^^^^^^^^^^ -Module `shorttext.generators.bow.LatentTopicModeling` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.classifiers.base + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.generators.bow.LatentTopicModeling +Bag-of-Words Classifiers +^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: shorttext.classifiers.bow + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.bow.topic.SkLearnClassification + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.bow.topic.TopicVectorDistanceClassification + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification + :members: + :undoc-members: + :show-inheritance: + +Embedding-Based Classifiers +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: shorttext.classifiers.embed + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification :members: + :undoc-members: + :show-inheritance: -Module `shorttext.generators.bow.GensimTopicModeling` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.embed.sumvec.frameworks + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.classifiers.embed.nnlib.frameworks + :members: + :undoc-members: + :show-inheritance: + +Generators +---------- + +.. automodule:: shorttext.generators + :members: + :undoc-members: + :show-inheritance: + +Bag-of-Words Generators +^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: shorttext.generators.bow + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.generators.bow.GensimTopicModeling :members: + :undoc-members: + :show-inheritance: -Module `shorttext.generators.bow.AutoEncodingTopicModeling` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.generators.bow.LatentTopicModeling + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling :members: + :undoc-members: + :show-inheritance: +Sequence-to-Sequence Generators +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Module `shorttext.classifiers.topic.TopicVectorDistanceClassification` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.generators.seq2seq + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.bow.topic.TopicVectorDistanceClassification +.. automodule:: shorttext.generators.seq2seq.s2skeras :members: + :undoc-members: + :show-inheritance: -Module `shorttext.classifiers.topic.SkLearnClassification` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.generators.seq2seq.charbaseS2S + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.bow.topic.SkLearnClassification +Character-Based Generators +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: shorttext.generators.charbase + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.generators.charbase.char2vec :members: + :undoc-members: + :show-inheritance: -Supervised Classification using Word Embedding ----------------------------------------------- +Metrics +------- -Module `shorttext.classifiers.embed.sumvec.SumEmbedVecClassification` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.metrics + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification +.. automodule:: shorttext.metrics.dynprog :members: + :undoc-members: + :show-inheritance: -Module `shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.metrics.dynprog.jaccard + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.embed.sumvec.VarNNSumEmbedVecClassification +.. automodule:: shorttext.metrics.dynprog.dldist :members: + :undoc-members: + :show-inheritance: -Module `shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.metrics.dynprog.lcp + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification +.. automodule:: shorttext.metrics.wasserstein :members: + :undoc-members: + :show-inheritance: -Maximum Entropy Classifiers ---------------------------- +.. 
automodule:: shorttext.metrics.wasserstein.wordmoverdist + :members: + :undoc-members: + :show-inheritance: -Module `shorttext.classifiers.bow.maxent.MaxEntClassification -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.metrics.embedfuzzy + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification +.. automodule:: shorttext.metrics.embedfuzzy.jaccard :members: + :undoc-members: + :show-inheritance: -Neural Networks ---------------- +Spell Correction +---------------- -Module `shorttext.classifiers.embed.sumvec.frameworks` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.spell + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.embed.sumvec.frameworks +.. automodule:: shorttext.spell.basespellcorrector :members: + :undoc-members: + :show-inheritance: -Module `shorttext.classifiers.embed.nnlib.frameworks` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.spell.norvig + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.classifiers.embed.nnlib.frameworks +.. automodule:: shorttext.spell.editor + :members: + :undoc-members: + :show-inheritance: + +Stacking +-------- + +.. automodule:: shorttext.stack + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.stack.stacking + :members: + :undoc-members: + :show-inheritance: + +Data +---- + +.. automodule:: shorttext.data :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.data.data_retrieval + :members: + :undoc-members: + :show-inheritance: Utilities --------- -Module `shorttext.utils.kerasmodel_io` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.utils + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.utils.kerasmodel_io :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.gensim_corpora` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.utils.compactmodel_io + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.utils.gensim_corpora :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.wordembed` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.utils.textpreprocessing + :members: + :undoc-members: + :show-inheritance: .. automodule:: shorttext.utils.wordembed :members: + :undoc-members: + :show-inheritance: -Module `shorttext.utils.compactmodel_io` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.utils.compute + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.utils.compactmodel_io +.. automodule:: shorttext.utils.misc :members: + :undoc-members: + :show-inheritance: -Stacked Generalization ----------------------- +.. automodule:: shorttext.utils.dtm + :members: + :undoc-members: + :show-inheritance: -Module `shorttext.stack` -^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: shorttext.utils.classification_exceptions + :members: + :undoc-members: + :show-inheritance: -.. automodule:: shorttext.stack.stacking +Schemas +------- + +.. automodule:: shorttext.schemas + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.schemas.models + :members: + :undoc-members: + :show-inheritance: + +CLI +--- + +.. automodule:: shorttext.cli :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: shorttext.cli.categorization + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: shorttext.cli.wordembedsim + :members: + :undoc-members: + :show-inheritance: + Home: :doc:`index` \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 744ae2ad..34bc4869 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,19 +18,8 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.append(os.path.abspath('.')) -sys.path.append(os.path.abspath('..')) -sys.path.append(os.path.abspath('../shorttext')) -sys.path.append(os.path.abspath('../shorttext/data')) -sys.path.append(os.path.abspath('../shorttext/utils')) -sys.path.append(os.path.abspath('../shorttext/classifiers')) -sys.path.append(os.path.abspath('../shorttext/classifiers/embed')) -sys.path.append(os.path.abspath('../shorttext/classifiers/embed/autoencode')) -sys.path.append(os.path.abspath('../shorttext/classifiers/embed/sumvec')) -sys.path.append(os.path.abspath('../shorttext/classifiers/embed/nnlib')) -sys.path.append(os.path.abspath('../shorttext/classifiers/bow')) -sys.path.append(os.path.abspath('../shorttext/classifiers/bow/topic')) -sys.path.append(os.path.abspath('../bin')) +sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../src')) # -- General configuration ------------------------------------------------ @@ -41,10 +30,11 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.mathjax' + 'sphinx.ext.mathjax', 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', 'sphinx.ext.intersphinx' ] + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -61,17 +51,17 @@ # General information about the project. project = u'shorttext' -copyright = u'2017, Kwan-Yuet Ho' -author = u'Kwan-Yuet Ho' +copyright = u'2017, Kwan Yuet Stephen Ho' +author = u'Kwan Yuet Stephen Ho' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'0.3' +version = u'4.0' # The full version, including alpha/beta/rc tags. -release = u'0.3.8' +release = u'4.0.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -99,7 +89,7 @@ # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -add_module_names = False +#add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. @@ -117,17 +107,35 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False +# -- Options for Autodoc -------------------------------------------------- + +# Mock imports for heavy dependencies +autodoc_mock_imports = [ + 'tensorflow', 'keras', 'gensim', 'numba', 'joblib' +] + +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'show-inheritance': True, +} + +autodoc_member_order = 'bysource' + # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-html_theme = 'alabaster'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
+html_theme = 'sphinx_rtd_theme'
+
+html_theme_options = {
+    'description': 'Short text classification toolkit',
+    'github_user': 'stephenhky',
+    'github_repo': 'PyShortTextCategorization',
+    'fixed_sidebar': False,
+    'show_related': True,
+}
 
 # Add any paths that contain custom themes here, relative to this directory.
 #html_theme_path = []
@@ -153,6 +161,15 @@
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
 
+html_sidebars = {
+    '**': [
+        'about.html',
+        'navigation.html',
+        'searchbox.html',
+        'related.html',
+    ],
+}
+
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
 # directly to the root of the documentation.
@@ -169,6 +186,10 @@
 # Custom sidebar templates, maps document names to template names.
 #html_sidebars = {}
 
+html_show_sourcelink = True
+html_use_index = True
+html_split_index = False
+
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
 #html_additional_pages = {}
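The `autodoc_mock_imports` setting above is what lets the documentation build without installing TensorFlow, gensim, and the other heavy dependencies: Sphinx registers stand-in modules before autodoc imports `shorttext`. A rough, self-contained illustration of the mechanism (Sphinx uses its own internal mock type; `unittest.mock` is only a stand-in here):

```python
import sys
from unittest import mock

# Register stand-ins for the heavy dependencies before anything imports them;
# this mirrors what Sphinx does for the modules in autodoc_mock_imports.
for heavy in ['tensorflow', 'keras', 'gensim', 'numba', 'joblib']:
    sys.modules.setdefault(heavy, mock.MagicMock())

import tensorflow  # resolves to the stand-in if the real package is absent

print(type(tensorflow))  # <class 'unittest.mock.MagicMock'> when mocked
```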
diff --git a/docs/faq.rst b/docs/faq.rst
index f8b20188..27274134 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -1,59 +1,55 @@
 Frequently Asked Questions (FAQ)
 ================================
 
-1. Can we use Tensorflow backend?
+**Q1. Can we use backends other than TensorFlow?**
 
-Ans: Yes, users can use tensorflow backend instead of theano backend, as both as supported
-by Keras. Refer to `Keras Backend
-`_ for information about switching backends.
+Ans: No.
 
-2. Can we use word-embedding algorithms other than Word2Vec?
 
-Ans: Currently only Word2Vec is directly supported. However, you can
-convert GloVe models into Word2Vec models. See: :doc:`tutorial_wordembed` .
+**Q2. Can we use word-embedding algorithms other than Word2Vec?**
 
-3. Can this package work on Python 3?
+Ans: Yes. Besides Word2Vec, you can use FastText and Poincaré embedding. See: :doc:`tutorial_wordembed` .
 
-Ans: This package is written in Python 2.7. It is not guaranteed that the package works perfectly
-well in Python 3.
 
-4. This package requires SpaCy, which involves loading several models that
-are needed for `shorttext` to run correctly. It gives error whenever I ran
-models that require tokenization. What should I do?
+**Q3. Can this package work on Python 2?**
 
-If your code gives the error message that includes the following:
+Ans: No.
 
-::
 
-   ValueError: Found English model at //anaconda/lib/python2.7/site-packages/spacy/data/en-1.1.0.
-   This model is not compatible with the current version.
-   See https://spacy.io/docs/usage/models to download the new model.
 
-Then run the following command in your terminal or console:
+**Q4. How should I cite `shorttext` if I use it in my research?**
 
-::
+Ans: For the time being, you do not have to cite a particular paper for using this package.
+However, if you use any particular function or class, check out the docstring. If there is a paper (or papers)
+mentioned, cite those papers. For example, if you use `CNNWordEmbed` in `frameworks
+`_,
+according to the docstring, cite Yoon Kim's paper. Refer to this documentation for the reference too.
 
-   python -m spacy download en
 
-Refer to `spaCy webpage
-`_ for more information.
+**Q5. I am having trouble installing `shorttext` on Google Cloud Platform. What should I do?**
 
-5. Warning or messages pop up when running models involving neural networks. What is the problem?
+Ans: There is no "Python.h". Run `sudo apt-get install python3-dev` in the SSH shell of the VM instance.
 
-Make sure your `keras` have version >= 2.
 
-6. The following error message appears while loading shorttext:
 
-::
+**Q6. Where is the Sakaguchi spell corrector?**
 
-   ImportError: dlopen: cannot load any more object with static TLS
+Ans: It was removed in release 3.0.0, but you can refer to the `examples/` folder in the
+Github repository for the code.
 
-How do I deal with it?
 
-If you use Tensorflow as your backend, you may experience this problem. This has been pointed
-out by Yeung in the community: `issue
-`_ . You should either reload tensorflow,
-or reinstall, or try to workaround by importing `spaCy` before `shorttext`.
+
+**Q7. Where are `WrappedBERTEncoder` and `BERTScorer`?**
+
+Ans: They were removed in release 3.0.0, but you can install the separate package `shorttext-bert`
+to get the same functionality.
+
+
+
+**Q8. My model files were created by `shorttext` version < 2.0.0. How do I make them readable for version >= 2.0.0?**
+
+Ans: Simply rename those files so that the names ending in `.h5` end in `.weights.h5` instead.
+
 
 Home: :doc:`index`
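Following up on Q8 above, a small migration helper; the `models/` directory is hypothetical, so point the glob at wherever your model files actually live:

```python
import glob
import os

# Rename pre-2.0.0 weight files (*.h5) to the *.weights.h5 naming
# that shorttext >= 2.0.0 expects (see Q8 in the FAQ above).
for oldpath in glob.glob('models/*.h5'):
    if oldpath.endswith('.weights.h5'):
        continue  # already migrated
    newpath = oldpath[:-len('.h5')] + '.weights.h5'
    os.rename(oldpath, newpath)
    print(oldpath, '->', newpath)
```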
-.. _Github: https://github.com/stephenhky/PyShortTextCategorization
 
-Required Packages
+Backend for Keras
 -----------------
 
-- Numpy_ (Numerical Python)
-- SciPy_ (Scientific Python)
-- Scikit-Learn_ (Machine Learning in Python)
-- Theano_ (Symbolic Computing for Deep Learning)
-- keras_ (Deep Learning Library for Theano and Tensorflow)
-- gensim_ (Topic Modeling for Humans)
-- Pandas_ (Python Data Analysis Library)
-- spaCy_ (Industrial Strenglth Natural Language Processing in Python)
-- stemming_ (stemming in Python)
+We use TensorFlow for `keras`.
+
+Possible Solutions for Installation Failures
+--------------------------------------------
+
+Most developers can install `shorttext` with the instructions above. If the installation fails,
+you may try one (or more) of the following:
+
+1. Installing the Python development headers by typing:
+
+
+::
+
+    sudo apt-get install python3-dev
+
+
+
+2. Installing `libc6` and the `g++` compiler by entering
+
+::
+
+    sudo apt-get install libc6 g++
+
+
+
+.. _Github: https://github.com/stephenhky/PyShortTextCategorization
+
 
 Home: :doc:`index`
 
 .. _Numpy: http://www.numpy.org/
 .. _SciPy: https://www.scipy.org/
 .. _Scikit-Learn: http://scikit-learn.org/stable/
+.. _Tensorflow: https://www.tensorflow.org/
 .. _Theano: http://deeplearning.net/software/theano/
+.. _CNTK: https://github.com/Microsoft/CNTK/wiki
 .. _keras: https://keras.io/
 .. _gensim: https://radimrehurek.com/gensim/
 .. _Pandas: http://pandas.pydata.org/
-.. _spaCy: https://spacy.io/
-.. _stemming: https://pypi.python.org/pypi/stemming/
+.. _snowballstemmer: https://github.com/snowballstem/snowball
+.. _Joblib: https://joblib.readthedocs.io/en/latest/
\ No newline at end of file
diff --git a/docs/intro.rst b/docs/intro.rst
index 1d3e5c72..078015af 100644
--- a/docs/intro.rst
+++ b/docs/intro.rst
@@ -1,13 +1,15 @@
 Introduction
 ============
 
-This package `shorttext` is a Python package that facilitates supervised
+This package `shorttext` is a Python package that facilitates supervised and unsupervised
 learning for short text categorization. Due to the sparseness of words and
 the lack of information carried in the short texts themselves, an intermediate
 representation of the texts and documents are needed before they are put into
 any classification algorithm. In this package, it facilitates various types
 of these representations, including topic modeling and word-embedding algorithms.
 
+The package `shorttext` runs on Python 3.9, 3.10, 3.11, and 3.12.
+
 Characteristics:
 
 - example data provided (including subject keywords and NIH RePORT); (see :doc:`tutorial_dataprep`)
@@ -15,10 +17,22 @@ Characteristics:
 - pre-trained word-embedding support; (see :doc:`tutorial_wordembed`)
 - `gensim` topic models (LDA, LSI, Random Projections) and autoencoder; (see :doc:`tutorial_topic`)
 - topic model representation supported for supervised learning using `scikit-learn`; (see :doc:`tutorial_topic`)
-- cosine distance classification; (see :doc:`tutorial_topic`, :doc:`tutorial_umvec`) and
-- neural network classification (including ConvNet, and C-LSTM).
(see :doc:`tutorial_nnlib`)
+- cosine distance classification; (see :doc:`tutorial_topic`, :doc:`tutorial_sumvec`)
+- neural network classification (including ConvNet, and C-LSTM); (see :doc:`tutorial_nnlib`)
+- maximum entropy classification; (see :doc:`tutorial_maxent`)
+- metrics of phrase differences, including soft Jaccard score (using Damerau-Levenshtein distance), and Word Mover's distance (WMD); (see :doc:`tutorial_metrics`)
+- character-level sequence-to-sequence (seq2seq) learning; (see :doc:`tutorial_charbaseseq2seq`)
+- spell correction; (see :doc:`tutorial_spell`)
+
+Author: Kwan Yuet Stephen Ho (LinkedIn_, ResearchGate_)
+Other contributors: `Chinmaya Pancholi `_, `Minseo Kim `_
+
+Contribution
+------------
 
-Author: Kwan-Yuet Ho (LinkedIn_, ResearchGate_)
+If you would like to contribute, feel free to submit pull requests to the `develop` branch.
+You can talk to me in advance through e-mails or the `Issues
+`_ page.
 
 Home: :doc:`index`
diff --git a/docs/links.rst b/docs/links.rst
index ddb16b80..1608ea36 100644
--- a/docs/links.rst
+++ b/docs/links.rst
@@ -9,7 +9,14 @@ Project Codes and Package
 
 .. _Github: https://github.com/stephenhky/PyShortTextCategorization
-.. _PyPI: https://pypi.python.org/pypi/shorttext
+.. _PyPI: https://pypi.org/project/shorttext/
+
+Issues
+------
+
+To report bugs and issues, please go to Issues_.
+
+.. _Issues: https://github.com/stephenhky/PyShortTextCategorization/issues
 
 Gensim Incubator
 ----------------
@@ -19,10 +26,12 @@ by Google Summer of Code (GSoC) project to support the open-source project for `
 Part of his project is to employ the wrapping ideas in `shorttext` to integrate `keras`,
 `scikit-learn` and `gensim`.
 
-Chinmaya's blog post:
+Chinmaya's blog posts: `https://rare-technologies.com/author/chinmaya/
+`_
+
+Chinmaya's proposal for GSoC: `https://github.com/numfocus/gsoc/blob/master/2017/proposals/Chinmaya_Pancholi.md
+`_
 
-* `Google Summer of Code 2017 – Week 1 on Integrating Gensim with scikit-learn and Keras
-  `_
 
 Blog Entries
 ------------
@@ -44,5 +53,22 @@ Blog Entries
 "Word-Embedding Algorithms," *Everything About Data Analytics*, WordPress (2016). [`WordPress
 `_]
 
+"Python Package for Short Text Mining," *Everything About Data Analytics*, WordPress (2016). [`WordPress
+`_]
+
+"Short Text Mining using Advanced Keras Layers and Maxent: shorttext 0.4.1," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+"Word Mover’s Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+"Release of shorttext 0.5.4," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+"Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress
+`_]
+
+"Package shorttext 1.0.0 Released," Medium (2018). [`Medium
+`_]
 
 Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/news.rst b/docs/news.rst
index bc83b6aa..f3e58529 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -1,6 +1,77 @@
 News
 ====
 
+* 04/19/2026: `shorttext` 4.0.0 released.
+* 03/22/2026: `shorttext` 3.1.1 released.
+* 03/02/2026: `shorttext` 3.1.0 released.
+* 10/27/2025: `shorttext` 3.0.1 released.
+* 08/10/2025: `shorttext` 3.0.0 released.
+* 06/02/2025: `shorttext` 2.2.1 released.
+* 05/29/2025: `shorttext` 2.2.0 released.
+* 05/08/2025: `shorttext` 2.1.1 released.
+* 12/14/2024: `shorttext` 2.1.0 released.
+* 07/12/2024: `shorttext` 2.0.0 released.
+* 12/21/2023: `shorttext` 1.6.1 released.
+* 08/26/2023: `shorttext` 1.6.0 released. +* 06/19/2023: `shorttext` 1.5.9 released. +* 09/23/2022: `shorttext` 1.5.8 released. +* 09/22/2022: `shorttext` 1.5.7 released. +* 08/29/2022: `shorttext` 1.5.6 released. +* 05/28/2022: `shorttext` 1.5.5 released. +* 12/15/2021: `shorttext` 1.5.4 released. +* 07/11/2021: `shorttext` 1.5.3 released. +* 07/06/2021: `shorttext` 1.5.2 released. +* 04/10/2021: `shorttext` 1.5.1 released. +* 04/09/2021: `shorttext` 1.5.0 released. +* 02/11/2021: `shorttext` 1.4.8 released. +* 01/11/2021: `shorttext` 1.4.7 released. +* 01/03/2021: `shorttext` 1.4.6 released. +* 12/28/2020: `shorttext` 1.4.5 released. +* 12/24/2020: `shorttext` 1.4.4 released. +* 11/10/2020: `shorttext` 1.4.3 released. +* 10/18/2020: `shorttext` 1.4.2 released. +* 09/23/2020: `shorttext` 1.4.1 released. +* 09/02/2020: `shorttext` 1.4.0 released. +* 07/23/2020: `shorttext` 1.3.0 released. +* 06/05/2020: `shorttext` 1.2.6 released. +* 05/20/2020: `shorttext` 1.2.5 released. +* 05/13/2020: `shorttext` 1.2.4 released. +* 04/28/2020: `shorttext` 1.2.3 released. +* 04/07/2020: `shorttext` 1.2.2 released. +* 03/23/2020: `shorttext` 1.2.1 released. +* 03/21/2020: `shorttext` 1.2.0 released. +* 12/01/2019: `shorttext` 1.1.6 released. +* 09/24/2019: `shorttext` 1.1.5 released. +* 07/20/2019: `shorttext` 1.1.4 released. +* 07/07/2019: `shorttext` 1.1.3 released. +* 06/05/2019: `shorttext` 1.1.2 released. +* 04/23/2019: `shorttext` 1.1.1 released. +* 03/03/2019: `shorttext` 1.1.0 released. +* 02/14/2019: `shorttext` 1.0.8 released. +* 01/30/2019: `shorttext` 1.0.7 released. +* 01/29/2019: `shorttext` 1.0.6 released. +* 01/13/2019: `shorttext` 1.0.5 released. +* 10/03/2018: `shorttext` 1.0.4 released. +* 08/06/2018: `shorttext` 1.0.3 released. +* 07/24/2018: `shorttext` 1.0.2 released. +* 07/17/2018: `shorttext` 1.0.1 released. +* 07/14/2018: `shorttext` 1.0.0 released. +* 06/18/2018: `shorttext` 0.7.2 released. +* 05/30/2018: `shorttext` 0.7.1 released. +* 05/17/2018: `shorttext` 0.7.0 released. +* 02/27/2018: `shorttext` 0.6.0 released. +* 01/19/2018: `shorttext` 0.5.11 released. +* 01/15/2018: `shorttext` 0.5.10 released. +* 12/14/2017: `shorttext` 0.5.9 released. +* 11/08/2017: `shorttext` 0.5.8 released. +* 10/27/2017: `shorttext` 0.5.7 released. +* 10/17/2017: `shorttext` 0.5.6 released. +* 09/28/2017: `shorttext` 0.5.5 released. +* 09/08/2017: `shorttext` 0.5.4 released. +* 09/02/2017: end of GSoC project. +* 08/22/2017: `shorttext` 0.5.1 released. +* 07/28/2017: `shorttext` 0.4.1 released. +* 07/26/2017: `shorttext` 0.4.0 released. * 06/16/2017: `shorttext` 0.3.8 released. * 06/12/2017: `shorttext` 0.3.7 released. * 06/02/2017: `shorttext` 0.3.6 released. @@ -17,14 +88,455 @@ News * 11/21/2016: `shorttext` 0.1.1 released. 
 What's New
-==========
+----------
+
+Release 4.0.0 (April 19, 2026)
+------------------------------
+
+* Removed support for Python 3.9 and 3.10;
+* New style of documentation;
+* New docstrings;
+* Type hinting;
+* Dependence on `gensim` reduced to topic modeling related functions and Word2Vec embedding;
+* Modernizing the use of `keras`;
+* The use of `loguru`, `orjson`, `sparse` and `npdict` libraries;
+* Code cleaned up and debugged;
+* Removed the old implementation of document-term matrix, and replaced it with `NumpyDocumentTermMatrix`;
+* Implementation of cosine similarity optimized by `numba` instead of using the cosine distance from `scipy`;
+* All unit tests and regression tests rewritten, and run by `pytest`.
+
+Release 3.1.1 (March 22, 2026)
+------------------------------
+
+* Change to Github actions.
+
+Release 3.1.0 (March 2, 2026)
+-----------------------------
+
+* Support for Python 3.13;
+* Removal of dependence on `nptyping`.
+
+Release 3.0.1 (October 25, 2025)
+--------------------------------
+
+* Small bugs fixed.
+
+Release 3.0.0 (August 10, 2025)
+-------------------------------
+
+* Introduction of Github workflow, publishing package directly to PyPI from Github;
+* Removal of Sakaguchi spell corrector; (refer to the `examples/` folder in the repository)
+* Removal of `WrappedBERTEncoder` and `BERTScorer`; (they can be installed from the package `shorttext-bert
+`_)
+* Update of documentation.
+
+
+Release 2.2.1 (June 2, 2025)
+----------------------------
+
+* Code cleanup for token categorization. (Acknowledgements: Minseo Kim)
+
+
+Release 2.2.0 (May 29, 2025)
+----------------------------
+
+* Update `keras` to `tensorflow.keras`. (Acknowledgements: Minseo Kim)
+
+
+Release 2.1.1 (May 8, 2025)
+---------------------------
+
+* Update of Snowball stemmer;
+* Codes cleaned up.
+
+Release 2.1.0 (December 14, 2024)
+---------------------------------
+
+* Use of `pyproject.toml` for package distribution.
+* Removed Cython components.
+* Huge relative import refactoring.
+
+Release 2.0.0 (July 13, 2024)
+-----------------------------
+
+* Decommissioned support for Python 3.8.
+* Added support for Python 3.12.
+* Updated file extensions for model files.
+
+Release 1.6.1 (December 21, 2023)
+---------------------------------
+
+* Updated package requirements.
+
+Release 1.6.0 (August 26, 2023)
+-------------------------------
+
+* Pinned requirements for ReadTheDocs documentation;
+* Fixed bugs in word-embedding model mean pooling classifiers;
+* Updated package requirements.
+
+
+Release 1.5.9 (June 19, 2023)
+-----------------------------
+
+* Support for Python 3.11;
+* Removing flask.
+
+Release 1.5.8 (September 23, 2022)
+----------------------------------
+
+* Package administration.
+
+Release 1.5.7 (September 22, 2022)
+----------------------------------
+
+* Removal of requirement of pre-installation of `numpy` and `Cython`.
+
+Release 1.5.6 (August 29, 2022)
+-------------------------------
+
+* Speeding up inference of `VarNNEmbeddedVecClassifier`. (Acknowledgement: Ritesh Agrawal)
+
+Release 1.5.5 (May 28, 2022)
+-----------------------------
+
+* Support for Python 3.10.
+
+
+Release 1.5.4 (December 15, 2021)
+---------------------------------
+
+* Non-negative stop words.
+
+Release 1.5.3 (July 11, 2021)
+-----------------------------
+
+* Documentation updated.
+
+Release 1.5.2 (July 6, 2021)
+----------------------------
+
+* Resolved bugs regarding `keras` import.
+* Support for Python 3.9.
+
+Release 1.5.1 (April 10, 2021)
+------------------------------
+
+* Replaced TravisCI with CircleCI in the continuous integration pipeline.
+
+Release 1.5.0 (April 09, 2021)
+------------------------------
+
+* Removed support for Python 3.6.
+* Removed buggy BERT representations unit test.
+
+Release 1.4.8 (February 11, 2021)
+---------------------------------
+
+* Updated requirements for `scipy` for Python 3.7 or above.
+
+Release 1.4.7 (January 11, 2021)
+--------------------------------
+
+* Updated version of `transformers` in `requirement.txt`;
+* Updated BERT encoder for the change of implementation;
+* Fixed unit tests.
+
+Release 1.4.6 (January 3, 2021)
+-------------------------------
+
+* Bug fixed regarding Python 3.6 requirement for `scipy`.
+
+Release 1.4.5 (December 28, 2020)
+---------------------------------
+
+* Bugs fixed about Python 2 to 3 updates, `filter` in `shorttext.metrics.embedfuzzy`.
+
+Release 1.4.4 (December 24, 2020)
+---------------------------------
+
+* Bugs regarding `SumEmbedVecClassification.py`;
+* Fixing bugs due to Python 3.6 restriction on `scipy`.
+
+
+Release 1.4.3 (November 10, 2020)
+---------------------------------
+
+* Bugs about transformer-based model on different devices resolved.
+
+Release 1.4.2 (October 18, 2020)
+----------------------------------
+
+* Documentation requirements and PyUp configs cleaned up.
+
+Release 1.4.1 (September 23, 2020)
+----------------------------------
+
+* Documentation and codes cleaned up.
+
+Release 1.4.0 (September 2, 2020)
+---------------------------------
+
+* Provided support for BERT-based sentence and token embeddings;
+* Implemented support for BERTScores.
+
+Release 1.3.0 (July 23, 2020)
+-----------------------------
+
+* Removed all dependencies on `PuLP`; all computations of word mover's distance (WMD) are performed using `SciPy`.
+
+Release 1.2.6 (June 20, 2020)
+-----------------------------
+
+* Removed Python-2 codes (`urllib2`).
+
+Release 1.2.5 (May 20, 2020)
+----------------------------
+
+* Update on `gensim` package usage and requirements;
+* Removed some deprecated functions.
+
+Release 1.2.4 (May 13, 2020)
+----------------------------
+
+* Update on `scikit-learn` requirements to `>=0.23.0`.
+* Direct dependence on `joblib`;
+* Support for Python 3.8 added.
+
+Release 1.2.3 (April 28, 2020)
+------------------------------
+
+* PyUP scan implemented;
+* Support for Python 3.5 decommissioned.
+
+Release 1.2.2 (April 7, 2020)
+-----------------------------
+
+* Removed dependence on `PyStemmer`, which is replaced by `snowballstemmer`.
+
+Release 1.2.1 (March 23, 2020)
+------------------------------
+
+* Added port number adjustability for word-embedding API;
+* Removal of Spacy dependency.
+
+Release 1.2.0 (March 21, 2020)
+------------------------------
+
+* API for word-embedding algorithm for one-time loading.
+
+
+Release 1.1.6 (December 1, 2019)
+--------------------------------
+
+* Compatibility with TensorFlow 2.0.0.
+
+
+Release 1.1.5 (September 24, 2019)
+----------------------------------
+
+* Decommissioned GCP buckets; using data files stored in AWS S3 buckets.
+
+
+Release 1.1.4 (July 20, 2019)
+-----------------------------
+
+* Minor bugs fixed.
+
+Release 1.1.3 (July 7, 2019)
+----------------------------
+
+* Updated codes for Console code loading;
+* Updated Travis CI script.
+
+Release 1.1.2 (June 5, 2019)
+-----------------------------
+
+* Updated codes for FastText model loading as the previous function was deprecated.
+
+Release 1.1.1 (April 23, 2019)
+------------------------------
+
+* Bug fixed. (Acknowledgement: `Hamish Dickson
+ `_ )
+
+Release 1.1.0 (March 3, 2019)
+-----------------------------
+
+* Size of embedded vectors set to 300 again when necessary; (possibly breaking compatibility)
+* Moving corpus data from Github to Google Cloud Storage.
+
+
+Release 1.0.8 (February 14, 2019)
+---------------------------------
+
+* Minor bugs fixed.
+
+
+Release 1.0.7 (January 30, 2019)
+--------------------------------
+
+* Compatibility with Python 3.7 with TensorFlow or Theano as the backend;
+* Minor documentation changes.
+
+
+Release 1.0.6 (January 29, 2019)
+--------------------------------
+
+* Documentation change;
+* Word-embedding model used in unit test stored in Amazon S3 bucket.
+
+
+Release 1.0.5 (January 13, 2019)
+--------------------------------
+
+* Minor versioning bug fixed.
+
+
+Release 1.0.4 (October 3, 2018)
+-------------------------------
+
+* Package `keras` requirement updated;
+* Less dependence on `pandas`.
+
+
+Release 1.0.3 (August 6, 2018)
+------------------------------
+
+* Bugs regarding I/O of `SumEmbeddedVecClassifier`.
+
+Release 1.0.2 (July 24, 2018)
+-----------------------------
+
+* Minor bugs regarding installation fixed.
+
+Release 1.0.1 (July 14, 2018)
+-----------------------------
+
+* Minor bugs fixed.
+
+Release 1.0.0 (July 14, 2018)
+-----------------------------
+
+* Python-3 compatibility;
+* Replacing the original stemmer to use Snowball;
+* Certain functions cythonized;
+* Various bugs fixed.
+
+Release 0.7.2 (June 18, 2018)
+-----------------------------
+
+* Damerau-Levenshtein distance and longest common prefix implemented using Cython.
+
+Release 0.7.1 (May 30, 2018)
+----------------------------
+
+* Decorator replaced by base class `CompactIOMachine`;
+* API included in documentation.
+
+
+Release 0.7.0 (May 17, 2018)
+----------------------------
+
+* Spelling corrections and fuzzy logic;
+* More unit tests.
+
+
+Release 0.6.0 (February 27, 2018)
+---------------------------------
+
+* Support of character-based sequence-to-sequence (seq2seq) models.
+
+
+Release 0.5.11 (January 19, 2018)
+---------------------------------
+
+* Removal of word-embedding `keras`-type layers.
+
+Release 0.5.10 (January 15, 2018)
+---------------------------------
+
+* Support of encoder module for character-based models;
+* Implementation of document-term matrix (DTM).
+
+Release 0.5.9 (December 14, 2017)
+---------------------------------
+
+* Support of Poincare embedding;
+* Code optimization;
+* Script `ShortTextWord2VecSimilarity` updated to `ShortTextWordEmbedSimilarity`.
+
+Release 0.5.8 (November 8, 2017)
+--------------------------------
+
+* Removed most explicit user-specification of `vecsize` for given word-embedding models;
+* Removed old namespace for topic models (no more backward compatibility).
+* Integration of `FastText <https://github.com/facebookresearch/fastText>`_.
+
+
+Release 0.5.7 (October 27, 2017)
+--------------------------------
+
+* Removed most explicit user-specification of `vecsize` for given word-embedding models;
+* Removed old namespace for topic models (hence no more backward compatibility).
+
+Release 0.5.6 (October 17, 2017)
+--------------------------------
+
+* Updated the neural network framework due to the change in `gensim` API.
+
+Release 0.5.5 (September 28, 2017)
+----------------------------------
+
+* Script `ShortTextCategorizerConsole` updated.
+
+Release 0.5.4 (September 8, 2017)
+---------------------------------
+
+* Bug fixed;
+* New scripts for finding distances between sentences;
+* Finding similarity between two sentences using Jaccard index.
+
+End of GSoC Program (September 2, 2017)
+---------------------------------------
+
+Chinmaya summarized his GSoC program in his blog post posted in `RaRe Incubator
+`_.
+
+
+Release 0.5.1 (August 22, 2017)
+-------------------------------
+
+* Implementation of Damerau-Levenshtein distance and soft Jaccard score;
+* Implementation of Word Mover's distance.
+
+
+Release 0.4.1 (July 28, 2017)
+-----------------------------
+
+* Further Travis CI update tests;
+* Model file I/O updated (for huge models);
+* Migrating documentation to `readthedocs.org <https://readthedocs.org>`_; previous documentation at `Pythonhosted.org` destroyed.
+
+
+Release 0.4.0 (July 26, 2017)
+-----------------------------
+
+* Maximum entropy models;
+* Use of `gensim` Word2Vec `keras` layers;
+* Incorporating new features from `gensim`;
+* Use of Travis CI for pull request testing.
 
 Release 0.3.8 (June 16, 2017)
 -----------------------------
 
 * Bug fixed on `sumvecframeworks`.
-
 Release 0.3.7 (June 12, 2017)
 -----------------------------
@@ -38,7 +550,7 @@ Release 0.3.6 (June 2, 2017)
 * Added "update" corpus capability to `gensim` models.
 
 Google Summer of Code (May 30, 2017)
------------------------------------------
+------------------------------------
 
 Chinamaya Pancholi, a Google Summer of Code (GSoC) student, is involved in the
 open-source development of `gensim`, that his project will be very related
diff --git a/docs/refs.rst b/docs/refs.rst
index 8ccc05da..75b7de5a 100644
--- a/docs/refs.rst
+++ b/docs/refs.rst
@@ -1,7 +1,17 @@
 References
 ==========
 
-Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996).
+Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM
+`_]
+
+Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+`_]
+
+Chinmaya Pancholi, "Gensim integration with scikit-learn and Keras," *Google Summer of Code* (GSoC) proposal (2017). [`Github
+`_]
+
+Chinmaya Pancholi, "Chinmaya’s GSoC 2017 Summary: Integration with sklearn & Keras and implementing fastText," *RaRe Incubator* (September 2, 2017). [`RaRe
+`_]
 
 Christopher Manning, Hinrich Schütze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press
 `_]
@@ -12,28 +22,73 @@ Christopher D. Manning, Prabhakar Raghavan, Hinrich Schütze, *Introduction to I
 Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network for Text Classification," (arXiv:1511.08630). [`arXiv
 `_]
 
+Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE
+`_]
+
+Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A.
Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ
+`_]
+
+Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare
+`_]
+
 David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992).
 
-David M. Blei, "Probabilistic Topic Models," *Communications of the ACM* 55(4): 77-84 (2012).
+David M. Blei, "Probabilistic Topic Models," *Communications of the ACM* 55(4): 77-84 (2012). [`ACM
+`_]
+
+Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," *The Keras Blog*. [`Keras
+`_]
 
 Francois Chollet, "Building Autoencoders in Keras," *The Keras Blog*. [`Keras
 `_]
 
-Hsiang-Fu Yu, Chia-Hua Ho, Yu-Chin Juan, and Chih-Jen Lin, "LibShortText: A Library for Short-text Classification." [`NTU
+Hsiang-Fu Yu, Chia-Hua Ho, Yu-Chin Juan, Chih-Jen Lin, "LibShortText: A Library for Short-text Classification." [`NTU
 `_]
 
+Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto
+`_]
+
+Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv
+`_]
+
+Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe
+`_]
+
 Jeffrey Pennington, Richard Socher, Christopher D. Manning, “GloVe: Global Vectors for Word Representation,” *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF
 `_]
 
+Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Network," arXiv:1608.02214 (2016). [`arXiv
+`_]
+
 "Keras 2.0 Release Notes." (2017) [`Github
 `_]
 
+Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
+
+Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv
+`_]
+
 Michael Czerny, "Modern Methods for Sentiment Analysis," *District Data Labs (2015). [`DistrictDataLabs
 `_]
 
 M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015).
 
+Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv
+`_]
+
+Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv
+`_]
+
+Peter Norvig, "How to write a spell corrector." (2016) [`Norvig
+`_]
+
+Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv
+`_]
+
+Radim Rehurek, Petr Sojka, "Software Framework for Topic Modelling with Large Corpora," In Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks (2010). [`ResearchGate
+`_]
+
 Sebastian Ruder, "An overview of gradient descent optimization algorithms," blog of Sebastian Ruder, arXiv:1609.04747 (2016). [`Ruder
 `_ or `arXiv
 `_]
@@ -48,6 +103,9 @@ Thomas W.
Jones, "textmineR: Functions for Text Mining and Topic Modeling," CRAN Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, “Efficient Estimation of Word Representations in Vector Space,” *ICLR* 2013 (2013). [`arXiv `_] +Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). [`arXiv +`_] + Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..f832a351 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,17 @@ +numpy==2.4.4 +scipy==1.17.1 +joblib==1.5.3 +scikit-learn==1.8.0 +tensorflow==2.21.0 +keras==3.14.0 +gensim==4.4.0 +pandas==3.0.2 +snowballstemmer==3.0.1 +transformers==5.5.4 +torch==2.11.0 +numba==0.65.0 +deprecation==2.1.0 +npdict==0.0.10 +orjson==3.11.8 +sparse==0.18.0 +loguru==0.7.3 diff --git a/docs/requirements_minimal.txt b/docs/requirements_minimal.txt new file mode 100644 index 00000000..9fa0372a --- /dev/null +++ b/docs/requirements_minimal.txt @@ -0,0 +1,10 @@ +numpy>=1.23.3 +scipy>=1.12.0 +snowballstemmer>=3.0.0 +scikit-learn>=1.2.0 +pandas>=1.2.0 +deprecation>=2.1 +orjson>=3.11.0 +loguru>=0.7.0 +npdict>=0.0.10 +sparse>=0.18.0 diff --git a/docs/scripts.rst b/docs/scripts.rst index c70c9356..a729b6ed 100644 --- a/docs/scripts.rst +++ b/docs/scripts.rst @@ -10,33 +10,46 @@ ShortTextCategorizerConsole :: - usage: ShortTextCategorizerConsole [-h] [--wv WV] [--topn TOPN] model_filepath + usage: ShortTextCategorizerConsole [-h] [--wv WV] [--vecsize VECSIZE] + [--topn TOPN] [--inputtext INPUTTEXT] + [--type TYPE] + model_filepath Perform prediction on short text with a given trained model. positional arguments: - model_filepath Path of the trained (compact) model. + model_filepath Path of the trained (compact) model. - optional arguments: - -h, --help show this help message and exit - --wv WV Path of the pre-trained Word2Vec model. (None if not needed) - --topn TOPN Number of top-scored results displayed. (Default: 10) + options: + -h, --help show this help message and exit + --wv WV Path of the pre-trained Word2Vec model. (None if not + needed) + --vecsize VECSIZE Vector dimensions. (Default: 300) + --topn TOPN Number of top-scored results displayed. (Default: 10) + --inputtext INPUTTEXT + single input text for classification. Run console if + set to None. (Default: None) + --type TYPE Type of word-embedding model (default: "word2vec"; + other options: "fasttext", "poincare", + "word2vec_nonbinary", "poincare_binary") -ShortTextWord2VecSimilarity ---------------------------- +ShortTextWordEmbedSimilarity +---------------------------- :: - usage: ShortTextWord2VecSimilarity [-h] word2vec_modelpath + usage: ShortTextWordEmbedSimilarity [-h] [--type TYPE] modelpath - Find the similarity between two short sentences using Word2Vec. + Find the similarities between two short sentences using Word2Vec. 
positional arguments:
-  word2vec_modelpath  Path of the Word2Vec model
+  modelpath   Path of the Word2Vec model
 
 optional arguments:
-  -h, --help          show this help message and exit
+  -h, --help  show this help message and exit
+  --type TYPE  Type of word-embedding model (default: "word2vec"; other
+               options: "fasttext", "poincare")
 
 
 Home: :doc:`index`
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index e74c13f1..16ff691b 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -8,20 +8,22 @@ Before using, type
 
 >>> import shorttext
 
-You will get the message that `Theano` or `Tensorflow` backend is used for `keras`. Refer to `Keras Backend
-`_ for information about switching backends.
-
 .. toctree::
    :maxdepth: 2
 
    tutorial_dataprep
    tutorial_textpreprocessing
+   tutorial_dtm
+   tutorial_charbaseonehot
    tutorial_topic
    tutorial_wordembed
    tutorial_sumvec
    tutorial_nnlib
    tutorial_maxent
+   tutorial_charbaseseq2seq
   tutorial_stacking
+   tutorial_metrics
+   tutorial_spell
 
 Home: :doc:`index`
diff --git a/docs/tutorial_charbaseonehot.rst b/docs/tutorial_charbaseonehot.rst
new file mode 100644
index 00000000..0fb38993
--- /dev/null
+++ b/docs/tutorial_charbaseonehot.rst
@@ -0,0 +1,66 @@
+Character to One-Hot Vector
+===========================
+
+Since version 0.6.1, the package `shorttext` deals with character-based models. A first important
+component of a character-based model is to convert every character to a one-hot vector. We provide a class
+:class:`shorttext.generators.SentenceToCharVecEncoder` to deal with this. This class incorporates
+the `OneHotEncoder` in `scikit-learn` and `Dictionary` in `gensim`.
+
+To use this, import the packages first:
+
+>>> import numpy as np
+>>> import shorttext
+
+Then we incorporate a text file as the source of all characters to be coded. In this case, we choose
+the file `big.txt` on Peter Norvig's website:
+
+>>> from urllib.request import urlopen
+>>> textfile = urlopen('http://norvig.com/big.txt')
+
+Then instantiate the class using the function :func:`shorttext.generators.initSentenceToCharVecEncoder`:
+
+>>> chartovec_encoder = shorttext.generators.initialize_SentenceToCharVecEncoder(textfile)
+
+Now, the object `chartovec_encoder` is an instance of :class:`shorttext.generators.SentenceToCharVecEncoder` . The
+default signal character is `\n`, which is also encoded, and can be checked by looking at the field:
+
+>>> chartovec_encoder.signalchar
+
+We can convert a sentence into a bunch of one-hot vectors in terms of a matrix. For example,
+
+>>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100)
+<1x93 sparse matrix of type ''
+	with 1 stored elements in Compressed Sparse Column format>
+
+This outputs a sparse matrix. Depending on your needs, you can add the signal character to the beginning
+or the end of the sentence in the output matrix by:
+
+>>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=True, endsig=False)
+>>> chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=False, endsig=True)
+
+We can also convert a list of sentences by
+
+>>> chartovec_encoder.encode_sentences(sentences, 100, startsig=False, endsig=True, sparse=False)
+
+You can decide whether or not to output a sparse matrix by specifying the parameter `sparse`.
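+
+As an illustrative sketch (assuming the returned object is a `scipy` sparse matrix, as shown
+above), one can convert an encoded sentence to a dense `numpy` array to inspect its shape:
+
+>>> mat = chartovec_encoder.encode_sentence('Maryland blue crab!', 100, startsig=True, endsig=True)
+>>> dense = mat.toarray()    # dense numpy array of one-hot rows
+>>> dense.shape              # (number of rows, size of the character vocabulary)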
+
+.. automodule:: shorttext.generators.charbase.char2vec
+   :members:
+
+
+Reference
+---------
+
+Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+`_]
+
+Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/tutorial_charbaseseq2seq.rst b/docs/tutorial_charbaseseq2seq.rst
new file mode 100644
index 00000000..2d6ca988
--- /dev/null
+++ b/docs/tutorial_charbaseseq2seq.rst
@@ -0,0 +1,100 @@
+Character-Based Sequence-to-Sequence (seq2seq) Models
+=====================================================
+
+Since release 0.6.0, `shorttext` supports sequence-to-sequence (seq2seq) learning. While there is a general seq2seq class
+behind it, the package provides a character-based seq2seq implementation.
+
+Creating One-hot Vectors
+------------------------
+
+To use it, create an instance of the class :class:`shorttext.generators.SentenceToCharVecEncoder`:
+
+>>> import numpy as np
+>>> import shorttext
+>>> from urllib.request import urlopen
+>>> chartovec_encoder = shorttext.generators.initialize_SentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt'))
+
+The above code is the same as :doc:`tutorial_charbaseonehot` .
+
+.. automodule:: shorttext.generators.charbase.char2vec
+   :members: initSentenceToCharVecEncoder
+
+
+Training
+--------
+
+Then we can train the model by creating an instance of :class:`shorttext.generators.CharBasedSeq2SeqGenerator`:
+
+>>> latent_dim = 100
+>>> seq2seqer = shorttext.generators.CharBasedSeq2SeqGenerator(chartovec_encoder, latent_dim, 120)
+
+And then train this neural network model on a given training text `text`:
+
+>>> seq2seqer.train(text, epochs=100)
+
+This model takes several hours to train on a laptop.
+
+
+.. autoclass:: shorttext.generators.seq2seq.charbaseS2S.CharBasedSeq2SeqGenerator
+   :members:
+
+Decoding
+--------
+
+After training, we can use this class as a generative model
+that answers questions, like a chatbot:
+
+>>> seq2seqer.decode('Happy Holiday!')
+
+It does not give definite answers because there is stochasticity in the prediction.
+
+Model I/O
+---------
+
+This model can be saved by entering:
+
+>>> seq2seqer.save_compact_model('/path/to/norvigtxt_iter5model.bin')
+
+And can be loaded by:
+
+>>> seq2seqer2 = shorttext.generators.seq2seq.charbaseS2S.loadCharBasedSeq2SeqGenerator('/path/to/norvigtxt_iter5model.bin')
+
+.. automodule:: shorttext.generators.seq2seq.charbaseS2S
+   :members: loadCharBasedSeq2SeqGenerator
+
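+
+A typical round trip (a sketch that only recombines the calls shown above; the file path is
+illustrative) is thus to train, save, reload, and decode:
+
+>>> seq2seqer.train(text, epochs=100)
+>>> seq2seqer.save_compact_model('/path/to/norvigtxt_iter5model.bin')
+>>> seq2seqer2 = shorttext.generators.seq2seq.charbaseS2S.loadCharBasedSeq2SeqGenerator('/path/to/norvigtxt_iter5model.bin')
+>>> seq2seqer2.decode('Happy Holiday!')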
+
+Reference
+---------
+
+Aurelien Geron, *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (Sebastopol, CA: O'Reilly Media, 2017). [`O\'Reilly
+`_]
+
+Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," *ICML* (2011). [`UToronto
+`_]
+
+Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," arXiv:1409.3215 (2014). [`arXiv
+`_]
+
+Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015). [`arXiv
+`_]
+
+Tom Young, Devamanyu Hazarika, Soujanya Poria, Erik Cambria, "Recent Trends in Deep Learning Based Natural Language Processing," arXiv:1708.02709 (2017). [`arXiv
+`_]
+
+Zackary C. Lipton, John Berkowitz, "A Critical Review of Recurrent Neural Networks for Sequence Learning," arXiv:1506.00019 (2015). [`arXiv
+`_]
+
diff --git a/docs/tutorial_dataprep.rst b/docs/tutorial_dataprep.rst
index 25080bf6..9d7f9d4e 100644
--- a/docs/tutorial_dataprep.rst
+++ b/docs/tutorial_dataprep.rst
@@ -32,6 +32,10 @@ the subject keywords, as below:
     'Holy Trinity', 'eschatology', 'scripture', 'ecclesiology', 'predestination',
     'divine degree', 'creedal confessionalism', 'scholasticism', 'prayer',
     'eucharist']}
+
+.. automodule:: shorttext.data.data_retrieval
+   :members: subjectkeywords
+
 
 Example Training Data 2: NIH RePORT
 -----------------------------------
@@ -55,9 +59,9 @@ randomly drawn from the original data.
 
 However, there are other configurations:
 
-.. autofunction:: shorttext.data.nihreports
+.. automodule:: shorttext.data.data_retrieval
+   :members: nihreports
 
-If `sample_size` is specified to be `None`, all the data will be retrieved without sampling.
 
 Example Training Data 3: Inaugural Addresses
 --------------------------------------------
@@ -73,7 +77,8 @@ Enter:
 
 >>> trainclassdict = shorttext.data.inaugural()
 
-.. autfunction:: shorttext.data.inaugural
+.. automodule:: shorttext.data.data_retrieval
+   :members: inaugural
 
 
 User-Provided Training Data
@@ -110,4 +115,8 @@ To load this data file, just enter:
 
 >>> trainclassdict = shorttext.data.retrieve_csvdata_as_dict('/path/to/file.csv')
 
+.. automodule:: shorttext.data.data_retrieval
+   :members: retrieve_csvdata_as_dict
+
+
 Home: :doc:`index`
diff --git a/docs/tutorial_dtm.rst b/docs/tutorial_dtm.rst
new file mode 100644
index 00000000..eb2355c9
--- /dev/null
+++ b/docs/tutorial_dtm.rst
@@ -0,0 +1,80 @@
+Document-Term Matrix
+====================
+
+Preparing for the Corpus
+------------------------
+
+We can create and handle a document-term matrix (DTM) with `shorttext`. We use the dataset of Presidents'
+Inaugural Addresses as an example.
+
+>>> import shorttext
+>>> usprez = shorttext.data.inaugural()
+
+We have to make each president's address a single document for our purpose. Enter this:
+
+>>> docids = sorted(usprez.keys())
+>>> usprez = [' '.join(usprez[docid]) for docid in docids]
+
+Now the variable `usprez` is a list of 56 Inaugural Addresses from George Washington (1789) to
+Barack Obama (2009), with the IDs stored in `docids`. We apply the standard text preprocessor and
+produce a list of lists (of tokens) (or a corpus in `gensim`):
+
+>>> preprocess = shorttext.utils.standard_text_preprocessor_1()
+>>> corpus = [preprocess(address).split(' ') for address in usprez]
+
+Now the variable `corpus` is a list of lists of tokens. For example,
+
+>>> corpus[0]    # shows all the preprocessed tokens of the first Presidential Inaugural Address
+
+Using Class `NumpyDocumentTermMatrix`
+-------------------------------------
+
+Note: the old class `DocumentTermMatrix` was removed in release 4.0.0.
+
+With the corpus ready in this form, we can create a `NumpyDocumentTermMatrix` instance for the DTM
+(tf-idf is imposed while creating the instance by setting `tfidf` to `True`):
+
+>>> dtm = shorttext.utils.NumpyDocumentTermMatrix(corpus, docids, tfidf=True)
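+
+To make the matrix structure concrete, here is a minimal illustrative sketch (plain Python and
+`numpy`, not the package's implementation) of a document-term count matrix for a toy corpus:
+
+>>> import numpy as np
+>>> toy_corpus = [['short', 'text'], ['short', 'note']]
+>>> vocab = sorted({token for doc in toy_corpus for token in doc})    # ['note', 'short', 'text']
+>>> np.array([[doc.count(token) for token in vocab] for doc in toy_corpus])    # rows: documents; columns: token counts
+array([[0, 1, 1],
+       [1, 1, 0]])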
+
+.. autoclass:: shorttext.utils.dtm.NumpyDocumentTermMatrix
+   :members:
+
+One can get the document frequency of any token (the number of documents that the given
+token is in) by:
+
+>>> dtm.get_doc_frequency('peopl') # gives 54, the document frequency of the token "peopl"
+
+or the total term frequencies (the total number of occurrences of the given tokens in all documents) by:
+
+>>> dtm.get_total_termfreq('justic') # gives 32.32, the total term frequency of the token "justic"
+
+or the term frequency for a token in a given document by:
+
+>>> dtm.get_termfreq('2009-Obama', 'chang') # gives 0.94
+
+We can also query the number of occurrences of a particular word in all documents,
+stored in a dictionary, by:
+
+>>> dtm.get_token_occurences('god')
+
+To save the class, enter:
+
+>>> dtm.save_compact_model('/path/to/whatever.bin')
+
+To load this class later, enter:
+
+>>> dtm2 = shorttext.utils.load_numpy_documenttermmatrix('/path/to/whatever.bin')
+
+.. automodule:: shorttext.utils.dtm
+   :members: load_numpy_documenttermmatrix
+
+Reference
+---------
+
+Christopher Manning, Hinrich Schuetze, *Foundations of Statistical Natural Language Processing* (Cambridge, MA: MIT Press, 1999). [`MIT Press
+`_]
+
+"Document-Term Matrix: Text Mining in R and Python," *Everything About Data Analytics*, WordPress (2018). [`WordPress
+`_]
+
+Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/tutorial_maxent.rst b/docs/tutorial_maxent.rst
index 407a4234..02994c11 100644
--- a/docs/tutorial_maxent.rst
+++ b/docs/tutorial_maxent.rst
@@ -1,6 +1,9 @@
 Maximum Entropy (MaxEnt) Classifier
 ===================================
 
+Maxent
+------
+
 Maximum entropy (maxent) classifier has been a popular text classifier, by
 parameterizing the model to achieve maximum categorical entropy, with the constraint that the
 resulting probability on the training data with the model being equal to the real distribution.
@@ -25,7 +28,7 @@ The classifier can be instantiated by:
 
 Train the classifier:
 
->>> classifier.train(classdict, nb_epochs=1000)
+>>> classifier.train(classdict, nb_epochs=300)
 
 After training, it can be used for classification, such as
 
@@ -35,8 +38,27 @@ After training, it can be used for classification, such as
 
 To save the model,
 
->>> classifier.score('/path/to/filename.bin')
+>>> classifier.save_compact_model('/path/to/filename.bin')
 
 To load the model to be a classifier, enter:
 
 >>> classifier2 = shorttext.classifiers.load_maxent_classifier('/path/to/filename.bin')
+
+
+.. automodule:: shorttext.classifiers.bow.maxent.MaxEntClassification
+   :members:
+
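+
+For instance, to list the top three classes by score (a sketch, assuming that `score` returns a
+dictionary mapping class labels to scores, as in the classification example above):
+
+>>> scores = classifier.score('artificial intelligence')
+>>> sorted(scores.items(), key=lambda item: item[1], reverse=True)[:3]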
+
+Reference
+---------
+
+Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). [`ACM
+`_]
+
+Daniel E. Russ, Kwan-Yuet Ho, Joanne S. Colt, Karla R. Armenti, Dalsu Baris, Wong-Ho Chow, Faith Davis, Alison Johnson, Mark P. Purdue, Margaret R. Karagas, Kendra Schwartz, Molly Schwenn, Debra T. Silverman, Patricia A. Stewart, Calvin A. Johnson, Melissa C. Friesen, “Computer-based coding of free-text job descriptions to efficiently and reliably incorporate occupational risk factors into large-scale epidemiological studies”, *Occup. Environ. Med.* 73, 417-424 (2016). [`BMJ
+`_]
+
+Daniel Russ, Kwan-yuet Ho, Melissa Friesen, "It Takes a Village To Solve A Problem in Data Science," Data Science Maryland, presentation at Applied Physics Laboratory (APL), Johns Hopkins University, on June 19, 2017. (2017) [`Slideshare
+`_]
+
+Home: :doc:`index`
\ No newline at end of file
diff --git a/docs/tutorial_metrics.rst b/docs/tutorial_metrics.rst
new file mode 100644
index 00000000..920e3648
--- /dev/null
+++ b/docs/tutorial_metrics.rst
@@ -0,0 +1,418 @@
+Metrics
+=======
+
+The package `shorttext` provides a few metrics that measure the distances of some kind. They are all
+under the module `shorttext.metrics`. The soft Jaccard score is based on spellings, and the Word Mover's
+distance (WMD) on embedded word vectors.
+
+Edit Distance and Soft Jaccard Score
+------------------------------------
+
+Edit distance, or Damerau-Levenshtein distance, measures the differences
+between two words due to insertion, deletion, transposition, substitution etc.
+Each of these changes causes a distance of 1. The algorithm was written in C.
+
+First import the package:
+
+>>> from shorttext.metrics.dynprog.dldist import damerau_levenshtein
+>>> from shorttext.metrics.dynprog.lcp import longest_common_prefix
+>>> from shorttext.metrics.dynprog import similarity, soft_jaccard_score
+
+The distance can be calculated by:
+
+>>> damerau_levenshtein('diver', 'driver') # insertion, gives 1
+>>> damerau_levenshtein('driver', 'diver') # deletion, gives 1
+>>> damerau_levenshtein('topology', 'tooplogy') # transposition, gives 1
+>>> damerau_levenshtein('book', 'blok') # substitution, gives 1
+
+The longest common prefix finds the length of common prefix:
+
+>>> longest_common_prefix('topology', 'topological') # gives 7
+>>> longest_common_prefix('police', 'policewoman') # gives 6
+
+The similarity between words is defined as the larger of the following:
+
+:math:`s = 1 - \frac{\text{DL distance}}{\max(\text{len}(word_1), \text{len}(word_2))}`
+
+and
+
+:math:`s = \frac{\text{longest common prefix}}{\max(\text{len}(word_1), \text{len}(word_2))}`
+
+>>> similarity('topology', 'topological') # gives 0.6363636363636364
+>>> similarity('book', 'blok') # gives 0.75
+
+Given the similarity, we say that the intersection, for example, between 'book' and 'blok', has 0.75 elements, or the
+union has 1.25 elements. Then the similarity between two sets of tokens can be measured using the Jaccard index, with these
+"soft" numbers of intersection. Therefore,
+
+>>> soft_jaccard_score(['book', 'seller'], ['blok', 'sellers']) # gives 0.6716417910447762
+>>> soft_jaccard_score(['police', 'station'], ['policeman']) # gives 0.2857142857142858
+
+The functions `damerau_levenshtein` and `longest_common_prefix` are implemented using Cython_ .
+(Before release 0.7.2, they were interfaced to Python using SWIG_ (Simplified Wrapper and Interface Generator)).
+
+
+.. automodule:: shorttext.metrics.dynprog.jaccard
+   :members: similarity, soft_jaccard_score
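+
+As a worked check of the two formulas above (using values already given): for 'book' and 'blok',
+the Damerau-Levenshtein distance is 1 and the longest common prefix is 'b', of length 1, so
+
+:math:`s = \max\left( 1 - \frac{1}{4}, \frac{1}{4} \right) = 0.75`
+
+in agreement with `similarity('book', 'blok')` above.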
+ +Import the modules, and load the word-embedding models: + +>>> from shorttext import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this +the embedded word vectors. WMD is a special case for Earth Mover's distance (EMD), or Wasserstein +distance. The calculation of WMD in this package is based on linear programming, and the distance between +words are the Euclidean distance by default (not cosine distance), but user can set it accordingly. + +Import the modules, and load the word-embedding models: + +>>> from shorttext import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this +the embedded word vectors. WMD is a special case for Earth Mover's distance (EMD), or Wasserstein +distance. The calculation of WMD in this package is based on linear programming, and the distance between +words are the Euclidean distance by default (not cosine distance), but user can set it accordingly. + +Import the modules, and load the word-embedding models: + +>>> from shorttext import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this +the embedded word vectors. WMD is a special case for Earth Mover's distance (EMD), or Wasserstein +distance. The calculation of WMD in this package is based on linear programming, and the distance between +words are the Euclidean distance by default (not cosine distance), but user can set it accordingly. + +Import the modules, and load the word-embedding models: + +>>> from shorttext import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this +the embedded word vectors. WMD is a special case for Earth Mover's distance (EMD), or Wasserstein +distance. The calculation of WMD in this package is based on linear programming, and the distance between +words are the Euclidean distance by default (not cosine distance), but user can set it accordingly. 
+ +Import the modules, and load the word-embedding models: + +>>> from shorttext import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this +the embedded word vectors. WMD is a special case for Earth Mover's distance (EMD), or Wasserstein +distance. The calculation of WMD in this package is based on linear programming, and the distance between +words are the Euclidean distance by default (not cosine distance), but user can set it accordingly. + +Import the modules, and load the word-embedding models: + +>>> from shorttext import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this +the embedded word vectors. WMD is a special case for Earth Mover's distance (EMD), or Wasserstein +distance. The calculation of WMD in this package is based on linear programming, and the distance between +words are the Euclidean distance by default (not cosine distance), but user can set it accordingly. + +Import the modules, and load the word-embedding models: + +>>> from shorttext.metrics.wasserstein import word_mover_distance +>>> from shorttext.utils import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') + +Examples: + +>>> word_mover_distance(['police', 'station'], ['policeman'], wvmodel) # gives 3.060708999633789 +>>> word_mover_distance(['physician', 'assistant'], ['doctor', 'assistants'], wvmodel) # gives 2.276337146759033 + +More examples can be found in this `IPython Notebook +`_ . + +In `gensim`, the Word2Vec model allows the calculation of WMD if user installed the package PyEMD_. It is based on the +scale invariant feature transform (SIFT), an algorithm for EMD based on L1-distance (Manhattan distance). +For more details, +please refer to their `tutorial +`_ , and cite the two papers by Ofir Pele and Michael Werman +if it is used. + +.. automodule:: shorttext.metrics.wasserstein.wordmoverdist + :members: word_mover_distance + +Jaccard Index Due to Cosine Distances +------------------------------------- + +In the above section of edit distance, the Jaccard score was calculated by considering soft membership +using spelling. However, we can also compute the soft membership by cosine similarity with + +>>> from shorttext import load_word2vec_model +>>> wvmodel = load_word2vec_model('/path/to/model_file.bin') +>>> from shorttext.metrics.embedfuzzy import jaccardscore_sents + +For example, the number of words between the set containing 'doctor' and that containing 'physician' +is 0.78060223420956831 (according to Google model), and therefore the Jaccard score is +using spelling. 
+
+Jaccard Index Due to Cosine Distances
+-------------------------------------
+
+In the above section on edit distance, the Jaccard score was calculated by considering soft membership
+using spelling. However, we can also compute the soft membership by cosine similarity with
+
+>>> from shorttext.utils import load_word2vec_model
+>>> wvmodel = load_word2vec_model('/path/to/model_file.bin')
+>>> from shorttext.metrics.embedfuzzy import jaccardscore_sents
+
+For example, the soft count of common words between the set containing 'doctor' and that containing 'physician'
+is 0.78060223420956831 (according to the Google model), and therefore the Jaccard score is
+
+:math:`0.78060223420956831 / (2-0.78060223420956831) = 0.6401538990056869`
+
+And it can be seen by running it:
+
+>>> jaccardscore_sents('doctor', 'physician', wvmodel)  # gives 0.6401538990056869
+>>> jaccardscore_sents('chief executive', 'computer cluster', wvmodel)  # gives 0.0022515450768836143
+>>> jaccardscore_sents('topological data', 'data of topology', wvmodel)  # gives 0.67588977344632573
+
+.. automodule:: shorttext.metrics.embedfuzzy.jaccard
+   :members:
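+
+For the one-word case, the score can be reproduced by hand: the soft intersection :math:`i` is the
+cosine similarity between the two word vectors, and the soft union is :math:`1+1-i = 2-i`. A minimal
+sketch, assuming `wvmodel` is the model loaded above:
+
+>>> i = wvmodel.similarity('doctor', 'physician')  # soft intersection; about 0.7806 for the Google model
+>>> i / (2 - i)  # soft Jaccard score; about 0.6402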
+
+
+Reference
+---------
+
+"Damerau-Levenshtein Distance." [`Wikipedia
+`_]
+
+"Jaccard index." [`Wikipedia
+`_]
+
+Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen, "Computer-Based Coding of Occupation Codes for Epidemiological Analyses," *2014 IEEE 27th International Symposium on Computer-Based Medical Systems* (CBMS), pp. 347-350. (2014) [`IEEE
+`_]
+
+Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
+
+Ofir Pele, Michael Werman, "A linear time histogram metric for improved SIFT matching," *Computer Vision - ECCV 2008*, 495-508 (2008). [`ACM
+`_]
+
+Ofir Pele, Michael Werman, "Fast and robust earth mover's distances," *Proc. 2009 IEEE 12th Int. Conf. on Computer Vision*, 460-467 (2009). [`IEEE
+`_]
+
+"Word Mover's Distance as a Linear Programming Problem," *Everything About Data Analytics*, WordPress (2017). [`WordPress
+`_]
+
+
+Home: :doc:`index`
+
+.. _SWIG: http://www.swig.org/
+.. _PyEMD: https://github.com/wmayner/pyemd
+.. _Cython: http://cython.org/
\ No newline at end of file
diff --git a/docs/tutorial_nnlib.rst b/docs/tutorial_nnlib.rst
index 0e2269b1..989857b3 100644
--- a/docs/tutorial_nnlib.rst
+++ b/docs/tutorial_nnlib.rst
@@ -21,7 +21,10 @@ and they are good for short text or document classification. Of course, users ca
 own neural networks, written in `keras`.
 
 A pre-trained Google Word2Vec model can be downloaded `here
-`_.
+`_,
+and a pre-trained Facebook FastText model can be downloaded `here
+`_.
+
 
 See: :doc:`tutorial_wordembed` .
 
@@ -39,12 +42,16 @@ Then load the training data
 
 Then we choose a neural network. We choose ConvNet:
 
->>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys()), vecsize=300)
 
 Initialize the classifier:
 
 >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
 
+.. autoclass:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification.VarNNEmbeddedVecClassifier
+   :members:
+
+
 Then train the classifier:
 
 >>> classifier.train(trainclassdict, kmodel)
 
@@ -82,12 +89,20 @@ To load it, enter:
 
 >>> classifier2 = shorttext.classifiers.load_varnnlibvec_classifier(wvmodel, '/path/to/nnlibvec_convnet_subdata.bin')
 
+.. automodule:: shorttext.classifiers.embed.nnlib.VarNNEmbedVecClassification
+   :members: load_varnnlibvec_classifier
+
+
 Provided Neural Networks
 ------------------------
 
 There are three neural networks available in this package for the use in
 :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier`,
-and they are available in the module :module:`shorttext.classifiers.frameworks`.
+and they are available in the module `shorttext.classifiers.frameworks`.
+
+.. automodule:: shorttext.classifiers.embed.nnlib.frameworks
+   :members:
+
 
 ConvNet (Convolutional Neural Network)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -97,26 +112,22 @@ as demonstrated in Kim's paper.
 
 .. image:: images/nnlib_cnn.png
 
-The function in the frameworks returns a :class:`keras.models.Sequential`.
-
-.. autofunction:: shorttext.classifiers.embed.nnlib.frameworks.CNNWordEmbed
+The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are:
 
 The parameter `maxlen` defines the maximum length of the sentences. If the sentence has fewer than `maxlen` words,
 then the empty words will be filled with zero vectors.
 
->>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = fr.CNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size)
 
 Double ConvNet
 ^^^^^^^^^^^^^^
 
-This neural network is nothing more than two ConvNet layers.
-
-.. autofunction:: shorttext.classifiers.embed.nnlib.frameworks.DoubleCNNWordEmbed
+This neural network is nothing more than two ConvNet layers. The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`. Its input parameters are:
 
 The parameter `maxlen` defines the maximum length of the sentences. If the sentence has fewer than `maxlen` words,
 then the empty words will be filled with zero vectors.
 
->>> kmodel = fr.DoubleCNNWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = fr.DoubleCNNWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size)
 
 C-LSTM (Convolutional Long Short-Term Memory)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -127,14 +138,12 @@ and then followed by LSTM (long short-term memory), a type of recurrent neural n
 
 .. image:: images/nnlib_clstm.png
 
-The function in the frameworks returns a :class:`keras.models.Sequential`.
-
-.. autofunction:: shorttext.classifiers.embed.nnlib.frameworks.CLSTMWordEmbed
+The function in the frameworks returns a :class:`keras.models.Sequential` or :class:`keras.models.Model`.
 
 The parameter `maxlen` defines the maximum length of the sentences. If the sentence has fewer than `maxlen` words,
 then the empty words will be filled with zero vectors.
 
->>> kmodel = fr.CLSTMWordEmbed(len(trainclassdict.keys()))
+>>> kmodel = fr.CLSTMWordEmbed(len(trainclassdict.keys()), vecsize=wvmodel.vector_size)
 
 User-Defined Neural Network
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -149,6 +158,16 @@ of the embedded vectors.
 
 The output is a one-dimensional array, of size equal to the number of classes provided by the training data.
 The order of the class labels is assumed to be the same as the order of the given training data
 (stored as a Python dictionary).
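+
+As an illustration only (this network is not part of the package), a minimal user-defined
+network satisfying this contract might look like the following sketch, where `maxlen`,
+`vecsize`, and the training dictionary are assumed to match the setup above:
+
+>>> from tensorflow.keras.models import Sequential
+>>> from tensorflow.keras.layers import Input, Flatten, Dense
+>>> def my_dense_network(nb_labels, maxlen=15, vecsize=300):
+...     model = Sequential()
+...     model.add(Input(shape=(maxlen, vecsize)))           # the maxlen-by-vecsize embedded matrix in
+...     model.add(Flatten())
+...     model.add(Dense(nb_labels, activation='softmax'))   # one score per class label out
+...     model.compile(loss='categorical_crossentropy', optimizer='adam')
+...     return model
+>>> kmodel = my_dense_network(len(trainclassdict.keys()))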
+Putting Word2Vec Model As an Input Keras Layer (Deprecated)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This functionality has been removed since release 0.5.11, for the following reasons:
+
+* a change in `keras` broke this functionality;
+* the layer is memory-consuming;
+* only Word2Vec is supported; and
+* the results are incorrect.
+
 Reference
 ---------
@@ -158,6 +177,9 @@ Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, "A C-LSTM Neural Network
 "CS231n Convolutional Neural Networks for Visual Recognition," Stanford Online Course. [`link
 `_]
 
+Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom, "A Convolutional Neural Network for Modelling Sentences," *Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics*, pp. 655-665 (2014). [`arXiv
+`_]
+
 Tal Perry, "Convolutional Methods for Text," *Medium* (2017). [`Medium
 `_]
diff --git a/docs/tutorial_spell.rst b/docs/tutorial_spell.rst
new file mode 100644
index 00000000..e6baefd7
--- /dev/null
+++ b/docs/tutorial_spell.rst
@@ -0,0 +1,38 @@
+Spell Correctors
+================
+
+This package supports the use of spell correctors, because typos are very common in relatively short text data.
+
+There are two types of spell correctors provided: one described by Peter Norvig (using an n-gram Bayesian method),
+and another by Keisuke Sakaguchi and his colleagues (using a semi-character-level recurrent neural network).
+
+>>> import shorttext
+
+We use Norvig's training corpus as an example. To load it,
+
+>>> from urllib.request import urlopen
+>>> text = urlopen('https://norvig.com/big.txt').read()
+
+The developer just has to instantiate the spell corrector, and then train it with a corpus to get a correction model.
+Then one can use it for correction.
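+
+Note that `urlopen(...).read()` returns a byte string in Python 3, so the corpus may need to be
+decoded into a plain string before training; a hedged variant of the loading step:
+
+>>> text = urlopen('https://norvig.com/big.txt').read().decode('utf-8')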
+
+Norvig
+------
+
+Peter Norvig described a spell corrector based on a Bayesian approach and edit distance. You can refer to his blog for
+more information.
+
+>>> norvig_corrector = shorttext.spell.NorvigSpellCorrector()
+>>> norvig_corrector.train(text)
+>>> norvig_corrector.correct('oranhe')  # gives "orange"
+
+.. automodule:: shorttext.spell.norvig
+   :members:
+
+
+
+Reference
+---------
+
+Peter Norvig, "How to write a spell corrector." (2016) [`Norvig
+`_]
diff --git a/docs/tutorial_stacking.rst b/docs/tutorial_stacking.rst
index 7fd67185..037ccd0c 100644
--- a/docs/tutorial_stacking.rst
+++ b/docs/tutorial_stacking.rst
@@ -77,6 +77,11 @@ offered in this package. To load them, initialize it in the same way:
 
 >>> stacker2 = shorttext.stack.LogisticStackedGeneralization(intermediate_classifiers={'clstm': clstm_classifier, 'lda128': lda128_svm_classifier})
 >>> stacker2.load_compact_model('/path/to/logitmodel.bin')
 
+
+.. automodule:: shorttext.stack.stacking
+   :members:
+
+
 Reference
 ---------
diff --git a/docs/tutorial_sumvec.rst b/docs/tutorial_sumvec.rst
index de62a725..c5ffaf70 100644
--- a/docs/tutorial_sumvec.rst
+++ b/docs/tutorial_sumvec.rst
@@ -23,6 +23,318 @@ Import the package:
 
 To load the Word2Vec model,
 
 >>> from shorttext.utils import load_word2vec_model
 >>> wvmodel = load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz')
 
@@ -32,7 +344,7 @@ Then we load a set of data:
 
 Then initialize the classifier:
 
->>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel)
+>>> classifier = shorttext.classifiers.SumEmbeddedVecClassifier(wvmodel)  # for Google model, the vector size is 300 (default: 100)
 >>> classifier.train(nihtraindata)
 
 This classifier takes relatively little time to train compared with others
@@ -69,6 +381,10 @@ And with the same pre-trained Word2Vec model, this classifier can be loaded:
 
 >>> classifier2 = shorttext.classifiers.load_sumword2vec_classifier(wvmodel, '/path/to/sumvec_nihdata_model.bin')
 
+.. autoclass:: shorttext.classifiers.embed.sumvec.SumEmbedVecClassification.SumEmbeddedVecClassifier
+   :members:
+
+
 Appendix: Model I/O in Previous Versions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/tutorial_textpreprocessing.rst b/docs/tutorial_textpreprocessing.rst
index df294fa4..e414c56b 100644
--- a/docs/tutorial_textpreprocessing.rst
+++ b/docs/tutorial_textpreprocessing.rst
@@ -15,7 +15,121 @@ following steps:
 
 - removing special characters,
 - removing numerals,
 - converting all alphabets to lower cases,
 - removing stop words, and
-- stemming the words (using Porter stemmer).
+- stemming the words (using Snowball Porter stemmer).
 
 To do this, load the preprocessor generator:
 
@@ -25,6 +139,9 @@ Then define the preprocessor, a function, by just calling:
 
 >>> preprocessor1 = standard_text_preprocessor_1()
+
+.. automodule:: shorttext.utils.textpreprocessing
+   :members: standard_text_preprocessor_1
+
 It is a function that performs the preprocessing in the steps above:
 
 >>> preprocessor1('Maryland Blue Crab')  # output: 'maryland blue crab'
 
@@ -45,6 +162,90 @@ let's develop a preprocessor that 1) convert it to base form if it is a verb, or
 
 Load the function that generates the preprocessor function:
 
 >>> from shorttext.utils import text_preprocessor
 
 Initialize a WordNet lemmatizer using `nltk`:
 
@@ -54,9 +255,9 @@ Define the pipeline. Functions for each of the steps are:
 
->>> step1fcn = lambda s: ' '.join(map(lambda s1: lemmatizer.lemmatize(s1), s.split(' ')))
+>>> step1fcn = lambda s: ' '.join([lemmatizer.lemmatize(s1) for s1 in s.split(' ')])
 >>> step2fcn = lambda s: s.upper()
->>> step3fcn = lambda s: ' '.join(map(lambda s1: s1+'-'+str(len(s1)), s.split(' ')))
+>>> step3fcn = lambda s: ' '.join([s1+'-'+str(len(s1)) for s1 in s.split(' ')])
 
 Then the pipeline is:
 
@@ -72,11 +273,14 @@ Some examples are:
 
 >>> preprocessor2('Maryland blue crab in Annapolis')  # output: 'MARYLAND-8 BLUE-4 CRAB-4 IN-2 ANNAPOLIS-9'
 >>> preprocessor2('generative adversarial networks')  # output: 'GENERATIVE-10 ADVERSARIAL-11 NETWORK-7'
 
+.. automodule:: shorttext.utils.textpreprocessing
+   :members: text_preprocessor
+
 Tokenization
 ------------
 
 Users are free to choose any tokenizer they wish. In `shorttext`, the tokenizer is
-implemented with `spaCy`, and can be called:
+simply the space delimiter, and can be called:
 
 >>> shorttext.utils.tokenize('Maryland blue crab')  # output: ['Maryland', 'blue', 'crab']
diff --git a/docs/tutorial_topic.rst b/docs/tutorial_topic.rst
index 97c21def..4043420d 100644
--- a/docs/tutorial_topic.rst
+++ b/docs/tutorial_topic.rst
@@ -47,14 +47,17 @@ with the trained model. For example,
 
 >>> topicmodeler.retrieve_topicvec('stem cell research')
->>> topicmodeler.retrieve_topicvec('bioinformatics')
+>>> topicmodeler.retrieve_topicvec('informatics')
 
 By default, the vectors are normalized.
 
 Another way to retrieve the topic vector representation is as follows:
 
 >>> topicmodeler['stem cell research']
->>> topicmodeler['bioinformatics']
+>>> topicmodeler['informatics']
+
+If the dictionary does not have the processed tokens, it will return a numpy
+array with all values `nan`.
 
 In the training and the retrieval above, the same preprocessing process is applied. Users can
 provide their own preprocessor while initializing the topic modeler.
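+
+For instance, assuming the topic modeler's constructor accepts a `preprocessor` keyword
+(this exact signature is an assumption), plugging in the standard preprocessor would
+look like the following sketch:
+
+>>> from shorttext import standard_text_preprocessor_1
+>>> topicmodeler = shorttext.generators.LDAModeler(preprocessor=standard_text_preprocessor_1())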
@@ -71,29 +74,15 @@ While initialize the instance of the topic modeler, the user can also specify whether to weigh the terms using tf-idf (term frequency - inverse document frequency). The default is to weigh. To not weigh, initialize it as ->>> topicmodeler3 = shorttext.generators.GensimTopicModeler(toweight=False) - -Appendix: Model I/O in Previous Versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For previous versions of `shorttext`, the trained models are saved by calling: - ->>> topicmodeler.savemodel('/path/to/nihlda128') +>>> topicmodeler3 = shorttext.generators.GensimTopicModeler(toweigh=False) -However, we discourage users using this anymore, because the model I/O for various models -in gensim have been different. It produces errors. +.. automodule:: shorttext.generators.bow.GensimTopicModeling + :members: -All of them have to be present in order to be loaded. Note that the preprocessor is -not saved. To load the model, enter: - ->>> topicmodeler2 = shorttext.classifiers.load_gensimtopicmodel('/path/to/nihlda128', compact=False) AutoEncoder ----------- -Note: Previous version (<=0.2.1) of this autoencoder has a serious bug. Current version is -incompatible with the autoencoder of version <=0.2.1 . - Another way to find a new topic vector representation is to use the autoencoder, a neural network model which compresses a vector representation into another one of a shorter (or longer, rarely though) representation, by minimizing the difference between the input layer and the decoding layer. @@ -135,38 +124,11 @@ Like other topic models, while initialize the instance of the topic modeler, the whether to weigh the terms using tf-idf (term frequency - inverse document frequency). The default is to weigh. To not weigh, initialize it as: ->>> autoencoder3 = shorttext.generators.AutoencodingTopicModeler(toweight=False) - -Appendix: Unzipping Model I/O -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For previous versions of `shorttext`, the trained models are saved by calling: - ->>> autoencoder.savemodel('/path/to/sub_autoencoder8') - -The following files are produced for the autoencoder: - -:: - - /path/to/sub_autoencoder.json - /path/to/sub_autoencoder.gensimdict - /path/to/sub_autoencoder_encoder.json - /path/to/sub_autoencoder_encoder.h5 - /path/to/sub_autoencoder_classtopicvecs.pkl - -If specifying `save_complete_autoencoder=True`, then four more files are found: +>>> autoencoder3 = shorttext.generators.AutoencodingTopicModeler(toweigh=False) -:: - - /path/to/sub_autoencoder_decoder.json - /path/to/sub_autoencoder_decoder.h5 - /path/to/sub_autoencoder_autoencoder.json - /path/to/sub_autoencoder_autoencoder.h5 - -Users can load the same model later by entering: - ->>> autoencoder2 = shorttext.classifiers.load_autoencoder_topic('/path/to/sub_autoencoder8', compact=False) +.. automodule:: shorttext.generators.bow.AutoEncodingTopicModeling + :members: Abstract Latent Topic Modeling Class ------------------------------------ @@ -178,27 +140,11 @@ an abstract class virtually. If user wants to develop its own topic model that e this, he has to define the methods `train`, `retrieve_topic_vec`, `loadmodel`, and `savemodel`. -Appendix: Namespaces for Topic Modeler in Previous Versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -All generative topic modeling algorithms were placed under the package `shorttext.classifiers` for version <=0.3.4. 
-In current version (>= 0.3.5), however, all generative models will be moved to `shorttext.generators`,
-while any classifiers making use of these topic models are still kept under `shorttext.classifiers`.
-A list include:
-
-::
-
-    shorttext.classifiers.GensimTopicModeler -> shorttext.generators.GensimTopicModeler
-    shorttext.classifiers.LDAModeler -> shorttext.generators.LDAModeler
-    shorttext.classifiers.LSIModeler -> shorttext.generators.LSIModeler
-    shorttext.classifiers.RPModeler -> shorttext.generators.RPModeler
-    shorttext.classifiers.AutoencodingTopicModeler -> shorttext.generators.AutoencodingTopicModeler
-    shorttext.classifiers.load_gensimtopicmodel -> shorttext.generators.load_gensimtopicmodel
-    shorttext.classifiers.load_autoencoder_topic -> shorttext.generators.load_autoencoder_topicmodel
+.. automodule:: shorttext.generators.bow.LatentTopicModeling
+   :members:
 
-
-For backward compatibility, developers can still call the topic models as if there were no such changes,
-although they are advised to make this change.
+.. automodule:: shorttext.generators.bow.GensimTopicModeling
+   :members:
 
 Classification Using Cosine Similarity
 --------------------------------------
@@ -231,6 +177,10 @@ The same thing for autoencoder, but the classifier based on autoencoder can be loaded:
 
 >>> cos_classifier = shorttext.classifiers.load_autoencoder_cosineClassifier('/path/to/sub_autoencoder8.bin')
 
+.. automodule:: shorttext.classifiers.bow.topic.TopicVectorDistanceClassification
+   :members:
+
+
 Classification Using Scikit-Learn Classifiers
 ---------------------------------------------
@@ -239,7 +189,7 @@ algorithms. We can take any supervised learning algorithms in `scikit-learn` here.
 We use Gaussian naive Bayes as an example. For faster demonstration, use the subject keywords
 as the example dataset.
 
->>> subtopicmodeler = shorttext.generators.GensimTopicModeler()
+>>> subtopicmodeler = shorttext.generators.LDAModeler()
 >>> subtopicmodeler.train(subdict, 8)
 
 We first import the class:
@@ -271,6 +221,13 @@ will still do the work. However, to load the saved classifier with an autoencoder:
 
 >>> classifier2 = shorttext.classifiers.load_autoencoder_topic_sklearnclassifier('/path/to/filename.bin')
 
+Compact model files saved by `TopicVectorSkLearnClassifier` in `shorttext` >= 1.0.0 cannot be read
+by earlier versions of `shorttext`; the converse is not true, though: old compact model files can still be read in.
+
+.. automodule:: shorttext.classifiers.bow.topic.SkLearnClassification
+   :members:
+
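+Putting the pieces together, the workflow in this section runs roughly as in the sketch below,
+assuming the classifier wraps the topic modeler and a scikit-learn model as
+`TopicVectorSkLearnClassifier(topicmodeler, sklearn_model)`, and `subdict` is the
+subject-keyword training dictionary loaded earlier:
+
+>>> from sklearn.naive_bayes import GaussianNB
+>>> classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(subtopicmodeler, GaussianNB())
+>>> classifier.train(subdict)
+>>> classifier.score('linear algebra')  # a dict from class labels to scores
+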
 Notes about Text Preprocessing
 ------------------------------
diff --git a/docs/tutorial_wordembed.rst b/docs/tutorial_wordembed.rst
index 1b719c74..692b3d74 100644
--- a/docs/tutorial_wordembed.rst
+++ b/docs/tutorial_wordembed.rst
@@ -10,13 +10,84 @@ their page. To load the model, call:
 
 >>> import shorttext
 >>> wvmodel = shorttext.utils.load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz')
 
-It is a binary file, and the default is set to be `binary=True`. In fact, it is equivalent to calling,
-if you have `gensim` version before 1.0.0:
-
->>> import gensim
->>> wvmodel = gensim.models.Word2Vec.load_word2vec_format('/path/to/GoogleNews-vectors-negative300.bin.gz', binary=True)
-
-Or beyond version 1.0.0,
+It is a binary file, and the default is set to be `binary=True`.
+
+.. automodule:: shorttext.utils.wordembed
+   :members: load_word2vec_model
+
+It is equivalent to calling,
 
 >>> import gensim
 >>> wvmodel = gensim.models.KeyedVectors.load_word2vec_format('/path/to/GoogleNews-vectors-negative300.bin.gz', binary=True)
@@ -72,18 +143,69 @@ One can convert a text-format GloVe model into a text-format Word2Vec model. More details can be found
 in the documentation of `gensim`: `Converting GloVe to Word2Vec
 `_
 
+FastText
+--------
+
+FastText is a similar word-embedding model from Facebook. You can download pre-trained models here:
+
+`Pre-trained word vectors
+`_
+
+To load a pre-trained FastText model, run:
+
+>>> import shorttext
+>>> ftmodel = shorttext.utils.load_fasttext_model('/path/to/model.bin')
+
+And it is used exactly the same way as Word2Vec.
+
+.. automodule:: shorttext.utils.wordembed
+   :members: load_fasttext_model
+
+Poincaré Embeddings
+-------------------
+
+Poincaré embeddings are a newer kind of embedding that learns both semantic similarity and hierarchical structure. To load a
+pre-trained model, run:
+
+>>> import shorttext
+>>> pemodel = shorttext.utils.load_poincare_model('/path/to/model.txt')
+
+For preloaded word-embedding models, please refer to :doc:`tutorial_wordembed`.
+
+.. automodule:: shorttext.utils.wordembed
+   :members: load_poincare_model
+
+
+
+Other Functions
+---------------
+
+.. automodule:: shorttext.utils.wordembed
+   :members: shorttext_to_avgvec
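+
+For example, `shorttext_to_avgvec` returns an averaged embedded-vector representation of a
+short text; a quick usage sketch, assuming `wvmodel` is a loaded Word2Vec model:
+
+>>> shorttext.utils.shorttext_to_avgvec('Maryland blue crab', wvmodel)  # a vector of dimension wvmodel.vector_size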
+
+
 Links
 -----
 
 - Word2Vec_
 - GloVe_
+- FastText_
 
 Reference
 ---------
 
+Jayant Jain, "Implementing Poincaré Embeddings," RaRe Technologies (2017). [`RaRe
+`_]
+
 Jeffrey Pennington, Richard Socher, Christopher D. Manning, "GloVe: Global Vectors for Word Representation," *Empirical Methods in Natural Language Processing (EMNLP)*, pp. 1532-1543 (2014). [`PDF
 `_]
 
+Maximilian Nickel, Douwe Kiela, "Poincaré Embeddings for Learning Hierarchical Representations," arXiv:1705.08039 (2017). [`arXiv
+`_]
+
+Piotr Bojanowski, Edouard Grave, Armand Joulin, Tomas Mikolov, "Enriching Word Vectors with Subword Information," arXiv:1607.04606 (2016). [`arXiv
+`_]
+
 Tomas Mikolov, Kai Chen, Greg Corrado, Jeffrey Dean, "Efficient Estimation of Word Representations in Vector Space," *ICLR* 2013 (2013). [`arXiv
 `_]
 
@@ -102,4 +224,5 @@ Radim Řehůřek, "Making sense of word2vec," RaRe Technologies (2014). [`RaRe
 `_]
 
 Home: :doc:`index`
 
 .. _Word2Vec: https://code.google.com/archive/p/word2vec/
-.. _GloVe: http://nlp.stanford.edu/projects/glove/
\ No newline at end of file
+.. _GloVe: http://nlp.stanford.edu/projects/glove/
+.. _FastText: https://github.com/facebookresearch/fastText
diff --git a/examples/sakaguchi_spell/binarize.py b/examples/sakaguchi_spell/binarize.py
new file mode 100644
index 00000000..835b8442
--- /dev/null
+++ b/examples/sakaguchi_spell/binarize.py
@@ -0,0 +1,164 @@
+
+import re
+import string
+from functools import reduce
+
+import numpy as np
+from shorttext.generators.charbase.char2vec import initialize_SentenceToCharVecEncoder
+from shorttext.utils import OperationNotDefinedException
+
+
+default_alph = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#"
+# NB. # is <eos>, _ is <unk>, @ is number
+default_specialsignals = {'eos': '#', 'unk': '_', 'number': '@'}
+default_signaldenotions = {'<eos>': 'eos', '<unk>': 'unk'}
+
+
+class SpellingToConcatCharVecEncoder:
+    # encode a spelling as the sum of its one-hot character vectors (a bag of characters)
+    def __init__(self, alph):
+        self.charevec_encoder = initialize_SentenceToCharVecEncoder(alph)
+
+    def encode_spelling(self, spelling):
+        spmat = self.charevec_encoder.encode_sentence(spelling, len(spelling))
+        return spmat.sum(axis=0)
+
+    def __len__(self):
+        return len(self.charevec_encoder)
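+
+# Usage sketch (for illustration; not part of the original file): summing the one-hot
+# character vectors of a spelling yields a bag-of-characters count vector, e.g.
+#
+#     enc = SpellingToConcatCharVecEncoder(default_alph)
+#     enc.encode_spelling('cat')   # a 1 x len(default_alph) matrix of character counts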
+
+
+def hasnum(word):
+    return len(re.findall('\\d', word)) > 0
+
+
+class SCRNNBinarizer:
+    """ A class used by Sakaguchi's spell corrector to convert text into numerical vectors.
+
+    No documentation for this class.
+
+    """
+    def __init__(self, alpha, signalchar_dict):
+        self.signalchar_dict = signalchar_dict
+        self.concatchar_encoder = SpellingToConcatCharVecEncoder(alpha)
+        self.char_dict = self.concatchar_encoder.charevec_encoder.dictionary
+
+    def noise_char(self, word, opt, unchanged=False):
+        # introduce a random typo (deletion, insertion, or replacement) into the word,
+        # and return its binarized vector together with the distorted word
+        bin_all = np.zeros((len(self.concatchar_encoder), 1))
+        w = word
+        if word in default_signaldenotions.keys():
+            bin_all[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+        elif hasnum(word):
+            bin_all[self.char_dict.token2id[default_specialsignals['number']]] += 1
+        elif unchanged:
+            bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+        elif opt == 'DELETE':
+            if len(word) > 1:
+                idx = np.random.randint(0, len(word))
+                w = word[:idx] + word[(idx+1):]
+            else:
+                w = word
+            bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+        elif opt == 'INSERT':
+            ins_idx = np.random.randint(0, len(word)+1)
+            ins_char = np.random.choice([c for c in string.ascii_lowercase])
+            w = word[:ins_idx] + ins_char + word[ins_idx:]
+            bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+        elif opt == 'REPLACE':
+            rep_idx = np.random.randint(0, len(word))
+            rep_char = np.random.choice([c for c in string.ascii_lowercase])
+            w = word[:rep_idx] + rep_char + word[(rep_idx+1):]
+            bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+        else:
+            raise OperationNotDefinedException('NOISE-'+opt)
+        return np.array([ np.repeat(np.array([bin_all]), 3, axis=0).reshape((1, len(self.concatchar_encoder)*3))[0] ]).transpose(), w
+
+    def jumble_char(self, word, opt, unchanged=False):
+        # dispatch to the requested jumbling scheme
+        if opt == 'WHOLE':
+            return self.jumble_char_whole(word, unchanged=unchanged)
+        elif opt == 'BEG':
+            return self.jumble_char_beg(word, unchanged=unchanged)
+        elif opt == 'END':
+            return self.jumble_char_end(word, unchanged=unchanged)
+        elif opt == 'INT':
+            return self.jumble_char_int(word, unchanged=unchanged)
+        else:
+            raise OperationNotDefinedException('JUMBLE-'+opt)
+
+    def jumble_char_whole(self, word, unchanged=False):
+        # shuffle all characters of the word; special signals and numbers are binarized as-is
+        bin_all = np.zeros((len(self.concatchar_encoder), 1))
+        w = word
+        if word in default_signaldenotions.keys():
+            bin_all[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+        elif hasnum(word):
+            bin_all[self.char_dict.token2id[default_specialsignals['number']]] += 1
+        else:
+            w = ''.join(np.random.choice([c for c in word], len(word), replace=False)) if not unchanged else word
+            bin_all = self.concatchar_encoder.encode_spelling(w).transpose()
+        bin_filler = np.zeros((len(self.concatchar_encoder)*2, 1))
+        return np.concatenate((bin_all, bin_filler), axis=0), w
+
+    def jumble_char_beg(self, word, unchanged=False):
+        # shuffle all characters except the last one
+        bin_initial = np.zeros((len(self.concatchar_encoder), 1))
+        bin_end = np.zeros((len(self.concatchar_encoder), 1))
+        bin_filler = np.zeros((len(self.concatchar_encoder), 1))
+        w = word
+        if word in default_signaldenotions.keys():
+            bin_initial[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+            bin_end[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+        elif hasnum(word):
+            bin_initial[self.char_dict.token2id[default_specialsignals['number']]] += 1
+            bin_end[self.char_dict.token2id[default_specialsignals['number']]] += 1
+        else:
+            w_init = ''.join(np.random.choice([c for c in word[:-1]], len(word)-1)) if not unchanged and len(w) > 3 else word[:-1]
+            w = w_init + word[-1]
+            if len(w_init) > 0:
+                bin_initial = self.concatchar_encoder.encode_spelling(w_init).transpose()
+            bin_end = self.concatchar_encoder.encode_spelling(word[-1]).transpose()
+        return reduce(lambda a, b: np.concatenate((a, b), axis=0), [bin_initial, bin_end, bin_filler]), w
+
+    def jumble_char_end(self, word, unchanged=False):
+        # shuffle all characters except the first one
+        bin_initial = np.zeros((len(self.concatchar_encoder), 1))
+        bin_end = np.zeros((len(self.concatchar_encoder), 1))
+        bin_filler = np.zeros((len(self.concatchar_encoder), 1))
+        w = word
+        if word in default_signaldenotions.keys():
+            bin_initial[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+            bin_end[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+        elif hasnum(word):
+            bin_initial[self.char_dict.token2id[default_specialsignals['number']]] += 1
+            bin_end[self.char_dict.token2id[default_specialsignals['number']]] += 1
+        else:
+            w_end = ''.join(np.random.choice([c for c in word[1:]], len(word)-1)) if not unchanged and len(w) > 3 else word[1:]
+            w = word[0] + w_end
+            bin_initial = self.concatchar_encoder.encode_spelling(word[0]).transpose()
+            if len(w_end) > 0:
+                bin_end = self.concatchar_encoder.encode_spelling(w_end).transpose()
+        return reduce(lambda a, b: np.concatenate((a, b), axis=0), [bin_initial, bin_end, bin_filler]), w
+
+    def jumble_char_int(self, word, unchanged=False):
+        # shuffle the internal characters, keeping the first and last characters fixed
+        bin_initial = np.zeros((len(self.concatchar_encoder), 1))
+        bin_middle = np.zeros((len(self.concatchar_encoder), 1))
+        bin_end = np.zeros((len(self.concatchar_encoder), 1))
+        w = word
+        if word in default_signaldenotions.keys():
+            bin_initial[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+            bin_middle[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+            bin_end[self.char_dict.token2id[default_specialsignals[default_signaldenotions[word]]]] += 1
+        elif hasnum(word):
+            bin_initial[self.char_dict.token2id[default_specialsignals['number']]] += 1
+            bin_middle[self.char_dict.token2id[default_specialsignals['number']]] += 1
+            bin_end[self.char_dict.token2id[default_specialsignals['number']]] += 1
+        else:
+            w_mid = ''.join(np.random.choice([c for c in word[1:-1]], len(word)-2)) if not unchanged and len(w) > 3 else word[1:-1]
+            w = word[0] + w_mid + word[-1]
+            bin_initial = self.concatchar_encoder.encode_spelling(word[0]).transpose()
+            if len(w_mid) > 0:
+                bin_middle = self.concatchar_encoder.encode_spelling(w_mid).transpose()
+            bin_end = self.concatchar_encoder.encode_spelling(word[-1]).transpose()
+        return reduce(lambda a, b: np.append(a, b, axis=0), [bin_initial, bin_middle, bin_end]), w
+
+    def change_nothing(self, word, operation):
+        # binarize the word without distorting it; the operation string only selects the layout
+        if operation.upper().startswith('NOISE'):
+            return self.noise_char(word, operation[6:], unchanged=True)
+        else:
+            return self.jumble_char(word, operation[7:], unchanged=True)
+
diff --git a/examples/sakaguchi_spell/sakaguchi.py b/examples/sakaguchi_spell/sakaguchi.py
new file mode 100644
index 00000000..3d29e6c9
--- /dev/null
+++ b/examples/sakaguchi_spell/sakaguchi.py
@@ -0,0 +1,202 @@
+
+# Reference: https://github.com/keisks/robsut-wrod-reocginiton
+# Article: http://cs.jhu.edu/~kevinduh/papers/sakaguchi17robsut.pdf
+
+import json
+
+import numpy as np
+from gensim.corpora import Dictionary
+from sklearn.preprocessing import OneHotEncoder
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, TimeDistributed
+
+import shorttext.utils.kerasmodel_io as kerasio
+from shorttext.spell import SpellCorrector
+from .binarize import default_alph, default_specialsignals
+from shorttext.utils import classification_exceptions as ce
+from .binarize import SpellingToConcatCharVecEncoder, SCRNNBinarizer
+from shorttext.utils import CompactIOMachine
+
+
+nospace_tokenize = lambda sentence: [t.strip() for t in sentence.split() if len(t.strip()) > 0]
+
+
+class SCRNNSpellCorrector(SpellCorrector, CompactIOMachine):
+    """ scRNN (semi-character-level recurrent neural network) Spell Corrector.
+
+    Reference:
+    Keisuke Sakaguchi, Kevin Duh, Matt Post, Benjamin Van Durme, "Robsut Wrod Reocginiton via semi-Character Recurrent Neural Network," arXiv:1608.02214 (2016). [`arXiv
+    `_]
+
+    """
+    def __init__(self, operation,
+                 alph=default_alph,
+                 specialsignals=default_specialsignals,
+                 concatcharvec_encoder=None,
+                 batchsize=1,
+                 nb_hiddenunits=650):
+        """ Instantiate the scRNN spell corrector.
+
+        :param operation: type of distortion of words in training (options: "NOISE-INSERT", "NOISE-DELETE", "NOISE-REPLACE", "JUMBLE-WHOLE", "JUMBLE-BEG", "JUMBLE-END", and "JUMBLE-INT")
+        :param alph: default string of characters (Default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#")
+        :param specialsignals: dictionary of special signals (Default built-in)
+        :param concatcharvec_encoder: one-hot encoder for characters, initialized if None. (Default: None)
+        :param batchsize: batch size. (Default: 1)
+        :param nb_hiddenunits: number of hidden units. (Default: 650)
+        :type operation: str
+        :type alph: str
+        :type specialsignals: dict
+        :type concatcharvec_encoder: shorttext.spell.binarize.SpellingToConcatCharVecEncoder
+        :type batchsize: int
+        :type nb_hiddenunits: int
+        """
+        CompactIOMachine.__init__(self, {'classifier': 'scrnn_spell'}, 'scrnn_spell', ['_config.json', '_vocabs.gensimdict', '.weights.h5', '.json'])
+        self.operation = operation
+        self.alph = alph
+        self.specialsignals = specialsignals
+        self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
+        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph) if concatcharvec_encoder is None else concatcharvec_encoder
+        self.onehotencoder = OneHotEncoder()
+        self.trained = False
+        self.batchsize = batchsize
+        self.nb_hiddenunits = nb_hiddenunits
+
+    def preprocess_text_train(self, text):
+        """ A generator that outputs numpy vectors of the text for training.
+
+        :param text: text
+        :return: generator that outputs the numpy vectors for training
+        :type text: str
+        :rtype: generator
+        """
+        for token in nospace_tokenize(text):
+            if self.operation.upper().startswith('NOISE'):
+                xvec, _ = self.binarizer.noise_char(token, self.operation.upper()[6:])
+            elif self.operation.upper().startswith('JUMBLE'):
+                xvec, _ = self.binarizer.jumble_char(token, self.operation.upper()[7:])
+            # out-of-vocabulary tokens fall back to the special '<unk>' signal
+            normtoken = token if token in self.dictionary.token2id else '<unk>'
+            yvec = self.onehotencoder.transform([[self.dictionary.token2id[normtoken]]]).toarray().reshape((len(self.dictionary), 1))
+            yield xvec, yvec
+
+    def preprocess_text_correct(self, text):
+        """ A generator that outputs numpy vectors of the text for correction.
+
+        ModelNotTrainedException is raised if the model has not been trained.
+
+        :param text: text
+        :return: generator that outputs the numpy vectors for correction
+        :type text: str
+        :rtype: generator
+        :raise: ModelNotTrainedException
+        """
+        if not self.trained:
+            raise ce.ModelNotTrainedException()
+        for token in nospace_tokenize(text):
+            xvec, _ = self.binarizer.change_nothing(token, self.operation)
+            yield xvec
+
+    def train(self, text, nb_epoch=100, dropout_rate=0.01, optimizer='rmsprop'):
+        """ Train the scRNN model.
+
+        :param text: training corpus
+        :param nb_epoch: number of epochs (Default: 100)
+        :param dropout_rate: dropout rate (Default: 0.01)
+        :param optimizer: optimizer (Default: "rmsprop")
+        :type text: str
+        :type nb_epoch: int
+        :type dropout_rate: float
+        :type optimizer: str
+        """
+        self.dictionary = Dictionary([nospace_tokenize(text), default_specialsignals.values()])
+        self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
+        xylist = [(xvec.transpose(), yvec.transpose()) for xvec, yvec in self.preprocess_text_train(text)]
+        xtrain = np.array([item[0] for item in xylist])
+        ytrain = np.array([item[1] for item in xylist])
+
+        # neural network here
+        model = Sequential()
+        model.add(LSTM(self.nb_hiddenunits, return_sequences=True))
+        model.add(Dropout(dropout_rate))
+        model.add(TimeDistributed(Dense(len(self.dictionary))))
+        model.add(Activation('softmax'))
+
+        # compile... more arguments
+        model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+
+        # training
+        model.fit(xtrain, ytrain, epochs=nb_epoch)
+
+        self.model = model
+        self.trained = True
+
+    def correct(self, word):
+        """ Recommend a spell correction for the given word.
+
+        :param word: a given word
+        :return: recommended correction
+        :type word: str
+        :rtype: str
+        :raise: ModelNotTrainedException
+        """
+        if not self.trained:
+            raise ce.ModelNotTrainedException()
+
+        xmat = np.array([xvec.transpose() for xvec in self.preprocess_text_correct(word)])
+        yvec = self.model.predict(xmat)
+
+        maxy = yvec.argmax(axis=-1)
+        return ' '.join([self.dictionary[y] for y in maxy[0]])
+
+    def loadmodel(self, prefix):
+        """ Load the model.
+
+        :param prefix: prefix of the model path
+        :return: None
+        :type prefix: str
+        """
+        self.dictionary = Dictionary.load(prefix+'_vocabs.gensimdict')
+        parameters = json.load(open(prefix+'_config.json', 'r'))
+        self.operation = parameters['operation']
+        self.alph = parameters['alph']
+        self.specialsignals = parameters['special_signals']
+        self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
+        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(self.alph)
+        self.batchsize = parameters['batchsize']
+        self.nb_hiddenunits = parameters['nb_hiddenunits']
+        self.onehotencoder = OneHotEncoder()
+        self.onehotencoder.fit(np.arange(len(self.dictionary)).reshape((len(self.dictionary), 1)))
+        self.model = kerasio.load_model(prefix)
+        self.trained = True
+
+    def savemodel(self, prefix):
+        """ Save the model.
+
+        :param prefix: prefix of the model path
+        :return: None
+        :type prefix: str
+        """
+        if not self.trained:
+            raise ce.ModelNotTrainedException()
+        kerasio.save_model(prefix, self.model)
+        self.dictionary.save(prefix+'_vocabs.gensimdict')
+        parameters = {'alph': self.alph, 'special_signals': self.specialsignals, 'operation': self.operation,
+                      'batchsize': self.batchsize, 'nb_hiddenunits': self.nb_hiddenunits}
+        json.dump(parameters, open(prefix+'_config.json', 'w'))
+
+
+def loadSCRNNSpellCorrector(filepath, compact=True):
+    """ Load a pre-trained scRNN spell corrector instance.
+
+    :param filepath: path of the model file if compact==True; prefix of the model path if compact==False
+    :param compact: whether the model file is compact (Default: True)
+    :return: an instance of the scRNN spell corrector
+    :type filepath: str
+    :type compact: bool
+    :rtype: SCRNNSpellCorrector
+    """
+    corrector = SCRNNSpellCorrector('JUMBLE-WHOLE')
+    if compact:
+        corrector.load_compact_model(filepath)
+    else:
+        corrector.loadmodel(filepath)
+    return corrector
\ No newline at end of file
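For reviewers trying out the restored example, here is a minimal sketch of the intended round trip. The corpus string, epoch count, and file prefix are illustrative assumptions, and the import path assumes the examples directory is treated as a package (the module's relative import of .binarize requires package context):

    from examples.sakaguchi_spell.sakaguchi import SCRNNSpellCorrector

    # Train on a toy corpus with whole-word jumbling, then save and query.
    corrector = SCRNNSpellCorrector('JUMBLE-WHOLE')
    corrector.train('natural language processing on short texts', nb_epoch=10)
    corrector.savemodel('/tmp/scrnn_demo')   # writes _config.json, _vocabs.gensimdict, and the weights files
    print(corrector.correct('langudge'))     # recommends the closest token seen in training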
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..46ee1e72
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,87 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "shorttext"
+version = "4.0.1"
+authors = [
+    {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"}
+]
+description = "Short Text Mining"
+readme = {file = "README.md", content-type = "text/markdown"}
+license = {text = "MIT"}
+keywords = ["shorttext", "natural language processing", "text mining"]
+requires-python = ">=3.11"
+classifiers = [
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Text Processing :: Linguistic",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Natural Language :: English",
+    "License :: OSI Approved :: MIT License",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Information Technology",
+    "Intended Audience :: Science/Research"
+]
+dependencies = [
+    "numpy>=1.23.3",
+    "scipy>=1.12.0",
+    "joblib>=1.3.0",
+    "scikit-learn>=1.2.0",
+    "tensorflow>=2.13.0",
+    "keras>=3.0.0",
+    "gensim>=4.0.0",
+    "pandas>=1.2.0",
+    "snowballstemmer>=3.0.0",
+    "numba>=0.57.0",
+    "deprecation>=2.0.0",
+    "npdict>=0.0.10",
+    "sparse>=0.10.0",
+    "orjson>=3.0.0",
+    "loguru>=0.6.0"
+]
+
+[project.urls]
+Repository = "https://github.com/stephenhky/PyShortTextCategorization"
+Issues = "https://github.com/stephenhky/PyShortTextCategorization/issues"
+Documentation = "https://shorttext.readthedocs.io"
+
+[tool.setuptools]
+packages = [
+    "shorttext",
+    "shorttext.cli",
+    "shorttext.utils",
+    "shorttext.classifiers",
+    "shorttext.classifiers.embed",
+    "shorttext.classifiers.embed.nnlib",
+    "shorttext.classifiers.embed.sumvec",
+    "shorttext.classifiers.bow",
+    "shorttext.classifiers.bow.topic",
+    "shorttext.classifiers.bow.maxent",
+    "shorttext.data",
+    "shorttext.stack",
+    "shorttext.generators",
+    "shorttext.generators.bow",
+    "shorttext.generators.charbase",
+    "shorttext.generators.seq2seq",
+    "shorttext.metrics",
+    "shorttext.metrics.dynprog",
+    "shorttext.metrics.wasserstein",
+    "shorttext.metrics.embedfuzzy",
+    "shorttext.schemas",
+    "shorttext.spell"
+]
+zip-safe = false
+package-dir = {"" = "src"}

+[project.scripts]
+ShortTextCategorizerConsole = "shorttext.cli.categorization:main"
+ShortTextWordEmbedSimilarity = "shorttext.cli.wordembedsim:main"
+
+[project.optional-dependencies]
+test = ["pytest"]
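A quick sanity check of the new packaging metadata (the commands are illustrative and assume a local checkout): `pip install ".[test]"` installs the package together with pytest through the optional test extra that CI uses, `pytest` then runs the suite, and after installation the two [project.scripts] entry points are available on the PATH as `ShortTextCategorizerConsole` and `ShortTextWordEmbedSimilarity`.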
diff --git a/readthedocs/Makefile b/readthedocs/Makefile
deleted file mode 100644
index 109df405..00000000
--- a/readthedocs/Makefile
+++ /dev/null
@@ -1,192 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  applehelp  to make an Apple Help Book"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  xml        to make Docutils-native XML files"
-	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-	@echo "  coverage   to run coverage check of the documentation (if enabled)"
-
-clean:
-	rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
- -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/shorttext.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/shorttext.qhc" - -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/shorttext" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/shorttext" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." 
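Reviewer note on the deletion in progress here: every target in this Makefile is a thin wrapper around sphinx-build, so removing the file loses no capability. Assuming the same readthedocs/source layout, `sphinx-build -b html source build/html` reproduces `make html`, and `sphinx-build -b linkcheck source build/linkcheck` reproduces `make linkcheck`.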
- -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/readthedocs/source/conf.py b/readthedocs/source/conf.py deleted file mode 100644 index 001d7f0c..00000000 --- a/readthedocs/source/conf.py +++ /dev/null @@ -1,287 +0,0 @@ -# -*- coding: utf-8 -*- -# -# shorttext documentation build configuration file, created by -# sphinx-quickstart on Sun Dec 11 16:15:57 2016. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os -import shlex - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.mathjax', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'shorttext' -copyright = u'2017, Kwan-Yuet Ho' -author = u'Kwan-Yuet Ho' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.3' -# The full version, including alpha/beta/rc tags. -release = '0.3.8' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = [] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. 
-#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'alabaster' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. 
-#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' -#html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'shorttextdoc' - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'shorttext.tex', u'shorttext Documentation', - u'Kwan-Yuet Ho', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'shorttext', u'shorttext Documentation', - [author], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'shorttext', u'shorttext Documentation', - author, 'shorttext', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False diff --git a/readthedocs/source/doclink.rst b/readthedocs/source/doclink.rst deleted file mode 100644 index 273c08cc..00000000 --- a/readthedocs/source/doclink.rst +++ /dev/null @@ -1,6 +0,0 @@ -Documentation -============= - -Go to: PythonHosted_ - -.. 
_PythonHosted: http://pythonhosted.org/shorttext/ \ No newline at end of file diff --git a/readthedocs/source/index.rst b/readthedocs/source/index.rst deleted file mode 100644 index 2e53fe08..00000000 --- a/readthedocs/source/index.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. shorttext documentation master file, created by - sphinx-quickstart on Sun Dec 11 16:15:57 2016. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to shorttext's documentation! -===================================== - -This repository is a collection of algorithms for multi-class classification to short texts using Python. -Modules are backward compatible unless otherwise specified. Feel free to give suggestions or report -issues through the Github_ page. - -Contents: - -.. toctree:: - :maxdepth: 2 - - install - doclink - -Links: - -- Github_ : repository of the package -- PythonHosted_ : documentation and tutorial of the package -- PyPI_ : PyPI - - -.. _Github: https://github.com/stephenhky/PyShortTextCategorization -.. _PythonHosted: http://pythonhosted.org/shorttext/ -.. _PyPI: https://pypi.python.org/pypi/shorttext - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/readthedocs/source/install.rst b/readthedocs/source/install.rst deleted file mode 100644 index 782522c9..00000000 --- a/readthedocs/source/install.rst +++ /dev/null @@ -1,46 +0,0 @@ -Installation Guide -================== - -To install the package in Linux or OS X, enter the following in the console: - -:: - - pip install -U shorttext - -It is very possible that you have to do it as root, that you have to add ``sudo`` in -front of the command. - -However, the repository on Python Package Index is not always the most updated. To get -the most updated (not official) version, you can install from Github_: - -:: - - pip install -U git+https://github.com/stephenhky/PyShortTextCategorization@master - -By adding ``-U`` in the command, it automatically installs the required packages. If not, -you have to install these packages on your own. - -.. _Github: https://github.com/stephenhky/PyShortTextCategorization - -Required Packages ------------------ - -- Numpy_ (Numerical Python) -- SciPy_ (Scientific Python) -- Scikit-Learn_ (Machine Learning in Python) -- Theano_ (Symbolic Computing for Deep Learning) -- keras_ (Deep Learning Library for Theano and Tensorflow) -- gensim_ (Topic Modeling for Humans) -- Pandas_ (Python Data Analysis Library) -- spaCy_ (Industrial Strenglth Natural Language Processing in Python) -- stemming_ (stemming in Python) - -.. _Numpy: http://www.numpy.org/ -.. _SciPy: https://www.scipy.org/ -.. _Scikit-Learn: http://scikit-learn.org/stable/ -.. _Theano: http://deeplearning.net/software/theano/ -.. _keras: https://keras.io/ -.. _gensim: https://radimrehurek.com/gensim/ -.. _Pandas: http://pandas.pydata.org/ -.. _spaCy: https://spacy.io/ -.. 
_stemming: https://pypi.python.org/pypi/stemming/ \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 356e5b07..00000000 --- a/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -from setuptools import setup - -def readme(): - with open('README.md') as f: - return f.read() - -setup(name='shorttext', - version="0.4.0", - description="Short Text Categorization", - long_description="Supervised learning algorithms for short text categorization using embedded word vectors such as Word2Vec, or immediate feature vectors using topic models", - classifiers=[ - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Natural Language :: English", - "Topic :: Scientific/Engineering :: Mathematics", - "Programming Language :: Python :: 2.7", - "License :: OSI Approved :: MIT License", - ], - keywords="short text natural language processing text mining", - url="https://github.com/stephenhky/PyShortTextCategorization", - author="Kwan-Yuet Ho", - author_email="stephenhky@yahoo.com.hk", - license='MIT', - packages=['shorttext', - 'shorttext.utils', - 'shorttext.classifiers', - 'shorttext.classifiers.embed', - 'shorttext.classifiers.embed.nnlib', - 'shorttext.classifiers.embed.sumvec', - 'shorttext.classifiers.bow', - 'shorttext.classifiers.bow.topic', - 'shorttext.classifiers.bow.maxent', - 'shorttext.data', - 'shorttext.stack', - 'shorttext.generators', - 'shorttext.generators.bow'], - package_dir={'shorttext': 'shorttext'}, - package_data={'shorttext': ['data/*.csv', 'utils/*.pkl']}, - setup_requires=['numpy'], - install_requires=[ - 'numpy', 'scipy', 'scikit-learn', 'keras>=2.0.0', 'gensim>=2.2.0', 'pandas', 'spacy', 'stemming', - ], - scripts=['bin/ShortTextCategorizerConsole', - 'bin/ShortTextWord2VecSimilarity', - 'bin/switch_kerasbackend'], - # include_package_data=False, - zip_safe=False) diff --git a/shorttext/__init__.py b/shorttext/__init__.py deleted file mode 100644 index 160306cc..00000000 --- a/shorttext/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import os -import sys - -thisdir, _ = os.path.split(__file__) -sys.path.append(thisdir) - -from . import utils -from . import data -from . import classifiers -from . import generators -from . import stack -from .smartload import smartload_compact_model \ No newline at end of file diff --git a/shorttext/classifiers/bow/maxent/MaxEntClassification.py b/shorttext/classifiers/bow/maxent/MaxEntClassification.py deleted file mode 100644 index dcbab243..00000000 --- a/shorttext/classifiers/bow/maxent/MaxEntClassification.py +++ /dev/null @@ -1,265 +0,0 @@ - -import pickle - -from scipy.sparse import dok_matrix -from gensim.corpora import Dictionary -from keras.models import Sequential -from keras.layers import Dense -from keras.regularizers import l2 - -import shorttext.utils.kerasmodel_io as kerasio -from shorttext.utils import tokenize -from shorttext.utils import gensim_corpora as gc -from shorttext.utils import classification_exceptions as e -import shorttext.utils.compactmodel_io as cio -from shorttext.utils import deprecated - - -def logistic_framework(nb_features, nb_outputs, l2reg=0.01, bias_l2reg=0.01, optimizer='adam'): - """ Construct the neural network of maximum entropy classifier. - - Given the numbers of features and the output labels, return a keras neural network - for implementing maximum entropy (multinomial) classifier. 
- - :param nb_features: number of features - :param nb_outputs: number of output labels - :param l2reg: L2 regularization coefficient (Default: 0.01) - :param bias_l2reg: L2 regularization coefficient for bias (Default: 0.01) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) - :return: keras sequential model for maximum entropy classifier - :type nb_features: int - :type nb_outputs: int - :type l2reg: float - :type bias_l2reg: float - :type optimizer: str - :rtype: keras.model.Sequential - """ - kmodel = Sequential() - kmodel.add(Dense(units=nb_outputs, - activation='softmax', - input_shape=(nb_features,), - kernel_regularizer=l2(l2reg), - bias_regularizer=l2(bias_l2reg)) - ) - kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer) - return kmodel - - -@cio.compactio({'classifier': 'maxent'}, 'maxent', ['_classlabels.txt', '.json', '.h5', '_labelidx.pkl', '_dictionary.dict']) -class MaxEntClassifier: - """ - This is a classifier that implements the principle of maximum entropy. - - Reference: - * Adam L. Berger, Stephen A. Della Pietra, Vincent J. Della Pietra, "A Maximum Entropy Approach to Natural Language Processing," *Computational Linguistics* 22(1): 39-72 (1996). - """ - def __init__(self, preprocessor=lambda s: s.lower()): - """ Initializer. - - :param preprocessor: text preprocessor - :type preprocessor: function - """ - self.preprocessor = preprocessor - self.trained = False - - def shorttext_to_vec(self, shorttext): - """ Convert the shorttext into a sparse vector given the dictionary. - - According to the dictionary (gensim.corpora.Dictionary), convert the given text - into a vector representation, according to the occurence of tokens. - - This function is deprecated and no longer used because it is too slow to run in a loop. - But this is used while doing prediction. - - :param shorttext: short text to be converted. - :return: sparse vector of the vector representation - :type shorttext: str - :rtype: scipy.sparse.dok_matrix - """ - # too slow, deprecated - tokens = tokenize(self.preprocessor(shorttext)) - - vec = dok_matrix((1, len(self.dictionary))) - for token in tokens: - if token in self.dictionary.token2id: - vec[0, self.dictionary.token2id[token]] = 1.0 - - return vec[0, :] - - @deprecated - def gensimcorpus_to_matrix(self, corpus): - """ Convert the gensim corpus into a sparse matrix. (deprecated) - - :param corpus: gensim corpus - :return: matrix representing the corpus - :type corpus: list - :rtype: scipy.sparse.dok_matrix - """ - # not used, deprecated - matrix = dok_matrix((len(corpus), len(self.dictionary))) - for docid, doc in enumerate(corpus): - for tokenid, count in doc: - matrix[docid, tokenid] = count - return matrix - - def index_classlabels(self): - """ Index the class outcome labels. - - Index the class outcome labels into integers, for neural network implementation. - - """ - self.labels2idx = {label: idx for idx, label in enumerate(self.classlabels)} - - def convert_classdict_to_XY(self, classdict): - """ Convert the training data into sparse matrices for training. 
- - :param classdict: training data - :return: a tuple, consisting of sparse matrices for X (training data) and y (the labels of the training data) - :type classdict: dict - :rtype: tuple - """ - nb_data = sum(map(lambda k: len(classdict[k]), classdict.keys())) - X = dok_matrix((nb_data, len(self.dictionary))) - y = dok_matrix((nb_data, len(self.labels2idx))) - - rowid = 0 - for label in classdict: - if label in self.labels2idx.keys(): - for shorttext in classdict[label]: - tokens = tokenize(self.preprocessor(shorttext)) - #X[rowid, :] = self.shorttext_to_vec(shorttext) - for token in tokens: - X[rowid, self.dictionary.token2id[token]] += 1.0 - y[rowid, self.labels2idx[label]] = 1. - rowid += 1 - - return X, y - - def train(self, classdict, nb_epochs=500, l2reg=0.01, bias_l2reg=0.01, optimizer='adam'): - """ Train the classifier. - - Given the training data, train the classifier. - - :param classdict: training data - :param nb_epochs: number of epochs (Defauly: 500) - :param l2reg: L2 regularization coefficient (Default: 0.01) - :param bias_l2reg: L2 regularization coefficient for bias (Default: 0.01) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) - :return: None - :type classdict: dict - :type nb_epochs: int - :type l2reg: float - :type bias_l2reg: float - :type optimizer: str - """ - self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict, - preprocess_and_tokenize=lambda s: tokenize(self.preprocessor(s))) - self.index_classlabels() - - X, y = self.convert_classdict_to_XY(classdict) - - kmodel = logistic_framework(len(self.dictionary), - len(self.classlabels), - l2reg=l2reg, - bias_l2reg=bias_l2reg, - optimizer=optimizer) - kmodel.fit(X.toarray(), y.toarray(), epochs=nb_epochs) - - self.model = kmodel - self.trained = True - - def savemodel(self, nameprefix): - """ Save the trained model into files. - - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There will be give files produced, one name ending with "_classlabels.txt", one with ".json", - one with ".h5", one with "_labelidx.pkl", and one with "_dictionary.dict". - - If there is no trained model, a `ModelNotTrainedException` will be thrown. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - - kerasio.save_model(nameprefix, self.model) - - self.dictionary.save(nameprefix+'_dictionary.dict') - - labelfile = open(nameprefix+'_classlabels.txt', 'w') - labelfile.write('\n'.join(self.classlabels)) - labelfile.close() - - pickle.dump(self.labels2idx, open(nameprefix+'_labelidx.pkl', 'w')) - - def loadmodel(self, nameprefix): - """ Load a trained model from files. - - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_classlabels.txt", ".json", ".h5", "_labelidx.pkl", and "_dictionary.dict". - - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction or saving the model. 
- - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - """ - self.model = kerasio.load_model(nameprefix) - - self.dictionary = Dictionary.load(nameprefix+'_dictionary.dict') - - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = labelfile.readlines() - labelfile.close() - self.classlabels = map(lambda s: s.strip(), self.classlabels) - - self.labels2idx = pickle.load(open(nameprefix+'_labelidx.pkl', 'r')) - - self.trained = True - - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. - - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - - vec = self.shorttext_to_vec(shorttext) - predictions = self.model.predict(vec.toarray()) - - # wrangle output result - scoredict = {classlabel: predictions[0][idx] for idx, classlabel in enumerate(self.classlabels)} - return scoredict - -def load_maxent_classifier(name, compact=True): - """ Load the maximum entropy classifier from saved model. - - Given a moel file(s), load the maximum entropy classifier. - - :param name: name or prefix of the file, if compact is True or False respectively - :param compact: whether the model file is compact (Default:True) - :return: maximum entropy classifier - :type name: str - :type compact: bool - :rtype: MaxEntClassifier - """ - classifier = MaxEntClassifier() - if compact: - classifier.load_compact_model(name) - else: - classifier.loadmodel(name) - return classifier \ No newline at end of file diff --git a/shorttext/classifiers/bow/topic/LatentTopicModeling.py b/shorttext/classifiers/bow/topic/LatentTopicModeling.py deleted file mode 100644 index fc109bc5..00000000 --- a/shorttext/classifiers/bow/topic/LatentTopicModeling.py +++ /dev/null @@ -1,12 +0,0 @@ - -# for backward compatibility - -from shorttext.generators.bow.GensimTopicModeling import gensim_topic_model_dict -from shorttext.generators.bow.LatentTopicModeling import LatentTopicModeler -from shorttext.generators.bow.GensimTopicModeling import GensimTopicModeler -from shorttext.generators.bow.GensimTopicModeling import LDAModeler -from shorttext.generators.bow.GensimTopicModeling import LSIModeler -from shorttext.generators.bow.GensimTopicModeling import RPModeler -from shorttext.generators.bow.AutoEncodingTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel -from shorttext.generators import load_gensimtopicmodel -from shorttext.generators import load_autoencoder_topicmodel as load_autoencoder_topic diff --git a/shorttext/classifiers/bow/topic/SkLearnClassification.py b/shorttext/classifiers/bow/topic/SkLearnClassification.py deleted file mode 100644 index 7b66c505..00000000 --- a/shorttext/classifiers/bow/topic/SkLearnClassification.py +++ /dev/null @@ -1,378 +0,0 @@ -from collections import defaultdict - -from sklearn.externals import joblib - -from shorttext.utils import textpreprocessing as textpreprocess -from 
.LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel -from .LatentTopicModeling import LDAModeler, LSIModeler, RPModeler -from .LatentTopicModeling import load_gensimtopicmodel -import shorttext.utils.classification_exceptions as e -import shorttext.utils.compactmodel_io as cio - - -class TopicVectorSkLearnClassifier: - """ - This is a classifier that wraps any supervised learning algorithm in `scikit-learn`, - and use the topic vectors output by the topic modeler :class:`LatentTopicModeler` that - wraps the topic models in `gensim`. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). - - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL - `_] - """ - def __init__(self, topicmodeler, sklearn_classifier): - """ Initialize the classifier. - - :param topicmodeler: a topic modeler - :param sklearn_classifier: a scikit-learn classifier - :type topicmodeler: LatentTopicModeler - :type sklearn_classifier: sklearn.base.BaseEstimator - """ - self.topicmodeler = topicmodeler - self.classifier = sklearn_classifier - self.trained = False - - def train(self, classdict, *args, **kwargs): - """ Train the classifier. - - If the topic modeler does not have a trained model, it will raise `ModelNotTrainedException`. - - :param classdict: training data - :param args: arguments to be passed to the `fit` method of the scikit-learn classifier - :param kwargs: arguments to be passed to the `fit` method of the scikit-learn classifier - :return: None - :raise: ModelNotTrainedException - :type classdict: dict - """ - X = [] - y = [] - self.classlabels = classdict.keys() - for classidx, classlabel in zip(range(len(self.classlabels)), self.classlabels): - topicvecs = map(self.topicmodeler.retrieve_topicvec, classdict[classlabel]) - X += topicvecs - y += [classidx]*len(topicvecs) - self.classifier.fit(X, y, *args, **kwargs) - self.trained = True - - def getvector(self, shorttext): - """ Retrieve the topic vector representation of the given short text. - - If the topic modeler does not have a trained model, it will raise `ModelNotTrainedException`. - - :param shorttext: short text - :return: topic vector representation - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: numpy.ndarray - """ - if not self.trained: - raise e.ModelNotTrainedException() - return self.topicmodeler.retrieve_topicvec(shorttext) - - def classify(self, shorttext): - """ Give the highest-scoring class of the given short text according to the classifier. - - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. 
- - :param shorttext: short text - :return: class label of the classification result of the given short text - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: str - """ - if not self.trained: - raise e.ModelNotTrainedException() - topicvec = self.getvector(shorttext) - return self.classlabels[self.classifier.predict([topicvec])[0]] - - def score(self, shorttext, default_score=0.0): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. - - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. - - :param shorttext: short text - :param default_score: default score if no score is assigned (Default: 0.0) - :return: dictionary of scores of the text to all classes - :raise: ModelNotTrainedException - :type shorttext: str - :type default_score: float - :rtype: dict - """ - if not self.trained: - raise e.ModelNotTrainedException() - scoredict = defaultdict(lambda : default_score) - topicvec = self.getvector(shorttext) - for classidx, classlabel in zip(range(len(self.classlabels)), self.classlabels): - scoredict[classlabel] = self.classifier.score([topicvec], [classidx]) - return dict(scoredict) - - def savemodel(self, nameprefix): - """ Save the model. - - Save the topic model and the trained scikit-learn classification model. The scikit-learn - model will have the name `nameprefix` followed by the extension `.pkl`. The - topic model is the same as the one in `LatentTopicModeler`. - - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. - - :param nameprefix: prefix of the paths of the model files - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str - """ - if not self.trained: - raise e.ModelNotTrainedException() - self.topicmodeler.savemodel(nameprefix) - joblib.dump(self.classifier, nameprefix+'.pkl') - - def loadmodel(self, nameprefix): - """ Load the classification model together with the topic model. - - :param nameprefix: prefix of the paths of the model files - :return: None - :type nameprefix: str - """ - self.topicmodeler.loadmodel(nameprefix) - self.classifier = joblib.load(nameprefix+'.pkl') - self.classlabels = self.topicmodeler.classlabels - - def save_compact_model(self, name): - """ Save the model. - - Save the topic model and the trained scikit-learn classification model in one compact model file. - - If neither :func:`~train` nor :func:`~loadmodel` was run, or if the - topic model was not trained, it will raise `ModelNotTrainedException`. - - :param name: name of the compact model file - :return: None - :type name: str - """ - topicmodel_info = self.topicmodeler.get_info() - cio.save_compact_model(name, self.savemodel, 'topic_sklearn', - topicmodel_info['suffices']+['.pkl'], - {'classifier': 'topic_sklearn', 'topicmodel': topicmodel_info['classifier']}) - - def load_compact_model(self, name): - """ Load the classification model together with the topic model from a compact file. 
- - :param name: name of the compact model file - :return: None - :type name: str - """ - cio.load_compact_model(name, self.loadmodel, 'topic_sklearn', - {'classifier': 'topic_sklearn', 'topicmodel': None}) - self.trained = True - -def train_gensim_topicvec_sklearnclassifier(classdict, - nb_topics, - sklearn_classifier, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - topicmodel_algorithm='lda', - toweigh=True, - normalize=True, - gensim_paramdict={}, - sklearn_paramdict={}): - """ Train the supervised learning classifier, with features given by topic vectors. - - It trains a topic model, and with its topic vector representation, train a supervised - learning classifier. The instantiated (not trained) scikit-learn classifier must be - passed into the argument. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). - - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL - `_] - - :param classdict: training data - :param nb_topics: number of topics in the topic model - :param sklearn_classifier: instantiated scikit-learn classifier - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param topicmodel_algorithm: topic model algorithm (Default: 'lda') - :param toweigh: whether to weigh the words using tf-idf (Default: True) - :param normalize: whether the retrieved topic vectors are normalized (Default: True) - :param gensim_paramdict: arguments to be passed on to the `train` method of the `gensim` topic model - :param sklearn_paramdict: arguments to be passed on to the `fit` method of the `sklearn` classification algorithm - :return: a trained classifier - :type classdict: dict - :type nb_topics: int - :type sklearn_classifier: sklearn.base.BaseEstimator - :type preprocessor: function - :type topicmodel_algorithm: str - :type toweigh: bool - :type normalize: bool - :type gensim_paramdict: dict - :type sklearn_paramdict: dict - :rtype: TopicVectorSkLearnClassifier - """ - # topic model training - modelerdict = {'lda': LDAModeler, 'lsi': LSIModeler, 'rp': RPModeler} - topicmodeler = modelerdict[topicmodel_algorithm](preprocessor=preprocessor, - toweigh=toweigh, - normalize=normalize) - topicmodeler.train(classdict, nb_topics, **gensim_paramdict) - - # intermediate classification training - classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier) - classifier.train(classdict, **sklearn_paramdict) - - return classifier - -def load_gensim_topicvec_sklearnclassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the classifier, a wrapper that uses scikit-learn classifier, with - feature vectors given by a topic model, from files. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). 
- - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. (2008) [`ACL - `_] - - :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a trained classifier - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVectorSkLearnClassifier - """ - if compact: - # load the compact model - modelerdict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler} - topicmodel_name = cio.get_model_config_field(name, 'topicmodel') - classifier = TopicVectorSkLearnClassifier(modelerdict[topicmodel_name](preprocessor=preprocessor), None) - classifier.load_compact_model(name) - classifier.trained = True - - # return the instance - return classifier - else: - # loading topic model - topicmodeler = load_gensimtopicmodel(name, preprocessor=preprocessor) - - # loading intermediate model - sklearn_classifier = joblib.load(name + '.pkl') - - # the wrapped classifier - classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier) - classifier.trained = True - - # return the instance - return classifier - -def train_autoencoder_topic_sklearnclassifier(classdict, - nb_topics, - sklearn_classifier, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - normalize=True, - keras_paramdict={}, - sklearn_paramdict={}): - """ Train the supervised learning classifier, with features given by topic vectors. - - It trains an autoencoder topic model, and with its encoded vector representation, train a supervised - learning classifier. The instantiated (not trained) scikit-learn classifier must be - passed into the argument. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). - - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. 
(2008) [`ACL - `_] - - :param classdict: training data - :param nb_topics: number topics, i.e., number of encoding dimensions - :param sklearn_classifier: instantiated scikit-learn classifier - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param normalize: whether the retrieved topic vectors are normalized (Default: True) - :param keras_paramdict: arguments to be passed to keras for training autoencoder - :param sklearn_paramdict: arguemtnst to be passed to scikit-learn for fitting the classifier - :return: a trained classifier - :type classdict: dict - :type nb_topics: int - :type sklearn_classifier: sklearn.base.BaseEstimator - :type preprocessor: function - :type normalize: bool - :rtype: TopicVectorSkLearnClassifier - """ - # train the autoencoder - autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, normalize=normalize) - autoencoder.train(classdict, nb_topics, **keras_paramdict) - - # intermediate classification training - classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier) - classifier.train(classdict, **sklearn_paramdict) - - return classifier - -def load_autoencoder_topic_sklearnclassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the classifier, a wrapper that uses scikit-learn classifier, with - feature vectors given by an autocoder topic model, from files. - - # Reference - - Xuan Hieu Phan, Cam-Tu Nguyen, Dieu-Thu Le, Minh Le Nguyen, Susumu Horiguchi, Quang-Thuy Ha, - "A Hidden Topic-Based Framework toward Building Applications with Short Web Documents," - *IEEE Trans. Knowl. Data Eng.* 23(7): 961-976 (2011). - - Xuan Hieu Phan, Le-Minh Nguyen, Susumu Horiguchi, "Learning to Classify Short and Sparse Text & Web withHidden Topics from Large-scale Data Collections," - WWW '08 Proceedings of the 17th international conference on World Wide Web. 
(2008) [`ACL - `_] - - :param name: name (if compact==True) or prefix (if compact==False) of the paths of model files - :param preprocessor: function that preprocesses the text (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a trained classifier - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVectorSkLearnClassifier - """ - if compact: - # load the compact model - classifier = TopicVectorSkLearnClassifier(AutoencodingTopicModeler(preprocessor=preprocessor), None) - classifier.load_compact_model(name) - classifier.trained = True - - # return the instance - return classifier - else: - # load the autoencoder - autoencoder = load_autoencoder_topicmodel(name, preprocessor=preprocessor) - - # load intermediate model - sklearn_classifier = joblib.load(name + '.pkl') - - # the wrapper classifier - classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier) - classifier.trained = True - - # return the instance - return classifier \ No newline at end of file diff --git a/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py b/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py deleted file mode 100644 index 3e47f838..00000000 --- a/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py +++ /dev/null @@ -1,185 +0,0 @@ - -from shorttext.utils import textpreprocessing as textpreprocess -from .LatentTopicModeling import LatentTopicModeler, GensimTopicModeler -from .LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topicmodel -from .LatentTopicModeling import load_gensimtopicmodel - - -class TopicVecCosineDistanceClassifier: - """ - This is a class that implements a classifier that perform classification based on - the cosine similarity between the topic vectors of the user-input short texts and various classes. - The topic vectors are calculated using :class:`LatentTopicModeler`. - """ - def __init__(self, topicmodeler): - """ Initialize the classifier. - - :param topicmodeler: topic modeler - :type topicmodeler: LatentTopicModeler - """ - self.topicmodeler = topicmodeler - - def score(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. - - :param shorttext: short text - :return: dictionary of scores of the text to all classes - :type shorttext: str - :rtype: dict - """ - # scoredict = defaultdict(lambda : 0.0) - # similarities = self.topicmodeler.matsim[self.topicmodeler.retrieve_corpus_topicdist(shorttext)] - # for label, similarity in zip(self.topicmodeler.classlabels, similarities): - # scoredict[label] = similarity - # return dict(scoredict) - return self.topicmodeler.get_batch_cos_similarities(shorttext) - - def loadmodel(self, nameprefix): - """ Load the topic model with the given prefix of the file paths. - - Given the prefix of the file paths, load the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). - - This is essentialing loading the topic modeler :class:`LatentTopicModeler`. 
- - :param nameprefix: prefix of the file paths - :return: None - :type nameprefix: str - """ - self.topicmodeler.loadmodel(nameprefix) - - def savemodel(self, nameprefix): - """ Save the model with names according to the prefix. - - Given the prefix of the file paths, save the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, save also the tf-idf model (.gensimtfidf). - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - This essentially saves the topic modeler :class:`LatentTopicModeler`. - - :param nameprefix: prefix of the file paths - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str - """ - self.topicmodeler.savemodel(nameprefix) - - def load_compact_model(self, name): - self.topicmodeler.load_compact_model(name) - - def save_compact_model(self, name): - self.topicmodeler.save_compact_model(name) - -def train_gensimtopicvec_cosineClassifier(classdict, - nb_topics, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - algorithm='lda', - toweigh=True, - normalize=True, - *args, **kwargs): - """ Return a cosine distance classifier, i.e., :class:`TopicVecCosineDistanceClassifier`, while - training a gensim topic model in between. - - :param classdict: training data - :param nb_topics: number of latent topics - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda) - :param toweigh: whether to weigh the words using tf-idf. (Default: True) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :param args: arguments to pass to the `train` method for gensim topic models - :param kwargs: arguments to pass to the `train` method for gensim topic models - :return: a classifier that scores the short text based on the topic model - :type classdict: dict - :type nb_topics: int - :type preprocessor: function - :type algorithm: str - :type toweigh: bool - :type normalize: bool - :rtype: TopicVecCosineDistanceClassifier - """ - # train topic model - topicmodeler = GensimTopicModeler(preprocessor=preprocessor, - algorithm=algorithm, - toweigh=toweigh, - normalize=normalize) - topicmodeler.train(classdict, nb_topics, *args, **kwargs) - - # cosine distance classifier - return TopicVecCosineDistanceClassifier(topicmodeler) - -def load_gensimtopicvec_cosineClassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load a gensim topic model from files and return a cosine distance classifier. - - Given the prefix of the files of the topic model, return a cosine distance classifier - based on this model, i.e., :class:`TopicVecCosineDistanceClassifier`. - - The files include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - and a topic model (.gensimmodel). If weighing is applied, load also the tf-idf model (.gensimtfidf). - - :param name: name (if compact=True) or prefix (if compact=False) of the file paths - :param preprocessor: function that preprocesses the text.
(Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a classifier that scores the short text based on the topic model - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVecCosineDistanceClassifier - """ - topicmodeler = load_gensimtopicmodel(name, preprocessor=preprocessor, compact=compact) - return TopicVecCosineDistanceClassifier(topicmodeler) - -def train_autoencoder_cosineClassifier(classdict, - nb_topics, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - normalize=True, - *args, **kwargs): - """ Return a cosine distance classifier, i.e., :class:`TopicVecCosineDistanceClassifier`, while - training an autoencoder as a topic model in between. - - :param classdict: training data - :param nb_topics: number of topics, i.e., number of encoding dimensions - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :param args: arguments to be passed to keras model fitting - :param kwargs: arguments to be passed to keras model fitting - :return: a classifier that scores the short text based on the autoencoder - :type classdict: dict - :type nb_topics: int - :type preprocessor: function - :type normalize: bool - :rtype: TopicVecCosineDistanceClassifier - """ - # train the autoencoder - autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, normalize=normalize) - autoencoder.train(classdict, nb_topics, *args, **kwargs) - - # cosine distance classifier - return TopicVecCosineDistanceClassifier(autoencoder) - -def load_autoencoder_cosineClassifier(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load an autoencoder from files for topic modeling, and return a cosine classifier. - - Given the prefix of the file paths, load the model into files, with name given by the prefix. - There are files with names ending with "_encoder.json" and "_encoder.h5", which are - the JSON and HDF5 files for the encoder respectively. - They also include a gensim dictionary (.gensimdict). - - :param name: name (if compact=True) or prefix (if compact=False) of the file paths - :param preprocessor: function that preprocesses the text. 
(Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a classifier that scores the short text based on the autoencoder - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: TopicVecCosineDistanceClassifier - """ - autoencoder = load_autoencoder_topicmodel(name, preprocessor=preprocessor, compact=compact) - return TopicVecCosineDistanceClassifier(autoencoder) \ No newline at end of file diff --git a/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py b/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py deleted file mode 100644 index 220d85af..00000000 --- a/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py +++ /dev/null @@ -1,309 +0,0 @@ - -import json -import os - -import numpy as np -from keras.preprocessing.text import Tokenizer -from keras.preprocessing.sequence import pad_sequences - -import shorttext.utils.kerasmodel_io as kerasio -import shorttext.utils.classification_exceptions as e -from shorttext.utils import tokenize -import shorttext.utils.compactmodel_io as cio - - -@cio.compactio({'classifier': 'nnlibvec'}, 'nnlibvec', ['_classlabels.txt', '.json', '.h5', '_config.json']) -class VarNNEmbeddedVecClassifier: - """ - This is a wrapper for various neural network algorithms - for supervised short text categorization. - Each class label has a few short sentences, where each token is converted - to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). - The sentences are represented by a matrix, or rank-2 array. - The type of neural network has to be passed when training, and it has to be of - type :class:`keras.models.Sequential`. The number of outputs of the models has to match - the number of class labels in the training data. - To perform prediction, the input short sentences is converted to a unit vector - in the same way. The score is calculated according to the trained neural network model. - - Examples of the models can be found in `frameworks`. - - A pre-trained Google Word2Vec model can be downloaded `here - `_. 
- - Examples - - >>> import shorttext - >>> # load the Word2Vec model - >>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True) - >>> - >>> # load the training data - >>> trainclassdict = shorttext.data.subjectkeywords() - >>> - >>> # initialize the classifier and train - >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict.keys())) # using convolutional neural network model - >>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel) - >>> classifier.train(trainclassdict, kmodel) - Epoch 1/10 - 45/45 [==============================] - 0s - loss: 1.0578 - Epoch 2/10 - 45/45 [==============================] - 0s - loss: 0.5536 - Epoch 3/10 - 45/45 [==============================] - 0s - loss: 0.3437 - Epoch 4/10 - 45/45 [==============================] - 0s - loss: 0.2282 - Epoch 5/10 - 45/45 [==============================] - 0s - loss: 0.1658 - Epoch 6/10 - 45/45 [==============================] - 0s - loss: 0.1273 - Epoch 7/10 - 45/45 [==============================] - 0s - loss: 0.1052 - Epoch 8/10 - 45/45 [==============================] - 0s - loss: 0.0961 - Epoch 9/10 - 45/45 [==============================] - 0s - loss: 0.0839 - Epoch 10/10 - 45/45 [==============================] - 0s - loss: 0.0743 - >>> classifier.score('artificial intelligence') - {'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325} - """ - def __init__(self, wvmodel, vecsize=100, maxlen=15, with_gensim=False): - """ Initialize the classifier. - - :param wvmodel: Word2Vec model - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param maxlen: maximum number of words in a sentence (Default: 15) - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type vecsize: int - :type maxlen: int - """ - self.wvmodel = wvmodel - self.vecsize = vecsize - self.maxlen = maxlen - self.with_gensim = with_gensim - self.trained = False - - def convert_trainingdata_matrix(self, classdict): - """ Convert the training data into format put into the neural networks. - - Convert the training data into format put into the neural networks. - This is called by :func:`~train`. - - :param classdict: training data - :return: a tuple of three, containing a list of class labels, matrix of embedded word vectors, and corresponding outputs - :type classdict: dict - :rtype: (list, numpy.ndarray, list) - """ - classlabels = classdict.keys() - lblidx_dict = dict(zip(classlabels, range(len(classlabels)))) - - # tokenize the words, and determine the word length - phrases = [] - indices = [] - for label in classlabels: - for shorttext in classdict[label]: - shorttext = shorttext if type(shorttext)==str else '' - category_bucket = [0]*len(classlabels) - category_bucket[lblidx_dict[label]] = 1 - indices.append(category_bucket) - if self.with_gensim: - phrases.append(shorttext) - else: - phrases.append(tokenize(shorttext)) - - if self.with_gensim: - return classlabels, phrases, indices - - # store embedded vectors - train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize)) - for i in range(len(phrases)): - for j in range(min(self.maxlen, len(phrases[i]))): - train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j]) - indices = np.array(indices, dtype=np.int) - - return classlabels, train_embedvec, indices - - def train(self, classdict, kerasmodel, nb_epoch=10): - """ Train the classifier. - - The training data and the corresponding keras model have to be given. 
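Any :class:`keras.models.Sequential` model whose input shape matches `(maxlen, vecsize)` and whose number of outputs equals the number of class labels can be used; `frameworks.CNNWordEmbed` builds a standard one. A minimal hand-rolled sketch, assuming three class labels and the defaults maxlen=15 and vecsize=100:

>>> from keras.models import Sequential
>>> from keras.layers import Flatten, Dense
>>> kmodel = Sequential()
>>> kmodel.add(Flatten(input_shape=(15, 100)))    # rank-2 input: (maxlen, vecsize)
>>> kmodel.add(Dense(3, activation='softmax'))    # one output per class label
>>> kmodel.compile(loss='categorical_crossentropy', optimizer='adam')
>>> classifier.train(trainclassdict, kmodel)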
- - If this has not been run, or a model was not loaded by :func:`~loadmodel`, - a `ModelNotTrainedException` will be raised. - - :param classdict: training data - :param kerasmodel: keras sequential model - :param nb_epoch: number of steps / epochs in training - :return: None - :type classdict: dict - :type kerasmodel: keras.models.Sequential - :type nb_epoch: int - """ - if self.with_gensim: - # convert classdict to training input vectors - self.classlabels, x_train, y_train = self.convert_trainingdata_matrix(classdict) - - tokenizer = Tokenizer() - tokenizer.fit_on_texts(x_train) - x_train = tokenizer.texts_to_sequences(x_train) - x_train = pad_sequences(x_train, maxlen=self.maxlen) - - # train the model - kerasmodel.fit(x_train, y_train, epochs=nb_epoch) - else: - # convert classdict to training input vectors - self.classlabels, train_embedvec, indices = self.convert_trainingdata_matrix(classdict) - - # train the model - kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch) - - # flag switch - self.model = kerasmodel - self.trained = True - - def savemodel(self, nameprefix): - """ Save the trained model into files. - - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There will be three files produced, one name ending with "_classlabels.txt", one name - ending with ".json", and one name ending with ".h5". For shorttext>=0.4.0, another file - with extension "_config.json" would be created. - - If there is no trained model, a `ModelNotTrainedException` will be thrown. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - kerasio.save_model(nameprefix, self.model) - labelfile = open(nameprefix+'_classlabels.txt', 'w') - labelfile.write('\n'.join(self.classlabels)) - labelfile.close() - json.dump({'with_gensim': self.with_gensim}, open(nameprefix+'_config.json', 'w')) - - def loadmodel(self, nameprefix): - """ Load a trained model from files. - - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_classlabels.txt", ".json" and ".h5". For shorttext>=0.4.0, a file with - extension "_config.json" would also be used. - - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction or saving the model. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - """ - self.model = kerasio.load_model(nameprefix) - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = labelfile.readlines() - labelfile.close() - self.classlabels = map(lambda s: s.strip(), self.classlabels) - # check if _config.json exists. - # This file does not exist if the model was created with shorttext<0.4.0 - if os.path.exists(nameprefix+'_config.json'): - self.with_gensim = json.load(open(nameprefix+'_config.json', 'r'))['with_gensim'] - else: - self.with_gensim = False - self.trained = True - - def word_to_embedvec(self, word): - """ Convert the given word into an embedded vector. - - Given a word, return the corresponding embedded vector according to - the word-embedding model. If there is no such word in the model, - a vector with zero values are given. 
- - :param word: a word - :return: the corresponding embedded vector - :type word: str - :rtype: numpy.ndarray - """ - return self.wvmodel[word] if word in self.wvmodel else np.zeros(self.vecsize) - - def shorttext_to_matrix(self, shorttext): - """ Convert the short text into a matrix with word-embedding representation. - - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, and put them into a matrix. If a word is not in the model, - that row will be filled with zero. - - :param shorttext: a short sentence - :return: a matrix of embedded vectors that represent all the tokens in the sentence - :type shorttext: str - :rtype: numpy.ndarray - """ - tokens = tokenize(shorttext) - matrix = np.zeros((self.maxlen, self.vecsize)) - for i in range(min(self.maxlen, len(tokens))): - matrix[i] = self.word_to_embedvec(tokens[i]) - return matrix - - def process_text(self, shorttext): - """Process the input text by tokenizing and padding it. - - :param shorttext: a short sentence - """ - tokenizer = Tokenizer() - tokenizer.fit_on_texts(shorttext) - x_train = tokenizer.texts_to_sequences(shorttext) - - x_train = pad_sequences(x_train, maxlen=self.maxlen) - return x_train - - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. - - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - - if self.with_gensim: - # tokenize and pad input text - matrix = self.process_text(shorttext) - else: - # retrieve vector - matrix = np.array([self.shorttext_to_matrix(shorttext)]) - - # classification using the neural network - predictions = self.model.predict(matrix) - - # wrangle output result - scoredict = {} - for idx, classlabel in zip(range(len(self.classlabels)), self.classlabels): - scoredict[classlabel] = predictions[0][idx] - - return scoredict - -def load_varnnlibvec_classifier(wvmodel, name, compact=True): - """ Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model. 
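A minimal usage sketch (the model file name 'sample_nnmodel.bin' is hypothetical; `wvmodel` is a loaded word-embedding model as in the class example above):

>>> classifier = load_varnnlibvec_classifier(wvmodel, 'sample_nnmodel.bin')
>>> classifier.score('artificial intelligence')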
- - :param wvmodel: Word2Vec model - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param compact whether model file is compact (Default: True) - :return: the classifier - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type name: str - :type compact: bool - :rtype: VarNNEmbeddedVecClassifier - """ - classifier = VarNNEmbeddedVecClassifier(wvmodel) - if compact: - classifier.load_compact_model(name) - else: - classifier.loadmodel(name) - return classifier diff --git a/shorttext/classifiers/embed/nnlib/frameworks.py b/shorttext/classifiers/embed/nnlib/frameworks.py deleted file mode 100644 index cbd10dda..00000000 --- a/shorttext/classifiers/embed/nnlib/frameworks.py +++ /dev/null @@ -1,296 +0,0 @@ -from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM -from keras.models import Sequential, Model -from keras.regularizers import l2 -from keras.engine import Input - -# Codes were changed because of Keras. -# Keras 1 --> Keras 2: https://github.com/fchollet/keras/wiki/Keras-2.0-release-notes - -# Paper: Yoon Kim, "Convolutional Neural Networks for Sentence Classification," arXiv:1408.5882 (2014). -# ref: https://gist.github.com/entron/b9bc61a74e7cadeb1fec -# ref: http://cs231n.github.io/convolutional-networks/ -def CNNWordEmbed(nb_labels, - wvmodel=None, - nb_filters=1200, - n_gram=2, - maxlen=15, - vecsize=100, - cnn_dropout=0.0, - final_activation='softmax', - dense_wl2reg=0.0, - dense_bl2reg=0.0, - optimizer='adam', - with_gensim=False): - """ Returns the convolutional neural network (CNN/ConvNet) for word-embedded vectors. - - Reference: Yoon Kim, "Convolutional Neural Networks for Sentence Classification," - *EMNLP* 2014, 1746-1751 (arXiv:1408.5882). [`arXiv - `_] - - :param nb_labels: number of class labels - :param wvmodel: pre-trained Gensim word2vec model - :param nb_filters: number of filters (Default: 1200) - :param n_gram: n-gram, or window size of CNN/ConvNet (Default: 2) - :param maxlen: maximum number of words in a sentence (Default: 15) - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param cnn_dropout: dropout rate for CNN/ConvNet (Default: 0.0) - :param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. (Default: 'softmax') - :param dense_wl2reg: L2 regularization coefficient (Default: 0.0) - :param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) - :param with_gensim: boolean variable to indicate if the word-embeddings being used derived from a Gensim's Word2Vec model. 
(Default: True) - :return: keras model (`Sequential` or`Model`) for CNN/ConvNet for Word-Embeddings - :type nb_labels: int - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type nb_filters: int - :type n_gram: int - :type maxlen: int - :type vecsize: int - :type cnn_dropout: float - :type final_activation: str - :type dense_wl2reg: float - :type dense_bl2reg: float - :type optimizer: str - :type with_gensim: bool - :rtype: keras.models.Sequential or keras.models.Model - """ - if with_gensim == True: - embedding_layer = wvmodel.get_embedding_layer() - sequence_input = Input(shape=(maxlen,), dtype='int32') - x = embedding_layer(sequence_input) - x = Conv1D(filters=nb_filters, - kernel_size=n_gram, - padding='valid', - activation='relu', - input_shape=(maxlen, vecsize))(x) - if cnn_dropout > 0.0: - x = Dropout(cnn_dropout)(x) - x = MaxPooling1D(pool_size=maxlen - n_gram + 1)(x) - x = Flatten()(x) - x = Dense(nb_labels, - activation=final_activation, - kernel_regularizer=l2(dense_wl2reg), - bias_regularizer=l2(dense_bl2reg))(x) - - model = Model(sequence_input, x) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - else: - model = Sequential() - model.add(Conv1D(filters=nb_filters, - kernel_size=n_gram, - padding='valid', - activation='relu', - input_shape=(maxlen, vecsize))) - if cnn_dropout > 0.0: - model.add(Dropout(cnn_dropout)) - model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1)) - model.add(Flatten()) - model.add(Dense(nb_labels, - activation=final_activation, - kernel_regularizer=l2(dense_wl2reg), - bias_regularizer=l2(dense_bl2reg)) - ) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - - return model - -# two layers of CNN, maxpooling, dense -def DoubleCNNWordEmbed(nb_labels, - wvmodel=None, - nb_filters_1=1200, - nb_filters_2=600, - n_gram=2, - filter_length_2=10, - maxlen=15, - vecsize=100, - cnn_dropout_1=0.0, - cnn_dropout_2=0.0, - final_activation='softmax', - dense_wl2reg=0.0, - dense_bl2reg=0.0, - optimizer='adam', - with_gensim=False): - """ Returns the double-layered convolutional neural network (CNN/ConvNet) for word-embedded vectors. - - :param nb_labels: number of class labels - :param wvmodel: pre-trained Gensim word2vec model - :param nb_filters_1: number of filters for the first CNN/ConvNet layer (Default: 1200) - :param nb_filters_2: number of filters for the second CNN/ConvNet layer (Default: 600) - :param n_gram: n-gram, or window size of first CNN/ConvNet (Default: 2) - :param filter_length_2: window size for second CNN/ConvNet layer (Default: 10) - :param maxlen: maximum number of words in a sentence (Default: 15) - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param cnn_dropout_1: dropout rate for the first CNN/ConvNet layer (Default: 0.0) - :param cnn_dropout_2: dropout rate for the second CNN/ConvNet layer (Default: 0.0) - :param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. (Default: 'softmax') - :param dense_wl2reg: L2 regularization coefficient (Default: 0.0) - :param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: keras sequantial model for CNN/ConvNet for Word-Embeddings - :type nb_labels: int - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type nb_filters_1: int - :type nb_filters_2: int - :type n_gram: int - :type filter_length_2: int - :type maxlen: int - :type vecsize: int - :type cnn_dropout_1: float - :type cnn_dropout_2: float - :type final_activation: str - :type dense_wl2reg: float - :type dense_bl2reg: float - :type optimizer: str - :type with_gensim: bool - :rtype: keras.models.Sequential or keras.models.Model - """ - if with_gensim == True: - embedding_layer = wvmodel.get_embedding_layer() - sequence_input = Input(shape=(maxlen,), dtype='int32') - x = embedding_layer(sequence_input) - x = Conv1D(filters=nb_filters_1, - kernel_size=n_gram, - padding='valid', - activation='relu', - input_shape=(maxlen, vecsize))(x) - if cnn_dropout_1 > 0.0: - x = Dropout(cnn_dropout_1)(x) - x = Conv1D(filters=nb_filters_2, - kernel_size=filter_length_2, - padding='valid', - activation='relu')(x) - if cnn_dropout_2 > 0.0: - x = Dropout(cnn_dropout_2)(x) - x = MaxPooling1D(pool_size=maxlen - n_gram -filter_length_2 + 1)(x) - x = Flatten()(x) - x = Dense(nb_labels, - activation=final_activation, - kernel_regularizer=l2(dense_wl2reg), - bias_regularizer=l2(dense_bl2reg))(x) - - model = Model(sequence_input, x) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - else: - model = Sequential() - model.add(Conv1D(filters=nb_filters_1, - kernel_size=n_gram, - padding='valid', - activation='relu', - input_shape=(maxlen, vecsize))) - if cnn_dropout_1 > 0.0: - model.add(Dropout(cnn_dropout_1)) - model.add(Conv1D(filters=nb_filters_2, - kernel_size=filter_length_2, - padding='valid', - activation='relu')) - if cnn_dropout_2 > 0.0: - model.add(Dropout(cnn_dropout_2)) - model.add(MaxPooling1D(pool_size=maxlen - n_gram -filter_length_2 + 1)) - model.add(Flatten()) - model.add(Dense(nb_labels, - activation=final_activation, - kernel_regularizer=l2(dense_wl2reg), - bias_regularizer=l2(dense_bl2reg)) - ) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - - return model - -# C-LSTM -# Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, -# "A C-LSTM Neural Network for Text Classification", arXiv:1511.08630 (2015). -def CLSTMWordEmbed(nb_labels, - wvmodel=None, - nb_filters=1200, - n_gram=2, - maxlen=15, - vecsize=100, - cnn_dropout=0.0, - nb_rnnoutdim=1200, - rnn_dropout=0.2, - final_activation='softmax', - dense_wl2reg=0.0, - dense_bl2reg=0.0, - optimizer='adam', - with_gensim=False): - """ Returns the C-LSTM neural networks for word-embedded vectors. - - Reference: Chunting Zhou, Chonglin Sun, Zhiyuan Liu, Francis Lau, - "A C-LSTM Neural Network for Text Classification," - (arXiv:1511.08630). [`arXiv - `_] - - :param nb_labels: number of class labels - :param wvmodel: pre-trained Gensim word2vec model - :param nb_filters: number of filters (Default: 1200) - :param n_gram: n-gram, or window size of CNN/ConvNet (Default: 2) - :param maxlen: maximum number of words in a sentence (Default: 15) - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param cnn_dropout: dropout rate for CNN/ConvNet (Default: 0.0) - :param nb_rnnoutdim: output dimension for the LSTM networks (Default: 1200) - :param rnn_dropout: dropout rate for LSTM (Default: 0.2) - :param final_activation: activation function. Options: softplus, softsign, relu, tanh, sigmoid, hard_sigmoid, linear. 
(Default: 'softmax') - :param dense_wl2reg: L2 regularization coefficient (Default: 0.0) - :param dense_bl2reg: L2 regularization coefficient for bias (Default: 0.0) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam) - :return: keras sequantial model for CNN/ConvNet for Word-Embeddings - :type nb_labels: int - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type nb_filters: int - :type n_gram: int - :type maxlen: int - :type vecsize: int - :type cnn_dropout: float - :type nb_rnnoutdim: int - :type rnn_dropout: float - :type final_activation: str - :type dense_wl2reg: float - :type dense_bl2reg: float - :type optimizer: str - :type with_gensim: bool - :rtype: keras.models.Sequential or keras.models.Model - """ - if with_gensim == True: - embedding_layer = wvmodel.get_embedding_layer() - sequence_input = Input(shape=(maxlen,), dtype='int32') - x = embedding_layer(sequence_input) - x = Conv1D(filters=nb_filters, - kernel_size=n_gram, - padding='valid', - activation='relu', - input_shape=(maxlen, vecsize))(x) - if cnn_dropout > 0.0: - x = Dropout(cnn_dropout)(x) - x = MaxPooling1D(pool_size=maxlen - n_gram + 1)(x) - x = LSTM(nb_rnnoutdim)(x) - if rnn_dropout > 0.0: - x = Dropout(rnn_dropout)(x) - x = Dense(nb_labels, - activation=final_activation, - kernel_regularizer=l2(dense_wl2reg), - bias_regularizer=l2(dense_bl2reg),)(x) - - model = Model(sequence_input, x) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - else: - model = Sequential() - model.add(Conv1D(filters=nb_filters, - kernel_size=n_gram, - padding='valid', - activation='relu', - input_shape=(maxlen, vecsize))) - if cnn_dropout > 0.0: - model.add(Dropout(cnn_dropout)) - model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1)) - model.add(LSTM(nb_rnnoutdim)) - if rnn_dropout > 0.0: - model.add(Dropout(rnn_dropout)) - model.add(Dense(nb_labels, - activation=final_activation, - kernel_regularizer=l2(dense_wl2reg), - bias_regularizer=l2(dense_bl2reg), - ) - ) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - - return model diff --git a/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py b/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py deleted file mode 100644 index 96aff3c0..00000000 --- a/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py +++ /dev/null @@ -1,157 +0,0 @@ -import pickle -from collections import defaultdict - -import numpy as np -from scipy.spatial.distance import cosine - -import shorttext.utils.classification_exceptions as e -from shorttext.utils import tokenize -import shorttext.utils.compactmodel_io as cio - - -@cio.compactio({'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl']) -class SumEmbeddedVecClassifier: - """ - This is a supervised classification algorithm for short text categorization. - Each class label has a few short sentences, where each token is converted - to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). - They are then summed up and normalized to a unit vector for that particular class labels. - To perform prediction, the input short sentences is converted to a unit vector - in the same way. The similarity score is calculated by the cosine similarity. - - A pre-trained Google Word2Vec model can be downloaded `here - `_. - """ - - def __init__(self, wvmodel, vecsize=100, simfcn=lambda u, v: 1-cosine(u, v)): - """ Initialize the classifier. 
- - :param wvmodel: Word2Vec model - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param simfcn: similarity function (Default: cosine similarity) - :type wvmodel: gensim.models.word2vec.Word2Vec - :type vecsize: int - :type simfcn: function - """ - self.wvmodel = wvmodel - self.vecsize = vecsize - self.simfcn = simfcn - self.trained = False - - def train(self, classdict): - """ Train the classifier. - - If this has not been run, or a model was not loaded by :func:`~loadmodel`, - a `ModelNotTrainedException` will be raised while performing prediction or saving - the model. - - :param classdict: training data - :return: None - :type classdict: dict - """ - self.addvec = defaultdict(lambda : np.zeros(self.vecsize)) - for classtype in classdict: - for shorttext in classdict[classtype]: - self.addvec[classtype] += self.shorttext_to_embedvec(shorttext) - self.addvec[classtype] /= np.linalg.norm(self.addvec[classtype]) - self.addvec = dict(self.addvec) - self.trained = True - - def savemodel(self, nameprefix): - """ Save the trained model into files. - - Given the prefix of the file paths, save the model into files, with name given by the prefix, - and add "_embedvecdict.pkl" at the end. If there is no trained model, a `ModelNotTrainedException` - will be thrown. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'w')) - - def loadmodel(self, nameprefix): - """ Load a trained model from files. - - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_embedvecdict.pkl". - - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction and saving the model. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - """ - self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'r')) - self.trained = True - - def shorttext_to_embedvec(self, shorttext): - """ Convert the short text into an averaged embedded vector representation. - - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, sums - them up, and normalizes the resulting vector. It returns the resulting vector - that represents this short sentence. - - :param shorttext: a short sentence - :return: an embedded vector that represents the short sentence - :type shorttext: str - :rtype: numpy.ndarray - """ - vec = np.zeros(self.vecsize) - for token in tokenize(shorttext): - if token in self.wvmodel: - vec += self.wvmodel[token] - norm = np.linalg.norm(vec) - if norm != 0: - vec /= norm - return vec - - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. - - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with keys being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.
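A minimal usage sketch, assuming `wvmodel` is a loaded 300-dimensional word-embedding model and `trainclassdict` is training data such as `shorttext.data.subjectkeywords()`:

>>> classifier = SumEmbeddedVecClassifier(wvmodel, vecsize=300)
>>> classifier.train(trainclassdict)
>>> classifier.score('artificial intelligence')   # dict: class label -> similarity score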
- - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - vec = self.shorttext_to_embedvec(shorttext) - scoredict = {} - for classtype in self.addvec: - try: - scoredict[classtype] = self.simfcn(vec, self.addvec[classtype]) - except ValueError: - scoredict[classtype] = np.nan - return scoredict - -def load_sumword2vec_classifier(wvmodel, name, compact=True): - """ Load a :class:`shorttext.classifiers.SumEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model. - - :param wvmodel: Word2Vec model - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param compact: whether model file is compact (Default: True) - :return: the classifier - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type name: str - :type compact: bool - :rtype: SumEmbeddedVecClassifier - """ - classifier = SumEmbeddedVecClassifier(wvmodel) - if compact: - classifier.load_compact_model(name) - else: - classifier.loadmodel(name) - return classifier \ No newline at end of file diff --git a/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py b/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py deleted file mode 100644 index 6f216ef9..00000000 --- a/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py +++ /dev/null @@ -1,204 +0,0 @@ -import numpy as np - -import shorttext.utils.kerasmodel_io as kerasio -import shorttext.utils.classification_exceptions as e -from shorttext.utils.textpreprocessing import spacy_tokenize - - -class VarNNSumEmbeddedVecClassifier: - """ - This is a wrapper for various neural network algorithms - for supervised short text categorization. - Each class label has a few short sentences, where each token is converted - to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model). - The sentences are represented by an array. - The type of neural network has to be passed when training, and it has to be of - type :class:`keras.models.Sequential`. The number of outputs of the models has to match - the number of class labels in the training data. - To perform prediction, the input short sentence is converted to a unit vector - in the same way. The score is calculated according to the trained neural network model. - - Examples of the models can be found in `frameworks`. - - A pre-trained Google Word2Vec model can be downloaded `here - `_. - - """ - def __init__(self, wvmodel, vecsize=100, maxlen=15): - """ Initialize the classifier. - - :param wvmodel: Word2Vec model - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param maxlen: maximum number of words in a sentence (Default: 15) - :type wvmodel: gensim.models.word2vec.Word2Vec - :type vecsize: int - :type maxlen: int - """ - self.wvmodel = wvmodel - self.vecsize = vecsize - self.maxlen = maxlen - self.trained = False - - def convert_traindata_embedvecs(self, classdict): - """ Convert the training text data into embedded matrix. - - Convert the training text data into an embedded matrix, where each short sentence - is represented by the normalized sum of the embedded vectors of all its words.
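In isolation, the per-sentence conversion is just a normalized sum of token vectors; a sketch of the idea, with `embed` standing in as a hypothetical word-embedding lookup:

>>> import numpy as np
>>> vecs = np.array([embed(token) for token in spacy_tokenize(shorttext)])
>>> sumvec = vecs.sum(axis=0)
>>> sumvec /= np.linalg.norm(sumvec)   # sentences with zero norm are skipped in the real code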
- - :param classdict: training data - :return: tuples, consisting of class labels, matrix of embedded vectors, and corresponding outputs - :type classdict: dict - :rtype: (list, numpy.ndarray, list) - """ - classlabels = classdict.keys() - lblidx_dict = dict(zip(classlabels, range(len(classlabels)))) - - indices = [] - embedvecs = [] - for classlabel in classlabels: - for shorttext in classdict[classlabel]: - embedvec = np.sum(np.array([self.word_to_embedvec(token) for token in spacy_tokenize(shorttext)]), - axis=0) - # embedvec = np.reshape(embedvec, embedvec.shape+(1,)) - norm = np.linalg.norm(embedvec) - if norm == 0: - continue - embedvec /= norm - embedvecs.append(embedvec) - category_bucket = [0]*len(classlabels) - category_bucket[lblidx_dict[classlabel]] = 1 - indices.append(category_bucket) - - indices = np.array(indices) - embedvecs = np.array(embedvecs) - return classlabels, embedvecs, indices - - def train(self, classdict, kerasmodel, nb_epoch=10): - """ Train the classifier. - - The training data and the corresponding keras model have to be given. - - If this has not been run, or a model was not loaded by :func:`~loadmodel`, - a `ModelNotTrainedException` will be raised while performing prediction and saving the model. - - :param classdict: training data - :param kerasmodel: keras sequential model - :param nb_epoch: number of steps / epochs in training - :return: None - :type classdict: dict - :type kerasmodel: keras.models.Sequential - :type nb_epoch: int - """ - # convert training data into embedded vectors - self.classlabels, train_embedvec, indices = self.convert_traindata_embedvecs(classdict) - - # train the model - kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch) - - # flag switch - self.model = kerasmodel - self.trained = True - - def savemodel(self, nameprefix): - """ Save the trained model into files. - - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There will be three files produced, one name ending with "_classlabels.txt", one name - ending with ".json", and one name ending with ".h5". - If there is no trained model, a `ModelNotTrainedException` will be thrown. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - kerasio.save_model(nameprefix, self.model) - labelfile = open(nameprefix+'_classlabels.txt', 'w') - labelfile.write('\n'.join(self.classlabels)) - labelfile.close() - - def loadmodel(self, nameprefix): - """ Load a trained model from files. - - Given the prefix of the file paths, load the model from files with name given by the prefix - followed by "_classlabels.txt", ".json", and ".h5". - - If this has not been run, or a model was not trained by :func:`~train`, - a `ModelNotTrainedException` will be raised while performing prediction and saving the model. - - :param nameprefix: prefix of the file path - :return: None - :type nameprefix: str - """ - self.model = kerasio.load_model(nameprefix) - labelfile = open(nameprefix+'_classlabels.txt', 'r') - self.classlabels = labelfile.readlines() - labelfile.close() - self.classlabels = map(lambda s: s.strip(), self.classlabels) - self.trained = True - - def word_to_embedvec(self, word): - """ Convert the given word into an embedded vector. - - Given a word, return the corresponding embedded vector according to - the word-embedding model. If there is no such word in the model, - a vector with zero values are given. 
- - :param word: a word - :return: the corresponding embedded vector - :type word: str - :rtype: numpy.ndarray - """ - return self.wvmodel[word] if word in self.wvmodel else np.zeros(self.vecsize) - - def shorttext_to_embedvec(self, shorttext): - """ Convert the short text into an averaged embedded vector representation. - - Given a short sentence, it converts all the tokens into embedded vectors according to - the given word-embedding model, sums - them up, and normalize the resulting vector. It returns the resulting vector - that represents this short sentence. - - :param shorttext: a short sentence - :return: an embedded vector that represents the short sentence - :type shorttext: str - :rtype: numpy.ndarray - """ - vec = np.zeros(self.vecsize) - for token in spacy_tokenize(shorttext): - if token in self.wvmodel: - vec += self.wvmodel[token] - norm = np.linalg.norm(vec) - if norm!=0: - vec /= np.linalg.norm(vec) - return vec - - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. - - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - :raise: ModelNotTrainedException - """ - if not self.trained: - raise e.ModelNotTrainedException() - - # retrieve vector - embedvec = np.array(self.shorttext_to_embedvec(shorttext)) - - # classification using the neural network - predictions = self.model.predict(np.array([embedvec])) - - # wrangle output result - scoredict = {classlabel: predictions[0][idx] for idx, classlabel in enumerate(self.classlabels)} - return scoredict \ No newline at end of file diff --git a/shorttext/classifiers/embed/sumvec/frameworks.py b/shorttext/classifiers/embed/sumvec/frameworks.py deleted file mode 100644 index 25bc7e5e..00000000 --- a/shorttext/classifiers/embed/sumvec/frameworks.py +++ /dev/null @@ -1,63 +0,0 @@ -from keras.layers import Dense -from keras.models import Sequential -from keras.regularizers import l2 - -from shorttext.utils.classification_exceptions import UnequalArrayLengthsException - - -def DenseWordEmbed(nb_labels, - dense_nb_nodes=[], - dense_actfcn=[], - vecsize=100, - reg_coef=0.1, - final_activiation='softmax', - optimizer='adam'): - """ Return layers of dense neural network. - - Return layers of dense neural network. This assumes the input to be a rank-1 vector. - - :param nb_labels: number of class labels - :param dense_nb_nodes: number of nodes in each later (Default: []) - :param dense_actfcn: activation functions for each layer (Default: []) - :param vecsize: length of the embedded vectors in the model (Default: 100) - :param reg_coef: regularization coefficient (Default: 0.1) - :param final_activiation: activation function of the final layer (Default: softmax) - :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. 
(Default: adam) - :return: keras sequential model for dense neural network - :type nb_labels: int - :type dense_nb_nodes: list - :type dense_actfcn: list - :type vecsize: int - :type reg_coef: float - :type final_activiation: str - :type optimizer: str - :rtype: keras.models.Sequential - """ - if len(dense_nb_nodes)!=len(dense_actfcn): - raise UnequalArrayLengthsException(dense_nb_nodes, dense_actfcn) - nb_layers = len(dense_nb_nodes) - - model = Sequential() - if nb_layers==0: - model.add(Dense(nb_labels, - input_shape=(vecsize,), - activation=final_activiation, - kernel_regularizer=l2(reg_coef))) - else: - model.add(Dense(dense_nb_nodes[0], - input_shape=(vecsize,), - activation=dense_actfcn[0], - kernel_regularizer=l2(reg_coef)) - ) - for nb_nodes, activation in zip(dense_nb_nodes[1:], dense_actfcn[1:]): - model.add(Dense(nb_nodes, - activation=activation, - kernel_regularizer=l2(reg_coef)) - ) - model.add(Dense(nb_labels, - activation=final_activiation, - kernel_regularizer=l2(reg_coef)) - ) - model.compile(loss='categorical_crossentropy', optimizer=optimizer) - - return model \ No newline at end of file diff --git a/shorttext/data/__init__.py b/shorttext/data/__init__.py deleted file mode 100644 index 58cf1524..00000000 --- a/shorttext/data/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .data_retrieval import subjectkeywords, nihreports, inaugual, retrieve_jsondata_as_dict, retrieve_csvdata_as_dict, yield_crossvalidation_classdicts diff --git a/shorttext/data/data_retrieval.py b/shorttext/data/data_retrieval.py deleted file mode 100644 index 5b5229f0..00000000 --- a/shorttext/data/data_retrieval.py +++ /dev/null @@ -1,208 +0,0 @@ -import random -from collections import defaultdict -import json -import os -import zipfile -from urllib import urlretrieve -import sys - -import pandas as pd -import numpy as np - - -def retrieve_csvdata_as_dict(filepath): - """ Retrieve the training data in a CSV file. - - Retrieve the training data in a CSV file, with the first column being the - class labels, and second column the text data. It returns a dictionary with - the class labels as keys, and a list of short texts as the value for each key. - - :param filepath: path of the training data (CSV) - :return: a dictionary with class labels as keys, and lists of short texts - :type filepath: str - :rtype: dict - """ - df = pd.read_csv(filepath) - category_col, descp_col = df.columns.values.tolist() - shorttextdict = defaultdict(lambda : []) - for category, descp in zip(df[category_col], df[descp_col]): - if type(descp)==str: - shorttextdict[category] += [descp] - return dict(shorttextdict) - -# for backward compatibility -def retrieve_data_as_dict(filepath): - """ Retrieve the training data in a CSV file. - - This calls :func:`~retrieve_csvdata_as_dict` for backward compatibility. - - :param filepath: path of the training data (CSV) - :return: a dictionary with class labels as keys, and lists of short texts - :type filepath: str - :rtype: dict - """ - return retrieve_csvdata_as_dict(filepath) - -def retrieve_jsondata_as_dict(filepath): - """ Retrieve the training data in a JSON file. - - Retrieve the training data in a JSON file, with - the class labels as keys, and a list of short texts as the value for each key. - It returns the corresponding dictionary. 
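For example, a JSON training file maps each class label to a list of short texts (file name and contents here are hypothetical):

>>> # training.json: {"mathematics": ["linear algebra", "calculus"], "physics": ["quantum mechanics"]}
>>> classdict = retrieve_jsondata_as_dict('training.json')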
- - :param filepath: path of the training data (JSON) - :return: a dictionary with class labels as keys, and lists of short texts - :type filepath: str - :rtype: dict - """ - return json.load(open(filepath, 'r')) - -def subjectkeywords(): - """ Return an example data set of subjects. - - Return an example data set, with three subjects and corresponding keywords. - This is in the format of the training input. - - :return: example data set - :rtype: dict - """ - this_dir, _ = os.path.split(__file__) - return retrieve_csvdata_as_dict(os.path.join(this_dir, 'shorttext_exampledata.csv')) - -def inaugual(): - """ Return an example dataset, which is the Inaugural Addresses of all Presidents of - the United States from George Washington to Barack Obama. - - Each key is the year, a dash, and the last name of the president. The content is - the list of all the sentences - - :return: example data set - :rtype: dict - """ - zfile = zipfile.ZipFile(get_or_download_data("USInaugural.zip", - "https://github.com/stephenhky/PyShortTextCategorization/blob/master/data/USInaugural.zip?raw=true")) - return json.loads(zfile.open("addresses.json").read()) - -def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512): - """ Return an example data set, sampled from NIH RePORT (Research Portfolio - Online Reporting Tools). - - Return an example data set from NIH (National Institutes of Health), - data publicly available from their RePORT - website. (`link - `_). - The data is with `txt_col` being either project titles ('PROJECT_TITLE') - or proposal abstracts ('ABSTRACT_TEXT'), and label_col being the names of the ICs (Institutes or Centers), - with 'IC_NAME' the whole form, and 'FUNDING_ICs' the abbreviated form). - - Dataset directly adapted from the NIH data from `R` package `textmineR - `_. - - :param txt_col: column for the text (Default: 'PROJECT_TITLE') - :param label_col: column for the labels (Default: 'FUNDING_ICs') - :param sample_size: size of the sample. Set to None if all rows. (Default: 512) - :return: example data set - :type txt_col: str - :type label_col: str - :type sample_size: int - :rtype: dict - """ - # validation - # txt_col = 'PROJECT_TITLE' or 'ABSTRACT_TEXT' - # label_col = 'FUNDING_ICs' or 'IC_NAME' - if not (txt_col in ['PROJECT_TITLE', 'ABSTRACT_TEXT']): - raise KeyError('Undefined text column: '+txt_col+'. Must be PROJECT_TITLE or ABSTRACT_TEXT.') - if not (label_col in ['FUNDING_ICs', 'IC_NAME']): - raise KeyError('Undefined label column: '+label_col+'. Must be FUNDING_ICs or IC_NAME.') - - zfile = zipfile.ZipFile(get_or_download_data('nih_full.csv.zip', - 'https://github.com/stephenhky/PyShortTextCategorization/blob/master/data/nih_full.csv.zip?raw=true') - ) - nih = pd.read_csv(zfile.open('nih_full.csv'), na_filter=False, usecols=[label_col, txt_col]) - nb_data = len(nih) - sample_size = nb_data if sample_size==None else min(nb_data, sample_size) - - classdict = defaultdict(lambda : []) - - for rowidx in np.random.randint(nb_data, size=min(nb_data, sample_size)): - label = nih.ix[rowidx, label_col] - if label_col=='FUNDING_ICs': - if label=='': - label = 'OTHER' - else: - endpos = label.index(':') - label = label[:endpos] - classdict[label] += [nih.ix[rowidx, txt_col]] - - return dict(classdict) - -def mergedict(dicts): - """ Merge data dictionary. - - Merge dictionaries of the data in the training data format. 
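A minimal example of the merge (labels shared across dicts have their lists concatenated):

>>> mergedict([{'a': ['x'], 'b': ['y']}, {'b': ['z']}])   # {'a': ['x'], 'b': ['y', 'z']}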
- - :param dicts: dicts to merge - :return: merged dict - :type dicts: list - :rtype: dict - """ - mdict = defaultdict(lambda : []) - for thisdict in dicts: - for label in thisdict: - mdict[label] += thisdict[label] - return dict(mdict) - -def yield_crossvalidation_classdicts(classdict, nb_partitions, shuffle=False): - """ Yield test data and training data for cross validation by partitioning the data. - - Given training data, partition the data into portions, each of which is used in turn as the - test set, with the remaining partitions merged as the training set. It returns a generator. - - :param classdict: training data - :param nb_partitions: number of partitions - :param shuffle: whether to shuffle the data before partitioning - :return: generator, producing a test data set and a training data set each time - :type classdict: dict - :type nb_partitions: int - :type shuffle: bool - :rtype: generator - """ - crossvaldicts = [] - for i in range(nb_partitions): - crossvaldicts.append(defaultdict(lambda: [])) - - for label in classdict: - nb_data = len(classdict[label]) - partsize = nb_data / nb_partitions - # copy before shuffling: random.shuffle works in place and returns None - sentences = list(classdict[label]) - if shuffle: - random.shuffle(sentences) - for i in range(nb_partitions): - crossvaldicts[i][label] += sentences[i * partsize:min(nb_data, (i + 1) * partsize)] - crossvaldicts = map(dict, crossvaldicts) - - for i in range(nb_partitions): - testdict = crossvaldicts[i] - traindict = mergedict([crossvaldicts[j] for j in range(nb_partitions) if j != i]) - yield testdict, traindict - -def get_or_download_data(filename, origin): - # determine path - homedir = os.path.expanduser('~') - datadir = os.path.join(homedir, '.shorttext') - if not os.path.exists(datadir): - os.makedirs(datadir) - - targetfilepath = os.path.join(datadir, filename) - # download if not exist - if not os.path.exists(os.path.join(datadir, filename)): - print 'Downloading...' - print 'Source: ', origin - print 'Target: ', targetfilepath - try: - urlretrieve(origin, targetfilepath) - except: - print 'Failure to download file!'
- print sys.exc_info() - os.remove(targetfilepath) - - # return - return open(targetfilepath, 'r') \ No newline at end of file diff --git a/shorttext/generators/__init__.py b/shorttext/generators/__init__.py deleted file mode 100644 index 9cd8fbe8..00000000 --- a/shorttext/generators/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .bow.GensimTopicModeling import load_gensimtopicmodel -from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel - -from .bow.GensimTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler -from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler \ No newline at end of file diff --git a/shorttext/generators/bow/AutoEncodingTopicModeling.py b/shorttext/generators/bow/AutoEncodingTopicModeling.py deleted file mode 100644 index 1271e4d0..00000000 --- a/shorttext/generators/bow/AutoEncodingTopicModeling.py +++ /dev/null @@ -1,223 +0,0 @@ -import json -import pickle -from operator import add - -import numpy as np -from gensim.corpora import Dictionary -from keras import Input -from keras.engine import Model -from keras.layers import Dense -from scipy.spatial.distance import cosine - -from .LatentTopicModeling import LatentTopicModeler -from utils import compactmodel_io as cio, classification_exceptions as e, kerasmodel_io as kerasio, \ - textpreprocessing as textpreprocess - -autoencoder_suffices = ['.gensimdict', '_encoder.json', '_encoder.h5', '_classtopicvecs.pkl', - '_decoder.json', '_decoder.h5', '_autoencoder.json', '_autoencoder.h5', - '.json'] - - -@cio.compactio({'classifier': 'kerasautoencoder'}, 'kerasautoencoder', autoencoder_suffices) -class AutoencodingTopicModeler(LatentTopicModeler): - """ - This class facilitates the topic modeling of input training data using the autoencoder. - - A reference about how an autoencoder is written with keras by Francois Chollet, titled - `Building Autoencoders in Keras - `_ . - - This class extends :class:`LatentTopicModeler`. - """ - def train(self, classdict, nb_topics, *args, **kwargs): - """ Train the autoencoder. 
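The layer wiring in the body below follows the Keras 1 calling convention (`Model(input=..., output=...)`); for reference, a sketch of the same encoder/decoder wiring under the Keras 2 API:

>>> from keras.layers import Input, Dense
>>> from keras.models import Model
>>> input_vec = Input(shape=(vecsize,))
>>> encoded = Dense(nb_topics, activation='relu')(input_vec)
>>> decoded = Dense(vecsize, activation='sigmoid')(encoded)
>>> autoencoder = Model(inputs=input_vec, outputs=decoded)
>>> encoder = Model(inputs=input_vec, outputs=encoded)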
- - :param classdict: training data - :param nb_topics: number of topics, i.e., the number of encoding dimensions - :param args: arguments to be passed to keras model fitting - :param kwargs: arguments to be passed to keras model fitting - :return: None - :type classdict: dict - :type nb_topics: int - """ - self.nb_topics = nb_topics - self.generate_corpus(classdict) - vecsize = len(self.dictionary) - - # define all the layers of the autoencoder - input_vec = Input(shape=(vecsize,)) - encoded = Dense(self.nb_topics, activation='relu')(input_vec) - decoded = Dense(vecsize, activation='sigmoid')(encoded) - - # define the autoencoder model - autoencoder = Model(input=input_vec, output=decoded) - - # define the encoder - encoder = Model(input=input_vec, output=encoded) - - # define the decoder - encoded_input = Input(shape=(self.nb_topics,)) - decoder_layer = autoencoder.layers[-1] - decoder = Model(input=encoded_input, output=decoder_layer(encoded_input)) - - # compile the autoencoder - autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy') - - # process training data - embedvecs = np.array(reduce(add, - [map(lambda shorttext: self.retrieve_bow_vector(shorttext, normalize=True), - classdict[classtype]) - for classtype in classdict] - ) - ) - - # fit the model - autoencoder.fit(embedvecs, embedvecs, *args, **kwargs) - - # store the autoencoder models - self.autoencoder = autoencoder - self.encoder = encoder - self.decoder = decoder - - # flag setting - self.trained = True - - # classes topic vector precomputation - self.classtopicvecs = {} - for label in classdict: - self.classtopicvecs[label] = self.precalculate_liststr_topicvec(classdict[label]) - - def retrieve_topicvec(self, shorttext): - """ Calculate the topic vector representation of the short text. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: short text - :return: encoded vector representation of the short text - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: numpy.ndarray - """ - if not self.trained: - raise e.ModelNotTrainedException() - bow_vector = self.retrieve_bow_vector(shorttext) - encoded_vec = self.encoder.predict(np.array([bow_vector]))[0] - if self.normalize: - encoded_vec /= np.linalg.norm(encoded_vec) - return encoded_vec - - def precalculate_liststr_topicvec(self, shorttexts): - """ Calculate the summed topic vectors for training data for each class. - - This function is called while training. - - :param shorttexts: list of short texts - :return: average topic vector - :raise: ModelNotTrainedException - :type shorttexts: list - :rtype: numpy.ndarray - """ - sumvec = sum(map(self.retrieve_topicvec, shorttexts)) - sumvec /= np.linalg.norm(sumvec) - return sumvec - - def get_batch_cos_similarities(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class labels. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. 
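A minimal usage sketch, assuming `trainclassdict` is training data in the usual label-to-texts dict format:

>>> modeler = AutoencodingTopicModeler()
>>> modeler.train(trainclassdict, 8)   # 8 encoding dimensions, for illustration
>>> modeler.get_batch_cos_similarities('linear algebra')   # dict: class label -> cosine similarity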
- - :param shorttext: short text - :return: dictionary of scores of the text to all classes - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: dict - """ - if not self.trained: - raise e.ModelNotTrainedException() - simdict = {} - for label in self.classtopicvecs: - simdict[label] = 1 - cosine(self.classtopicvecs[label], self.retrieve_topicvec(shorttext)) - return simdict - - def savemodel(self, nameprefix, save_complete_autoencoder=True): - """ Save the model with names according to the prefix. - - Given the prefix of the file paths, save the model into files, with name given by the prefix. - There are files with names ending with "_encoder.json" and "_encoder.h5", which are - the JSON and HDF5 files for the encoder respectively. They also include a gensim dictionary (.gensimdict). - - If `save_complete_autoencoder` is True, - then there are also files with names ending with "_decoder.json" and "_decoder.h5". - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param nameprefix: prefix of the paths of the file - :param save_complete_autoencoder: whether to store the decoder and the complete autoencoder (Default: True; but False for version <= 0.2.1) - :return: None - :type nameprefix: str - :type save_complete_autoencoder: bool - """ - if not self.trained: - raise e.ModelNotTrainedException() - - parameters = {} - parameters['nb_topics'] = self.nb_topics - parameters['classlabels'] = self.classlabels - json.dump(parameters, open(nameprefix+'.json', 'w')) - - self.dictionary.save(nameprefix+'.gensimdict') - kerasio.save_model(nameprefix+'_encoder', self.encoder) - if save_complete_autoencoder: - kerasio.save_model(nameprefix+'_decoder', self.decoder) - kerasio.save_model(nameprefix+'_autoencoder', self.autoencoder) - pickle.dump(self.classtopicvecs, open(nameprefix+'_classtopicvecs.pkl', 'wb')) - - def loadmodel(self, nameprefix, load_incomplete=False): - """ Load the model with names according to the prefix. - - Given the prefix of the file paths, load the model from files, with name given by the prefix. - There are files with names ending with "_encoder.json" and "_encoder.h5", which are - the JSON and HDF5 files for the encoder respectively. - They also include a gensim dictionary (.gensimdict). - - :param nameprefix: prefix of the paths of the file - :param load_incomplete: load encoder only, not decoder and autoencoder file (Default: False; put True for model built in version <= 0.2.1) - :return: None - :type nameprefix: str - :type load_incomplete: bool - """ - # load the JSON file (parameters) - parameters = json.load(open(nameprefix+'.json', 'r')) - self.nb_topics = parameters['nb_topics'] - self.classlabels = parameters['classlabels'] - - self.dictionary = Dictionary.load(nameprefix + '.gensimdict') - self.encoder = kerasio.load_model(nameprefix+'_encoder') - self.classtopicvecs = pickle.load(open(nameprefix+'_classtopicvecs.pkl', 'rb')) - if not load_incomplete: - self.decoder = kerasio.load_model(nameprefix+'_decoder') - self.autoencoder = kerasio.load_model(nameprefix+'_autoencoder') - self.trained = True - - -def load_autoencoder_topicmodel(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the autoencoding topic model from files. - - :param name: name (if compact=True) or prefix (if compact=False) of the paths of the model files - :param preprocessor: function that preprocesses the text.
(Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: an autoencoder as a topic modeler - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: generators.bow.AutoEncodingTopicModeling.AutoencodingTopicModeler - """ - autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor) - if compact: - autoencoder.load_compact_model(name) - else: - autoencoder.loadmodel(name) - return autoencoder \ No newline at end of file diff --git a/shorttext/generators/bow/GensimTopicModeling.py b/shorttext/generators/bow/GensimTopicModeling.py deleted file mode 100644 index aa03fa3e..00000000 --- a/shorttext/generators/bow/GensimTopicModeling.py +++ /dev/null @@ -1,318 +0,0 @@ -import json - -import gensim -import numpy as np -from gensim.corpora import Dictionary -from gensim.models import TfidfModel, LdaModel, LsiModel, RpModel -from gensim.similarities import MatrixSimilarity - -import shorttext.utils.classification_exceptions as e -import shorttext.utils.compactmodel_io as cio -from shorttext.utils import gensim_corpora as gc -from .LatentTopicModeling import LatentTopicModeler -from shorttext.utils import textpreprocessing as textpreprocess -from shorttext.utils.textpreprocessing import spacy_tokenize as tokenize - -gensim_topic_model_dict = {'lda': LdaModel, 'lsi': LsiModel, 'rp': RpModel} - - -class GensimTopicModeler(LatentTopicModeler): - """ - This class facilitates the creation of topic models (options: LDA (latent Dirichlet allocation), - LSI (latent semantic indexing), and RP (random projections)) - with the given short text training data, and converts future - short text into topic vectors using the trained topic model. - - No compact model I/O available for this class. Refer to - :class:`LDAModeler` and :class:`LSIModeler`. - - This class extends :class:`LatentTopicModeler`. - """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - algorithm='lda', - toweigh=True, - normalize=True): - """ Initialize the topic modeler. - - :param preprocessor: function that preprocesses the text. (Default: `utils.textpreprocess.standard_text_preprocessor_1`) - :param algorithm: algorithm for topic modeling. Options: lda, lsi, rp. (Default: lda) - :param toweigh: whether to weigh the words using tf-idf. (Default: True) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :type preprocessor: function - :type algorithm: str - :type toweigh: bool - :type normalize: bool - """ - LatentTopicModeler.__init__(self, preprocessor=preprocessor, normalize=normalize) - self.algorithm = algorithm - self.toweigh = toweigh - - def train(self, classdict, nb_topics, *args, **kwargs): - """ Train the topic modeler.
- - :param classdict: training data - :param nb_topics: number of latent topics - :param args: arguments to pass to the `train` method for gensim topic models - :param kwargs: arguments to pass to the `train` method for gensim topic models - :return: None - :type classdict: dict - :type nb_topics: int - """ - self.nb_topics = nb_topics - self.generate_corpus(classdict) - if self.toweigh: - self.tfidf = TfidfModel(self.corpus) - normcorpus = self.tfidf[self.corpus] - else: - self.tfidf = None - normcorpus = self.corpus - - self.topicmodel = gensim_topic_model_dict[self.algorithm](normcorpus, - num_topics=self.nb_topics, - *args, - **kwargs) - self.matsim = MatrixSimilarity(self.topicmodel[normcorpus]) - - # change the flag - self.trained = True - - def update(self, additional_classdict): - """ Update the model with additional data. - - It updates the topic model with additional data. - - Warning: it allows neither adding class labels nor new words. - The dictionary is not changed, so such an update alters the topic model only, - which affects the topic vector representation. While the corpus is updated, - the words used to compute the similarity matrix are not changed. - - This function is therefore meant for a fast update. - For a comprehensive model, retraining is recommended. - - :param additional_classdict: additional training data - :return: None - :type additional_classdict: dict - """ - # cannot use this way, as we want to update the corpus with existing words - self.corpus, newcorpus = gc.update_corpus_labels(self.dictionary, - self.corpus, - additional_classdict, - preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent))) - self.topicmodel.update(newcorpus) - - def retrieve_corpus_topicdist(self, shorttext): - """ Calculate the topic vector representation of the short text, in the corpus form. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: text to be represented - :return: topic vector in the corpus form - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: list - """ - if not self.trained: - raise e.ModelNotTrainedException() - bow = self.retrieve_bow(shorttext) - return self.topicmodel[self.tfidf[bow] if self.toweigh else bow] - - def retrieve_topicvec(self, shorttext): - """ Calculate the topic vector representation of the short text. - - This function calls :func:`~retrieve_corpus_topicdist`. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: text to be represented - :return: topic vector - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: numpy.ndarray - """ - if not self.trained: - raise e.ModelNotTrainedException() - topicdist = self.retrieve_corpus_topicdist(shorttext) - topicvec = np.zeros(self.nb_topics) - for topicid, frac in topicdist: - topicvec[topicid] = frac - if self.normalize: - topicvec /= np.linalg.norm(topicvec) - return topicvec - - def get_batch_cos_similarities(self, shorttext): - """ Calculate the score, which is the cosine similarity with the topic vector of the model, - of the short text against each class label. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.
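A short sketch of the update path described above, reusing the hypothetical `trainclassdict` from the earlier example. Only the LDA backend supports the incremental `update` call here, and unseen words are silently ignored because the dictionary is frozen:

modeler = GensimTopicModeler(algorithm='lda')
modeler.train(trainclassdict, 4)
modeler.update({'food': ['more bread examples']})  # fast, but reuses the old dictionary
modeler.train(trainclassdict, 4)                   # full retrain for a comprehensive refresh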
- - :param shorttext: short text - :return: dictionary of scores of the text to all classes - :raise: ModelNotTrainedException - :type shorttext: str - :rtype: dict - """ - if not self.trained: - raise e.ModelNotTrainedException() - simdict = {} - similarities = self.matsim[self.retrieve_corpus_topicdist(shorttext)] - for label, similarity in zip(self.classlabels, similarities): - simdict[label] = similarity - return simdict - - def loadmodel(self, nameprefix): - """ Load the topic model with the given prefix of the file paths. - - Given the prefix of the file paths, load the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - a topic model (.gensimmodel), and a similarity matrix (.gensimmat). If weighing is applied, - load also the tf-idf model (.gensimtfidf). - - :param nameprefix: prefix of the file paths - :return: None - :type nameprefix: str - """ - # load the JSON file (parameters) - parameters = json.load(open(nameprefix+'.json', 'r')) - self.nb_topics = parameters['nb_topics'] - self.toweigh = parameters['toweigh'] - self.algorithm = parameters['algorithm'] - self.classlabels = parameters['classlabels'] - - # load the dictionary - self.dictionary = Dictionary.load(nameprefix+'.gensimdict') - - # load the topic model - self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel') - - # load the similarity matrix - self.matsim = MatrixSimilarity.load(nameprefix+'.gensimmat') - - # load the tf-idf model - if self.toweigh: - self.tfidf = TfidfModel.load(nameprefix+'.gensimtfidf') - - # flag - self.trained = True - - def savemodel(self, nameprefix): - """ Save the model with names according to the prefix. - - Given the prefix of the file paths, save the corresponding topic model. The files - include a JSON (.json) file that specifies various parameters, a gensim dictionary (.gensimdict), - a topic model (.gensimmodel), and a similarity matrix (.gensimmat). If weighing is applied, - save also the tf-idf model (.gensimtfidf). - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param nameprefix: prefix of the file paths - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str - """ - if not self.trained: - raise e.ModelNotTrainedException() - parameters = {} - parameters['nb_topics'] = self.nb_topics - parameters['toweigh'] = self.toweigh - parameters['algorithm'] = self.algorithm - parameters['classlabels'] = self.classlabels - json.dump(parameters, open(nameprefix+'.json', 'w')) - - self.dictionary.save(nameprefix+'.gensimdict') - self.topicmodel.save(nameprefix+'.gensimmodel') - self.matsim.save(nameprefix+'.gensimmat') - if self.toweigh: - self.tfidf.save(nameprefix+'.gensimtfidf') - -lda_suffices = ['.json', '.gensimdict', '.gensimmodel.state', - '.gensimtfidf', '.gensimmodel', '.gensimmat'] -if gensim.__version__ >= '1.0.0': - lda_suffices += ['.gensimmodel.expElogbeta.npy', '.gensimmodel.id2word'] - -@cio.compactio({'classifier': 'ldatopic'}, 'ldatopic', lda_suffices) -class LDAModeler(GensimTopicModeler): - """ - This class facilitates the creation of LDA (latent Dirichlet allocation) topic models, - with the given short text training data, and converts future - short text into topic vectors using the trained topic model. - - This class extends :class:`GensimTopicModeler`.
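One caveat in the deleted block above: comparing version strings lexicographically misbehaves once a major version reaches double digits ('10.0.0' < '2.0.0' as strings). A safer, self-contained formulation, assuming the third-party `packaging` library is available:

from packaging.version import Version
import gensim

lda_suffices = ['.json', '.gensimdict', '.gensimmodel.state',
                '.gensimtfidf', '.gensimmodel', '.gensimmat']
# numeric-aware comparison instead of plain string comparison
if Version(gensim.__version__) >= Version('1.0.0'):
    lda_suffices += ['.gensimmodel.expElogbeta.npy', '.gensimmodel.id2word']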
- """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - toweigh=True, - normalize=True): - GensimTopicModeler.__init__(self, - preprocessor=preprocessor, - algorithm='lda', - toweigh=toweigh, - normalize=normalize) - -lsi_suffices = ['.json', '.gensimdict', '.gensimtfidf', '.gensimmodel.projection', - '.gensimmodel', '.gensimmat', ] - -@cio.compactio({'classifier': 'lsitopic'}, 'lsitopic', lsi_suffices) -class LSIModeler(GensimTopicModeler): - """ - This class facilitates the creation of LSI (latent semantic indexing) topic models, - with the given short text training data, and convert future - short text into topic vectors using the trained topic model. - - This class extends :class:`GensimTopicModeler`. - """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - toweigh=True, - normalize=True): - GensimTopicModeler.__init__(self, - preprocessor=preprocessor, - algorithm='lsi', - toweigh=toweigh, - normalize=normalize) - -rp_suffices = ['.json', '.gensimtfidf', '.gensimmodel', '.gensimmat', '.gensimdict'] - -@cio.compactio({'classifier': 'rptopic'}, 'rptopic', rp_suffices) -class RPModeler(GensimTopicModeler): - """ - This class facilitates the creation of RP (random projection) topic models, - with the given short text training data, and convert future - short text into topic vectors using the trained topic model. - - This class extends :class:`GensimTopicModeler`. - """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - toweigh=True, - normalize=True): - GensimTopicModeler.__init__(self, - preprocessor=preprocessor, - algorithm='rp', - toweigh=toweigh, - normalize=normalize) - - -def load_gensimtopicmodel(name, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - compact=True): - """ Load the gensim topic modeler from files. - - :param name: name (if compact=True) or prefix (if compact=False) of the file path - :param preprocessor: function that preprocesses the text. (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param compact: whether model file is compact (Default: True) - :return: a topic modeler - :type name: str - :type preprocessor: function - :type compact: bool - :rtype: GensimTopicModeler - """ - if compact: - modelerdict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler} - classifier_name = str(cio.get_model_classifier_name(name)) - - topicmodeler = modelerdict[classifier_name](preprocessor=preprocessor) - topicmodeler.load_compact_model(name) - return topicmodeler - else: - topicmodeler = GensimTopicModeler(preprocessor=preprocessor) - topicmodeler.loadmodel(name) - return topicmodeler - diff --git a/shorttext/generators/bow/LatentTopicModeling.py b/shorttext/generators/bow/LatentTopicModeling.py deleted file mode 100644 index 4d8e8863..00000000 --- a/shorttext/generators/bow/LatentTopicModeling.py +++ /dev/null @@ -1,137 +0,0 @@ -import numpy as np - -from shorttext.utils import textpreprocessing as textpreprocess, gensim_corpora as gc, classification_exceptions as e -from shorttext.utils.textpreprocessing import spacy_tokenize as tokenize - -# abstract class -class LatentTopicModeler: - """ - Abstract class for various topic modeler. - """ - def __init__(self, - preprocessor=textpreprocess.standard_text_preprocessor_1(), - normalize=True): - """ Initialize the modeler. - - :param preprocessor: function that preprocesses the text. 
(Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :type preprocessor: function - :type normalize: bool - """ - self.preprocessor = preprocessor - self.normalize = normalize - self.trained = False - - def generate_corpus(self, classdict): - """ Calculate the gensim dictionary and corpus, and extract the class labels - from the training data. Called by :func:`~train`. - - :param classdict: training data - :return: None - :type classdict: dict - """ - self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(classdict, - preprocess_and_tokenize=lambda sent: tokenize(self.preprocessor(sent))) - - def train(self, classdict, nb_topics, *args, **kwargs): - """ Train the modeler. - - This is an abstract method of this abstract class, which raises `NotImplementedException`. - - :param classdict: training data - :param nb_topics: number of latent topics - :param args: arguments to be passed into the wrapped training functions - :param kwargs: arguments to be passed into the wrapped training functions - :return: None - :raise: NotImplementedException - :type classdict: dict - :type nb_topics: int - """ - self.nb_topics = nb_topics - raise e.NotImplementedException() - - def retrieve_bow(self, shorttext): - """ Calculate the gensim bag-of-words representation of the given short text. - - :param shorttext: text to be represented - :return: corpus representation of the text - :type shorttext: str - :rtype: list - """ - return self.dictionary.doc2bow(tokenize(self.preprocessor(shorttext))) - - def retrieve_bow_vector(self, shorttext, normalize=True): - """ Calculate the vector representation of the bag-of-words in terms of numpy.ndarray. - - :param shorttext: short text - :param normalize: whether the retrieved topic vectors are normalized. (Default: True) - :return: vector representation of the text - :type shorttext: str - :type normalize: bool - :rtype: numpy.ndarray - """ - bow = self.retrieve_bow(shorttext) - vec = np.zeros(len(self.dictionary)) - for tokenid, val in bow: - vec[tokenid] = val - if normalize: - vec /= np.linalg.norm(vec) - return vec - - def retrieve_topicvec(self, shorttext): - """ Calculate the topic vector representation of the short text. - - This is an abstract method of this abstract class, which raises `NotImplementedException`. - - :param shorttext: short text - :return: topic vector - :raise: NotImplementedException - :type shorttext: str - :rtype: numpy.ndarray - """ - raise e.NotImplementedException() - - def get_batch_cos_similarities(self, shorttext): - """ Calculate the cosine similarities of the given short text and all the class labels. - - This is an abstract method of this abstract class, which raises `NotImplementedException`. - - :param shorttext: short text - :return: dictionary of cosine similarities of the text to each class label - :raise: NotImplementedException - :type shorttext: str - :rtype: dict - """ - raise e.NotImplementedException() - - def __getitem__(self, shorttext): - return self.retrieve_topicvec(shorttext) - - def __contains__(self, shorttext): - if not self.trained: - raise e.ModelNotTrainedException() - return True - - def loadmodel(self, nameprefix): - """ Load the model from files. - - This is an abstract method of this abstract class, which raises `NotImplementedException`.
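The `__getitem__` and `__contains__` hooks above give any trained subclass a small mapping-like interface; an illustrative sketch, where `modeler` stands for a trained modeler instance:

topicvec = modeler['fresh bread']   # same as modeler.retrieve_topicvec('fresh bread')
found = 'anything' in modeler       # True once trained; raises ModelNotTrainedException otherwise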
- - :param nameprefix: prefix of the paths of the model files - :return: None - :raise: NotImplementedException - :type nameprefix: str - """ - raise e.NotImplementedException() - - def savemodel(self, nameprefix): - """ Save the model to files. - - This is an abstract method of this abstract class, which raises `NotImplementedException`. - - :param nameprefix: prefix of the paths of the model files - :return: None - :raise: NotImplementedException - :type nameprefix: str - """ - raise e.NotImplementedException() \ No newline at end of file diff --git a/shorttext/smartload.py b/shorttext/smartload.py deleted file mode 100644 index d99307aa..00000000 --- a/shorttext/smartload.py +++ /dev/null @@ -1,45 +0,0 @@ - -from .utils import standard_text_preprocessor_1 -from .utils import compactmodel_io as cio -from .utils import classification_exceptions as e -from .classifiers import load_varnnlibvec_classifier, load_sumword2vec_classifier -from .generators import load_autoencoder_topicmodel, load_gensimtopicmodel -from .classifiers import load_autoencoder_topic_sklearnclassifier, load_gensim_topicvec_sklearnclassifier -from .classifiers import load_maxent_classifier - - -def smartload_compact_model(filename, wvmodel, preprocessor=standard_text_preprocessor_1()): - """ Load the appropriate classifier or model from the compact model file. - - The second parameter, `wvmodel`, can be set to `None` if no Word2Vec model is needed. - - :param filename: path of the compact model file - :param wvmodel: Word2Vec model - :param preprocessor: text preprocessor (Default: `shorttext.utils.textpreprocess.standard_text_preprocessor_1`) - :return: appropriate classifier or model - :raise: AlgorithmNotExistException - :type filename: str - :type wvmodel: gensim.models.keyedvectors.KeyedVectors - :type preprocessor: function - """ - classifier_name = cio.get_model_classifier_name(filename) - if classifier_name in ['ldatopic', 'lsitopic', 'rptopic']: - return load_gensimtopicmodel(filename, preprocessor=preprocessor, compact=True) - elif classifier_name in ['kerasautoencoder']: - return load_autoencoder_topicmodel(filename, preprocessor=preprocessor, compact=True) - elif classifier_name in ['topic_sklearn']: - topicmodel = cio.get_model_config_field(filename, 'topicmodel') - if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: - return load_gensim_topicvec_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) - elif topicmodel in ['kerasautoencoder']: - return load_autoencoder_topic_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) - else: - raise e.AlgorithmNotExistException(topicmodel) - elif classifier_name in ['nnlibvec']: - return load_varnnlibvec_classifier(wvmodel, filename, compact=True) - elif classifier_name in ['sumvec']: - return load_sumword2vec_classifier(wvmodel, filename, compact=True) - elif classifier_name in ['maxent']: - return load_maxent_classifier(filename, compact=True) - else: - raise e.AlgorithmNotExistException(classifier_name) \ No newline at end of file diff --git a/shorttext/stack/stacking.py b/shorttext/stack/stacking.py deleted file mode 100644 index a7888f33..00000000 --- a/shorttext/stack/stacking.py +++ /dev/null @@ -1,290 +0,0 @@ -import pickle - -import numpy as np -from keras.layers import Dense, Reshape -from keras.models import Sequential -from keras.regularizers import l2 - -import shorttext.utils.classification_exceptions as e -import shorttext.utils.kerasmodel_io as kerasio -import shorttext.utils.compactmodel_io as cio - -# abstract class -class
StackedGeneralization: - """ - This is an abstract class for any stacked generalization method. It is an intermediate model - that takes the results of other classifiers as the input features, and performs another classification. - - The classifiers must have the :func:`~score` method that takes a string as an input argument. - - More references: - - David H. Wolpert, "Stacked Generalization," *Neural Netw* 5: 241-259 (1992). - - M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization," - *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015). - """ - def __init__(self, intermediate_classifiers=None): - """ Initialize the stacking class instance. - - :param intermediate_classifiers: dictionary, with keys being strings and values being the intermediate classifiers, each having a method :func:`~score` that takes a string as the input argument. (Default: None, interpreted as an empty dictionary; a mutable default argument would be shared across instances.) - :type intermediate_classifiers: dict - """ - self.classifiers = intermediate_classifiers if intermediate_classifiers is not None else {} - self.classlabels = [] - self.trained = False - - def register_classifiers(self): - """ Register the intermediate classifiers. - - It must be run before any training. - - :return: None - """ - self.classifier2idx = {} - self.idx2classifier = {} - for idx, key in enumerate(self.classifiers.keys()): - self.classifier2idx[key] = idx - self.idx2classifier[idx] = key - - def register_classlabels(self, labels): - """ Register output labels. - - Given the labels, it assigns an integer index to each label, - which is essential for laying out the output of the model. - - It must be run before any training. - - :param labels: list of output labels - :return: None - :type labels: list - """ - self.classlabels = list(labels) - self.labels2idx = {classlabel: idx for idx, classlabel in enumerate(self.classlabels)} - - def add_classifier(self, name, classifier): - """ Add a classifier. - - Add a classifier to the class. The classifier must have the method :func:`~score` which - takes a string as an input argument. - - :param name: name of the classifier, without spaces and any special characters - :param classifier: instance of a classifier, which has a method :func:`~score` which takes a string as an input argument - :return: None - :type name: str - :type classifier: any class with a method :func:`~score` - """ - self.classifiers[name] = classifier - self.register_classifiers() - - def delete_classifier(self, name): - """ Delete a classifier. - - :param name: name of the classifier to be deleted - :return: None - :type name: str - :raise: KeyError - """ - del self.classifiers[name] - self.register_classifiers() - - def translate_shorttext_intfeature_matrix(self, shorttext): - """ Represent the given short text as the input matrix of the stacking class. - - :param shorttext: short text - :return: input matrix of the stacking class - :type shorttext: str - :rtype: numpy.ndarray - """ - feature_matrix = np.zeros((len(self.classifier2idx), len(self.labels2idx))) - for key in self.classifier2idx: - scoredict = self.classifiers[key].score(shorttext) - for label in scoredict: - feature_matrix[self.classifier2idx[key], self.labels2idx[label]] = scoredict[label] - return feature_matrix - - def convert_label_to_buckets(self, label): - """ Convert the label into an array of buckets. - - Some classification algorithms, especially neural networks, produce their output - as a series of buckets, with 1 at the correct label and 0 elsewhere.
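For concreteness, a sketch of the bucket encoding described here, using a hypothetical three-label registration (note that `np.int` is deprecated in modern NumPy, so plain `int` is used):

import numpy as np

labels2idx = {'food': 0, 'sports': 1, 'tech': 2}   # as built by register_classlabels
buckets = np.zeros(len(labels2idx), dtype=int)
buckets[labels2idx['sports']] = 1                  # yields array([0, 1, 0])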
- This method converts the label into the corresponding buckets. - - :param label: label - :return: array of buckets - :type label: str - :rtype: numpy.ndarray - """ - buckets = np.zeros(len(self.labels2idx), dtype=int) - buckets[self.labels2idx[label]] = 1 - return buckets - - def convert_traindata_matrix(self, classdict, tobucket=True): - """ Returns a generator that returns the input matrix and the output labels for training. - - :param classdict: dictionary of the training data - :param tobucket: whether to convert the label into buckets (Default: True) - :return: array of input matrix, and output labels - :type classdict: dict - :type tobucket: bool - :rtype: tuple - """ - for label in classdict: - y = self.convert_label_to_buckets(label) if tobucket else self.labels2idx[label] - for shorttext in classdict[label]: - X = self.translate_shorttext_intfeature_matrix(shorttext) - yield X, y - - def train(self, classdict, *args, **kwargs): - """ Train the stacked generalization. - - Not implemented. `NotImplementedException` raised. - - :param classdict: training data - :param args: arguments to be parsed - :param kwargs: arguments to be parsed - :return: None - :type classdict: dict - :type args: dict - :type kwargs: dict - :raise: NotImplementedException - """ - raise e.NotImplementedException() - - def score(self, shorttext, *args, **kwargs): - """ Calculate the scores for each class label. - - Not implemented. `NotImplementedException` raised. - - :param shorttext: short text to be scored - :param args: arguments to be parsed - :param kwargs: arguments to be parsed - :return: dictionary of scores for all class labels - :type shorttext: str - :type args: dict - :type kwargs: dict - :rtype: dict - :raise: NotImplementedException - """ - raise e.NotImplementedException() - -@cio.compactio({'classifier': 'stacked_logistics'}, 'stacked_logistics', - ['_stackedlogistics.pkl', '_stackedlogistics.h5', '_stackedlogistics.json']) -class LogisticStackedGeneralization(StackedGeneralization): - """ - This class implements logistic regression as the stacked generalizer. - - It is an intermediate model - that takes the results of other classifiers as the input features, and performs another classification. - - This class saves the stacked logistic model, but not the information of the primary model. - - The classifiers must have the :func:`~score` method that takes a string as an input argument. - """ - def train(self, classdict, optimizer='adam', l2reg=0.01, bias_l2reg=0.01, nb_epoch=1000): - """ Train the stacked generalization. - - :param classdict: training data - :param optimizer: optimizer to use. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam.
(Default: 'adam', for adam optimizer) - :param l2reg: coefficients for L2-regularization (Default: 0.01) - :param bias_l2reg: coefficients for L2-regularization for bias (Default: 0.01) - :param nb_epoch: number of epochs for training (Default: 1000) - :return: None - :type classdict: dict - :type optimizer: str - :type l2reg: float - :type bias_l2reg: float - :type nb_epoch: int - """ - # register - self.register_classifiers() - self.register_classlabels(classdict.keys()) - - kmodel = Sequential() - kmodel.add(Reshape((len(self.classifier2idx) * len(self.labels2idx),), - input_shape=(len(self.classifier2idx), len(self.labels2idx)))) - kmodel.add(Dense(units=len(classdict), - activation='sigmoid', - kernel_regularizer=l2(l2reg), - bias_regularizer=l2(bias_l2reg)) - ) - kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer) - - Xy = [(xone, yone) for xone, yone in self.convert_traindata_matrix(classdict, tobucket=True)] - X = np.array([item[0] for item in Xy]) - y = np.array([item[1] for item in Xy]) - - kmodel.fit(X, y, epochs=nb_epoch) - - self.model = kmodel - self.trained = True - - def score(self, shorttext): - """ Calculate the scores for all the class labels for the given short sentence. - - Given a short sentence, calculate the classification scores for all class labels, - returned as a dictionary with key being the class labels, and values being the scores. - If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param shorttext: a short sentence - :return: a dictionary with keys being the class labels, and values being the corresponding classification scores - :type shorttext: str - :rtype: dict - """ - if not self.trained: - raise e.ModelNotTrainedException() - - input_matrix = self.translate_shorttext_intfeature_matrix(shorttext) - prediction = self.model.predict(np.array([input_matrix])) - - scoredict = {label: prediction[0][idx] for idx, label in enumerate(self.classlabels)} - - return scoredict - - def savemodel(self, nameprefix): - """ Save the logistic stacked model into files. - - Save the stacked model into files. Note that the intermediate classifiers - are not saved. Users are advised to save those classifiers separately. - - If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`. - - :param nameprefix: prefix of the files - :return: None - :raise: ModelNotTrainedException - :type nameprefix: str - """ - if not self.trained: - raise e.ModelNotTrainedException() - - stackedmodeldict = {'classifiers': self.classifier2idx, - 'classlabels': self.classlabels} - pickle.dump(stackedmodeldict, open(nameprefix+'_stackedlogistics.pkl', 'wb')) - kerasio.save_model(nameprefix+'_stackedlogistics', self.model) - - def loadmodel(self, nameprefix): - """ Load the model with the given prefix. - - Load the model with the given prefix of their paths. Note that the intermediate - classifiers are not loaded, and users are required to load them separately.
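An end-to-end sketch of the logistic stacker deleted above; `clf_a` and `clf_b` stand for any trained shorttext classifiers exposing `score(text) -> dict`, and `trainclassdict` is the hypothetical training dictionary from the earlier examples:

stacker = LogisticStackedGeneralization(intermediate_classifiers={'a': clf_a, 'b': clf_b})
stacker.train(trainclassdict, nb_epoch=100)
scores = stacker.score('some short text')   # e.g. {'food': 0.7, 'tech': 0.3}
stacker.savemodel('/tmp/stacker')           # the intermediate classifiers must be saved separately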
- - :param nameprefix: prefix of the model files - :return: None - :type nameprefix: str - """ - stackedmodeldict = pickle.load(open(nameprefix+'_stackedlogistics.pkl', 'rb')) - self.register_classlabels(stackedmodeldict['classlabels']) - self.classifier2idx = stackedmodeldict['classifiers'] - self.idx2classifier = {} - for key, val in self.classifier2idx.items(): - self.idx2classifier[val] = key - - self.model = kerasio.load_model(nameprefix+'_stackedlogistics') - - self.trained = True - - - - diff --git a/shorttext/utils/__init__.py b/shorttext/utils/__init__.py deleted file mode 100644 index 8729b124..00000000 --- a/shorttext/utils/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from . import kerasmodel_io - from . import classification_exceptions -from . import gensim_corpora -from . import textpreprocessing -from .wordembed import load_word2vec_model -from . import compactmodel_io - -from .textpreprocessing import spacy_tokenize as tokenize -from .textpreprocessing import text_preprocessor, standard_text_preprocessor_1 - -from .deprecation import deprecated diff --git a/shorttext/utils/classification_exceptions.py b/shorttext/utils/classification_exceptions.py deleted file mode 100644 index a2e758ad..00000000 --- a/shorttext/utils/classification_exceptions.py +++ /dev/null @@ -1,24 +0,0 @@ - -class ModelNotTrainedException(Exception): - def __init__(self): - self.message = 'Model not trained.' - -class AlgorithmNotExistException(Exception): - def __init__(self, algoname): - self.message = 'Algorithm '+algoname+' does not exist.' - -class Word2VecModelNotExistException(Exception): - def __init__(self, path): - self.message = 'Given path of the Word2Vec model does not exist: '+path - -class UnequalArrayLengthsException(Exception): - def __init__(self, arr1, arr2): - self.message = 'Unequal lengths: '+str(len(arr1))+" and "+str(len(arr2)) - -class NotImplementedException(Exception): - def __init__(self): - self.message = 'Method not implemented.' - -class IncorrectClassificationModelFileException(Exception): - def __init__(self, expectedname, actualname): - self.message = 'Incorrect model (expected: '+expectedname+' ; actual: '+actualname+')' \ No newline at end of file diff --git a/shorttext/utils/compactmodel_io.py b/shorttext/utils/compactmodel_io.py deleted file mode 100644 index 6c2354d7..00000000 --- a/shorttext/utils/compactmodel_io.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -This module contains general routines to zip all model files into one compact file, so that the model -can be copied or transferred conveniently. - -The methods and decorators in this module are called by other code. It is not recommended for developers -to call them directly. -""" - -from tempfile import mkdtemp -import zipfile -import json -import os -from functools import partial - -from . import classification_exceptions as e - -def removedir(dir): - """ Remove all subdirectories and files under the specified path. - - :param dir: path of the directory to be cleaned - :return: None - """ - for filename in os.listdir(dir): - if os.path.isdir(os.path.join(dir, filename)): - removedir(os.path.join(dir, filename)) - else: - os.remove(os.path.join(dir, filename)) - os.rmdir(dir) - - -def save_compact_model(filename, savefunc, prefix, suffices, infodict): - """ Save the model in one compact file by zipping all the related files. - - :param filename: name of the model file - :param savefunc: method or function that performs the saving action. Only one argument (str), the prefix of the model files, to be passed.
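The recursive `removedir` above predates a simpler standard-library option; assuming there is no need to keep the hand-rolled version, the whole helper reduces to a thin wrapper:

import shutil

def removedir(dirpath):
    # remove the directory tree rooted at dirpath, files and subdirectories included
    shutil.rmtree(dirpath)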
- :param prefix: prefix of the names of the files - :param infodict: dictionary that holds information about the model. Must contain the key 'classifier'. - :return: instance of the model - :type filename: str - :type loadfunc: function - :type prefix: str - :type infodict: dict - """ - # create temporary directory - tempdir = mkdtemp() - - # unzipping - inputfile = zipfile.ZipFile(filename, mode='r') - inputfile.extractall(tempdir) - inputfile.close() - - # check model config - readinfodict = json.load(open(tempdir+'/modelconfig.json', 'r')) - if readinfodict['classifier'] != infodict['classifier']: - raise e.IncorrectClassificationModelFileException(infodict['classifier'], - readinfodict['classifier']) - - # load the model - returnobj = loadfunc(tempdir+'/'+prefix) - - # delete temporary files - removedir(tempdir) - - return returnobj - -# decorator that adds compact model methods to classifier dynamically -def CompactIOClassifier(Classifier, infodict, prefix, suffices): - """ Returns a decorated class object with additional methods for compact model I/O. - - The class itself must have methods :func:`loadmodel` and :func:`savemodel` that - take the prefix of the model files as the argument. - - :param Classifier: class to be decorated - :param infodict: information about the model. Must contain the key 'classifier'. - :param prefix: prefix of names of the model file - :param suffices: suffices of the names of the model file - :return: the decorated class - :type Classifier: classobj - :type infodict: dict - :type prefix: str - :type suffices: list - :rtype: classobj - """ - # define the derived class - class DressedClassifier(Classifier): - def save_compact_model(self, filename): - save_compact_model(filename, self.savemodel, prefix, suffices, infodict) - - def load_compact_model(self, filename): - return load_compact_model(filename, self.loadmodel, prefix, infodict) - - def get_info(self): - return {'classifier': infodict['classifier'], - 'prefix': prefix, - 'suffices': suffices} - - # return decorated classifier - return DressedClassifier - -# decorator for use -def compactio(infodict, prefix, suffices): - """ Returns a decorator that performs the decoration by :func:`CompactIOClassifier`. - - :param infodict: information about the model. Must contain the key 'classifier'.
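A sketch of how the decorator is meant to be applied; the class name and suffices here are hypothetical, while the pattern mirrors the `@cio.compactio(...)` usages earlier in this diff:

@compactio({'classifier': 'toymodel'}, 'toymodel', ['.json', '.h5'])
class ToyModel:
    def savemodel(self, nameprefix):
        ...  # would write nameprefix + '.json' and nameprefix + '.h5'

    def loadmodel(self, nameprefix):
        ...  # would read the same files back

# instances then also expose save_compact_model(filename) and load_compact_model(filename),
# which bundle those files into a single zip containing a modelconfig.json manifest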
- :param prefix: prefix of names of the model file - :param suffices: suffices of the names of the model file - :return: the decorator - :type infodict: dict - :type prefix: str - :type suffices: list - :rtype: function - """ - return partial(CompactIOClassifier, infodict=infodict, prefix=prefix, suffices=suffices) - -def get_model_config_field(filename, parameter): - """ Return the configuration parameter of a model file. - - Read the file `modelconfig.json` in the compact model file, and return - the value of a particular parameter. - - :param filename: path of the model file - :param parameter: parameter to look up - :return: value of the parameter of this model - :type filename: str - :type parameter: str - :rtype: str - """ - inputfile = zipfile.ZipFile(filename, mode='r') - readinfodict = json.load(inputfile.open('modelconfig.json', 'r')) - return readinfodict[parameter] - -def get_model_classifier_name(filename): - """ Return the name of the classifier from a model file. - - Read the file `modelconfig.json` in the compact model file, and return - the name of the classifier. - - :param filename: path of the model file - :return: name of the classifier - :type filename: str - :rtype: str - """ - return get_model_config_field(filename, 'classifier') \ No newline at end of file diff --git a/shorttext/utils/deprecation.py b/shorttext/utils/deprecation.py deleted file mode 100644 index 85516c79..00000000 --- a/shorttext/utils/deprecation.py +++ /dev/null @@ -1,14 +0,0 @@ -import warnings - -def deprecated(func): - """This is a decorator which can be used to mark functions - as deprecated. It will result in a warning being emitted - when the function is used.""" - def newFunc(*args, **kwargs): - warnings.warn("Call to deprecated function %s." % func.__name__, - category=DeprecationWarning) - return func(*args, **kwargs) - newFunc.__name__ = func.__name__ - newFunc.__doc__ = func.__doc__ - newFunc.__dict__.update(func.__dict__) - return newFunc \ No newline at end of file diff --git a/shorttext/utils/gensim_corpora.py b/shorttext/utils/gensim_corpora.py deleted file mode 100644 index 5c58250c..00000000 --- a/shorttext/utils/gensim_corpora.py +++ /dev/null @@ -1,73 +0,0 @@ -import gensim -from .textpreprocessing import spacy_tokenize as tokenize - -def generate_gensim_corpora(classdict, preprocess_and_tokenize=tokenize): - """ Generate gensim bag-of-words dictionary and corpus. - - Given text data, a dict with keys being the class labels and the values - being the lists of short texts, in the same format output by `shorttext.data.data_retrieval`, - return a gensim dictionary and corpus. - - :param classdict: text data, a dict with keys being the class labels, and each value is a list of short texts - :param preprocess_and_tokenize: preprocessor function, that takes a short sentence, and returns a list of tokens (Default: `shorttext.utils.tokenize`) - :return: a tuple, consisting of a gensim dictionary, a corpus, and a list of class labels - :type classdict: dict - :type preprocess_and_tokenize: function - :rtype: (gensim.corpora.Dictionary, list, list) - """ - classlabels = sorted(classdict.keys()) - doc = [preprocess_and_tokenize(' '.join(classdict[classlabel])) for classlabel in classlabels] - dictionary = gensim.corpora.Dictionary(doc) - corpus = [dictionary.doc2bow(doctokens) for doctokens in doc] - return dictionary, corpus, classlabels - -def save_corpus(dictionary, corpus, prefix): - """ Save gensim corpus and dictionary.
- - :param dictionary: dictionary to save - :param corpus: corpus to save - :param prefix: prefix of the files to save - :return: None - :type dictionary: gensim.corpora.Dictionary - :type corpus: list - :type prefix: str - """ - dictionary.save(prefix+'_dictionary.dict') - gensim.corpora.MmCorpus.serialize(prefix+'_corpus.mm', corpus) - - -def load_corpus(prefix): - """ Load gensim corpus and dictionary. - - :param prefix: prefix of the file to load - :return: corpus and dictionary - :type prefix: str - :rtype: tuple - """ - corpus = gensim.corpora.MmCorpus(prefix+'_corpus.mm') - dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') - return corpus, dictionary - - -def update_corpus_labels(dictionary, corpus, newclassdict, preprocess_and_tokenize=tokenize): - """ Update corpus with additional training data. - - With the additional training data, the corpus is updated; the dictionary itself is left unchanged. - - :param dictionary: original dictionary - :param corpus: original corpus - :param newclassdict: additional training data - :param preprocess_and_tokenize: preprocessor function, that takes a short sentence, and returns a list of tokens (Default: `shorttext.utils.tokenize`) - :return: a tuple, an updated corpus, and the new corpus (for updating model) - :type dictionary: gensim.corpora.Dictionary - :type corpus: list - :type newclassdict: dict - :type preprocess_and_tokenize: function - :rtype: tuple - """ - - newdoc = [preprocess_and_tokenize(' '.join(newclassdict[classlabel])) for classlabel in sorted(newclassdict.keys())] - newcorpus = [dictionary.doc2bow(doctokens) for doctokens in newdoc] - corpus += newcorpus - - return corpus, newcorpus diff --git a/shorttext/utils/kerasmodel_io.py b/shorttext/utils/kerasmodel_io.py deleted file mode 100644 index 720440b4..00000000 --- a/shorttext/utils/kerasmodel_io.py +++ /dev/null @@ -1,32 +0,0 @@ -from keras.models import model_from_json - -def save_model(nameprefix, model): - """ Save a keras sequential model into files. - - Given a keras sequential model, save the model with the given file path prefix. - It saves the model into a JSON file, and an HDF5 file (.h5). - - :param nameprefix: Prefix of the paths of the model files - :param model: keras sequential model to be saved - :return: None - :type nameprefix: str - :type model: keras.models.Sequential - """ - model_json = model.to_json() - open(nameprefix+'.json', 'w').write(model_json) - model.save_weights(nameprefix+'.h5') - -def load_model(nameprefix): - """ Load a keras sequential model from files. - - Given the prefix of the file paths, load a keras sequential model from - a JSON file and an HDF5 file.
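A round-trip sketch for the corpus helpers above; note that `save_corpus` takes the dictionary first while `load_corpus` returns the corpus first. The training dictionary is the hypothetical one used in the earlier examples, and the path is illustrative:

dictionary, corpus, classlabels = generate_gensim_corpora(trainclassdict)
save_corpus(dictionary, corpus, '/tmp/demo')
corpus2, dictionary2 = load_corpus('/tmp/demo')   # reversed order relative to save_corpus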
- - :param nameprefix: Prefix of the paths of the model files - :return: keras sequential model - :type nameprefix: str - :rtype: keras.models.Sequential - """ - model = model_from_json(open(nameprefix+'.json', 'r').read()) - model.load_weights(nameprefix+'.h5') - return model \ No newline at end of file diff --git a/shorttext/utils/stopwordset.pkl b/shorttext/utils/stopwordset.pkl deleted file mode 100644 index cfde1e9a..00000000 --- a/shorttext/utils/stopwordset.pkl +++ /dev/null @@ -1,4201 +0,0 @@ [4,201 deleted lines of raw protocol-0 pickle data, the serialized multilingual stop-word set, omitted as they are not human-readable]
-aVto -p1085 -aVniilt -p1086 -aVniiden -p1087 -aVderselbe -p1088 -aVti -p1089 -aV\u0438 -p1090 -aVkvar -p1091 -aVte -p1092 -aVta -p1093 -aVestando -p1094 -aVber -p1095 -aVestaba -p1096 -aVvery -p1097 -aVsono -p1098 -aVfai -p1099 -aVsont -p1100 -aVval -p1101 -aVtuvimos -p1102 -aVminden -p1103 -aVworden -p1104 -aVsinulla -p1105 -aVhendes -p1106 -aVjoista -p1107 -aVsinulle -p1108 -aVhabr -p1109 -aVts -p1110 -aVhabr -p1111 -aVtem -p1112 -aValtijd -p1113 -aVhaar -p1114 -aVkunnen -p1115 -aV\u043d\u0430\u0434\u043e -p1116 -aVfurther -p1117 -aVtes -p1118 -aVteu -p1119 -aV\u043d\u0435\u0435 -p1120 -aVwhat -p1121 -aVt -p1122 -aVhnest -p1123 -aVsua -p1124 -aVsuo -p1125 -aVsul -p1126 -aVsui -p1127 -aV\u043c\u043d\u043e\u0433\u043e -p1128 -aVsus -p1129 -aVsur -p1130 -aVdeles -p1131 -aVjede -p1132 -aViemand -p1133 -aVfarete -p1134 -aVhadde -p1135 -aV\u0442\u0430\u043c -p1136 -aVtoen -p1137 -aVegsz -p1138 -aV -p1139 -aVahhoz -p1140 -aVeras -p1141 -aVavesse -p1142 -aVstava -p1143 -aVdurch -p1144 -aVseris -p1145 -aVhnelle -p1146 -aVvid -p1147 -aVthn -p1148 -aVvil -p1149 -aVotros -p1150 -aVhogy -p1151 -aVfueras -p1152 -aVtutto -p1153 -aVminussa -p1154 -aVtutti -p1155 -aVvarte -p1156 -aVhouver -p1157 -aVdieselbe -p1158 -aVfueran -p1159 -aVsondern -p1160 -aVmore -p1161 -aVmellom -p1162 -aVdoor -p1163 -aVfusses -p1164 -aVhubieras -p1165 -aV\u0440\u0430\u0437 -p1166 -aVnk -p1167 -aVteist -p1168 -aVnerede -p1169 -aVder -p1170 -aVdes -p1171 -aVdet -p1172 -aVdei -p1173 -aVminhas -p1174 -aVdel -p1175 -aVdem -p1176 -aVden -p1177 -aVtuas -p1178 -aVdeg -p1179 -aVhnell -p1180 -aVwieder -p1181 -aVserais -p1182 -aVavemmo -p1183 -aVmesmo -p1184 -aVtant -p1185 -aVknnen -p1186 -aVvoltam -p1187 -aVvoltak -p1188 -aVnagyobb -p1189 -aVfu -p1190 -aVtuvieseis -p1191 -aVnotre -p1192 -aVtuona -p1193 -aV\u044d\u0442\u0438 -p1194 -aVnuma -p1195 -aVno -p1196 -aVa -p1197 -aVegy -p1198 -aVestuviramos -p1199 -aVkein -p1200 -aV\u0437\u0430\u0447\u0435\u043c -p1201 -aVise -p1202 -aVhayis -p1203 -aVthrough -p1204 -aVitt -p1205 -aVits -p1206 -aVzelf -p1207 -aValle -p1208 -aValla -p1209 -aVallo -p1210 -aVjoissa -p1211 -aVsinusta -p1212 -aVteidn -p1213 -aVhubisemos -p1214 -aVallt -p1215 -aVteidt -p1216 -aVhvor -p1217 -aVnossos -p1218 -aVmusste -p1219 -aV\u043f\u043e\u0447\u0442\u0438 -p1220 -aVyo -p1221 -aVjl -p1222 -aV\u043f\u0440\u043e -p1223 -aV\u043f\u0440\u0438 -p1224 -aVmitk -p1225 -aVces -p1226 -aVilyenkor -p1227 -aVtuvieran -p1228 -aVwrden -p1229 -aVdenne -p1230 -aVheihin -p1231 -aVdenna -p1232 -aVvannak -p1233 -aVft -p1234 -aVstemmo -p1235 -aVhubieseis -p1236 -aVnossa -p1237 -aV\u043d\u0438\u043c -p1238 -aVbir\u015fey -p1239 -aVugyanis -p1240 -aVtodo -p1241 -aV\u0432\u043f\u0440\u043e\u0447\u0435\u043c -p1242 -aVnosso -p1243 -aVeinem -p1244 -aVtenida -p1245 -aVserions -p1246 -aVmas -p1247 -aVtenido -p1248 -aVsuoi -p1249 -aVsiin -p1250 -aVjolle -p1251 -aVseis -p1252 -aVjolla -p1253 -aVestaremos -p1254 -aVlehet -p1255 -aVmukaan -p1256 -aV\u044f -p1257 -aVnokre -p1258 -aVtuohon -p1259 -aV\u043d\u0438\u0445 -p1260 -aVestada -p1261 -aVvoor -p1262 -aVnosotros -p1263 -aVestejamos -p1264 -aVtivesse -p1265 -aVmindenki -p1266 -aVhubiera -p1267 -aV\u043e\u043d -p1268 -aVdina -p1269 -aV\u043e\u0431 -p1270 -aVnach -p1271 -aVtants -p1272 -aVngot -p1273 -aVtuvieras -p1274 -aVtendremos -p1275 -aV\u0431\u044b -p1276 -aVtante -p1277 -aVjer -p1278 -aVkom -p1279 -aVteiss -p1280 -aVkon -p1281 -aVesas -p1282 -aVavec -p1283 -aVavez -p1284 -aVcontra -p1285 -aVjeg -p1286 -aV\u043a -p1287 -aVseamos -p1288 -aVcontro -p1289 -aVmos -p1290 -aVpara -p1291 -aVsera 
-p1292 -aVtive -p1293 -aVsta -p1294 -aVaan -p1295 -aVdans -p1296 -aVdann -p1297 -aVteve -p1298 -aV\u043e\u0442 -p1299 -aVteriam -p1300 -aVheilt -p1301 -aVtiver -p1302 -aViin -p1303 -aVfaccio -p1304 -aVsnn -p1305 -aVnoiksi -p1306 -aVestuvieses -p1307 -aV\u0442\u0440\u0438 -p1308 -aVeuch -p1309 -aVfaccia -p1310 -aVhenne -p1311 -aValso -p1312 -aVnin -p1313 -aVestuviesen -p1314 -aVtodos -p1315 -aVnuestras -p1316 -aVselv -p1317 -aVszinte -p1318 -aVtuve -p1319 -aVestivesse -p1320 -aVtuvo -p1321 -aVkan -p1322 -aVessa -p1323 -aVmost -p1324 -aVesse -p1325 -aV\u0431\u044b\u0442\u044c -p1326 -aVtivera -p1327 -aVminha -p1328 -aVmeine -p1329 -aVki -p1330 -aVvarfr -p1331 -aVhubieses -p1332 -aVfossi -p1333 -aVhubiesen -p1334 -aVstessimo -p1335 -aVfosse -p1336 -aVetes -p1337 -aV\u043a\u043e\u0433\u0434\u0430 -p1338 -aVezek -p1339 -aVket -p1340 -aVtened -p1341 -aVolyan -p1342 -aVquem -p1343 -aVmina -p1344 -aV\u0433\u0434\u0435 -p1345 -aVfaresti -p1346 -aVfareste -p1347 -aVvalami -p1348 -aVjoiksi -p1349 -aVporque -p1350 -aVsteste -p1351 -aVhis -p1352 -aVasl\u0131nda -p1353 -aVmein -p1354 -aVesteja -p1355 -aVestivssemos -p1356 -aVstesti -p1357 -aVmo -p1358 -aVstando -p1359 -aVduring -p1360 -aVhij -p1361 -aV\u0442\u044b -p1362 -aVma -p1363 -aVhim -p1364 -aVhin -p1365 -aVhouveriam -p1366 -aV\u0445\u043e\u0440\u043e\u0448\u043e -p1367 -aVvilket -p1368 -aVvissza -p1369 -aVseu -p1370 -aVsto -p1371 -aVses -p1372 -aVfuesen -p1373 -aVseg -p1374 -aVfueses -p1375 -aVegyre -p1376 -aVbare -p1377 -aVare -p1378 -aVsea -p1379 -aVsen -p1380 -aVsem -p1381 -aVsei -p1382 -aVingi -p1383 -aV\u0431\u0443\u0434\u0435\u0442 -p1384 -aVinkje -p1385 -aVsonst -p1386 -aVdein -p1387 -aVdeim -p1388 -aVsoll -p1389 -aVdalla -p1390 -aVjeden -p1391 -aVjedem -p1392 -aVdalle -p1393 -aVdallo -p1394 -aVison -p1395 -aVestivemos -p1396 -aVhi -p1397 -aVjedes -p1398 -aVjeder -p1399 -aVboth -p1400 -aVc -p1401 -aVquelle -p1402 -aVolisi -p1403 -aVmink -p1404 -aVseran -p1405 -aVquelli -p1406 -aVsamma -p1407 -aVavr -p1408 -aVsamme -p1409 -aVolla -p1410 -aVavr -p1411 -aVauriez -p1412 -aVhajamos -p1413 -aVltalban -p1414 -aVfummo -p1415 -aVestuvisteis -p1416 -aVteus -p1417 -aVwhom -p1418 -aV\u043c\u043e\u0436\u043d\u043e -p1419 -aV\u0436 -p1420 -aVollut -p1421 -aVdus -p1422 -aVamelyeket -p1423 -aVjohon -p1424 -aVestuvierais -p1425 -aVfut -p1426 -aVfus -p1427 -aVtenamos -p1428 -aVmindig -p1429 -aVfue -p1430 -aVfui -p1431 -aValatt -p1432 -aVketk -p1433 -aVvom -p1434 -aVhnen -p1435 -aVtaient -p1436 -aVvoi -p1437 -aVhnet -p1438 -aVitself -p1439 -aVvor -p1440 -aVvos -p1441 -aVns -p1442 -aVacaba -p1443 -aVfueron -p1444 -aVkeiss -p1445 -aVestes -p1446 -aVnokon -p1447 -aVkeiden -p1448 -aVzwar -p1449 -aVeure -p1450 -aVcikkeket -p1451 -aVnokor -p1452 -aVels -p1453 -aVentre -p1454 -aVtengis -p1455 -aV\u0437\u0434\u0435\u0441\u044c -p1456 -aVeles -p1457 -aVk -p1458 -aVy -p1459 -aVstesse -p1460 -aVestadas -p1461 -aVstessi -p1462 -aVskal -p1463 -aVnuestra -p1464 -aVniill -p1465 -aVnuestro -p1466 -aVbliver -p1467 -aVolisin -p1468 -aVvagyis -p1469 -aVe\u011fer -p1470 -aVtendr -p1471 -aVolisit -p1472 -aVkeihin -p1473 -aVhabras -p1474 -aVcui -p1475 -aVbin -p1476 -aVhennes -p1477 -aVbij -p1478 -aVaviez -p1479 -aVhayan -p1480 -aVhabida -p1481 -aVhayas -p1482 -aVbiz -p1483 -aVbis -p1484 -aVhabido -p1485 -aVfacciamo -p1486 -aVtantes -p1487 -aVhouveram -p1488 -aVjota -p1489 -aVvalamint -p1490 -aVeen -p1491 -aVsokkal -p1492 -aVmycket -p1493 -aVihrer -p1494 -aVsue -p1495 -aVjabb -p1496 -aVkenell -p1497 -aVsome -p1498 -aVhinter -p1499 -aVtovbb -p1500 -aVt -p1501 -aVilyen -p1502 
-aVaquele -p1503 -aVourselves -p1504 -aVaquela -p1505 -aVminulta -p1506 -aVterei -p1507 -aVnhny -p1508 -aVper -p1509 -aValgunas -p1510 -aVpelo -p1511 -aVest -p1512 -aVpela -p1513 -aVbe -p1514 -aVnello -p1515 -aVnella -p1516 -aVnelle -p1517 -aV\u0447\u0442\u043e\u0431\u044b -p1518 -aVbu -p1519 -aVmutta -p1520 -aVweil -p1521 -aVby -p1522 -aVvon -p1523 -aVbist -p1524 -aV\u0432\u0441\u0435 -p1525 -aVfomos -p1526 -aVsaranno -p1527 -aVyli -p1528 -aVinte -p1529 -aVteniendo -p1530 -aVinto -p1531 -aVkeneen -p1532 -aVgewesen -p1533 -aVkanssa -p1534 -aVvaan -p1535 -aVneki -p1536 -aVheeft -p1537 -aVdegli -p1538 -aVfossero -p1539 -aVkeneksi -p1540 -aVavremmo -p1541 -aVhnt -p1542 -aVsuis -p1543 -aVdeira -p1544 -aVdessa -p1545 -aVazok -p1546 -aVkunde -p1547 -aVazon -p1548 -aVutna -p1549 -aV\u043f\u043e\u0441\u043b\u0435 -p1550 -aV -p1551 -aVver -p1552 -aVut -p1553 -aVuw -p1554 -aVup -p1555 -aVcikk -p1556 -aVum -p1557 -aVun -p1558 -aVtuolle -p1559 -aVud -p1560 -aVnogle -p1561 -aV\u043c\u044b -p1562 -aV\u043b\u0443\u0447\u0448\u0435 -p1563 -aV\u043f\u0435\u0440\u0435\u0434 -p1564 -aV\u0434\u043e -p1565 -aVnoko -p1566 -aVelles -p1567 -aVeller -p1568 -aV\u0434\u0440\u0443\u0433\u043e\u0439 -p1569 -aVwollte -p1570 -aV\u0434\u0430 -p1571 -aVnas -p1572 -aVellen -p1573 -aVhanem -p1574 -aVcuando -p1575 -aVsiden -p1576 -aVtemos -p1577 -aVavevo -p1578 -aVavevi -p1579 -aVeures -p1580 -aVdall -p1581 -aVstarai -p1582 -aVderselben -p1583 -aVtall -p1584 -aVdos -p1585 -aVeuren -p1586 -aVgy -p1587 -aVeurem -p1588 -aVtendrais -p1589 -aVagl -p1590 -aVe -p1591 -aVtmn -p1592 -aVmuchos -p1593 -aVhness -p1594 -aVtendra -p1595 -aVhaving -p1596 -aVonce -p1597 -aVtes -p1598 -aVsitta -p1599 -aVessas -p1600 -aVkvl -p1601 -aVkenest -p1602 -aVge -p1603 -aVstavi -p1604 -aVstavo -p1605 -aVknnte -p1606 -aVsarei -p1607 -aVnuestros -p1608 -aVtaln -p1609 -aVstiate -p1610 -aVtivermos -p1611 -aVniets -p1612 -aVmaar -p1613 -aVpersze -p1614 -aVyourselves -p1615 -aVsin -p1616 -aVtra -p1617 -aVtuosta -p1618 -aV\u043a\u0442\u043e -p1619 -aVniit -p1620 -aVtinha -p1621 -aVstarebbero -p1622 -aVblitt -p1623 -aVzo -p1624 -aVze -p1625 -aVvele -p1626 -aVngon -p1627 -aVzu -p1628 -aVeurent -p1629 -aVbiri -p1630 -aVeinige -p1631 -aVindem -p1632 -aVtivramos -p1633 -aVlei -p1634 -aVtais -p1635 -aVles -p1636 -aVtait -p1637 -aVsind -p1638 -aVsine -p1639 -aVsina -p1640 -aVhonom -p1641 -aVavreste -p1642 -aVtegen -p1643 -aVavresti -p1644 -aVkeresztl -p1645 -aV\u0447\u0442\u043e -p1646 -aVovat -p1647 -aVwie -p1648 -aVwil -p1649 -aVamelynek -p1650 -aVett -p1651 -aVvolt -p1652 -aVwir -p1653 -aVzijn -p1654 -aV\u043d\u0438\u0431\u0443\u0434\u044c -p1655 -aVmeihin -p1656 -aVviszont -p1657 -aVfrom -p1658 -aVche -p1659 -aVchi -p1660 -aVfel -p1661 -aV\u0447\u0435\u0440\u0435\u0437 -p1662 -aVfew -p1663 -aVkuin -p1664 -aVestabais -p1665 -aVmindent -p1666 -aVthemselves -p1667 -aVzij -p1668 -aV\u043e -p1669 -aVfar -p1670 -aVestars -p1671 -aVslik -p1672 -aVestuvieseis -p1673 -aVtm -p1674 -aVfar -p1675 -aVvuestra -p1676 -aVfarebbe -p1677 -aVhatte -p1678 -aVthis -p1679 -aVsiksi -p1680 -aVnekem -p1681 -aVpour -p1682 -aVmeer -p1683 -aVvotre -p1684 -aVfaceste -p1685 -aVreeds -p1686 -aVheidt -p1687 -aVazrt -p1688 -aVette -p1689 -aVheidn -p1690 -aVzwischen -p1691 -aVseriam -p1692 -aVtai -p1693 -aVfrn -p1694 -aVsit -p1695 -aVsiz -p1696 -aVsia -p1697 -aVsig -p1698 -aVwaren -p1699 -aVcual -p1700 -aVdelas -p1701 -aVitse -p1702 -aVsin -p1703 -aV\u043f\u043e -p1704 -aVfacevate -p1705 -aVhouve -p1706 -aVftes -p1707 -aVestivramos -p1708 -aVtenis -p1709 -aVisso -p1710 -aVbe -p1711 -aVestarn 
-p1712 -aVj -p1713 -aVolisitte -p1714 -aVazonban -p1715 -aV\u043e\u0434\u0438\u043d -p1716 -aVmg -p1717 -aVle -p1718 -aVla -p1719 -aVeue -p1720 -aVlo -p1721 -aV\u0432\u043e -p1722 -aVli -p1723 -aVdemselben -p1724 -aVkeiksi -p1725 -aVeux -p1726 -aVeut -p1727 -aVeus -p1728 -aVsie -p1729 -aVdal -p1730 -aVdan -p1731 -aVnill -p1732 -aVdai -p1733 -aVdat -p1734 -aVdoch -p1735 -aVdas -p1736 -aVqu -p1737 -aV\u0432\u044b -p1738 -aVstette -p1739 -aVstetti -p1740 -aVhossen -p1741 -aVnm -p1742 -aVsolches -p1743 -aVsolcher -p1744 -aVvilken -p1745 -aVhabra -p1746 -aVhubisteis -p1747 -aVdoing -p1748 -aVmijn -p1749 -aVjoilta -p1750 -aVolemme -p1751 -aVour -p1752 -aVsolchen -p1753 -aVsolchem -p1754 -aVout -p1755 -aVtuya -p1756 -aVtuyo -p1757 -aVolette -p1758 -aVstessero -p1759 -aVomdat -p1760 -aVderas -p1761 -aVfuerais -p1762 -aVfaceva -p1763 -aVeravamo -p1764 -aVmeilt -p1765 -aVnille -p1766 -aVfacevo -p1767 -aVformos -p1768 -aVill. -p1769 -aVque -p1770 -aVqui -p1771 -aVfuimos -p1772 -aVmilyen -p1773 -aVsintiendo -p1774 -aV\u0434\u0430\u0436\u0435 -p1775 -aVtuvierais -p1776 -aV\u0434\u043b\u044f -p1777 -aVda -p1778 -aVihr -p1779 -aVfurono -p1780 -aVsjl -p1781 -aV\u015fey -p1782 -aVsuya -p1783 -aVihn -p1784 -aVakkor -p1785 -aVilletve -p1786 -aVihm -p1787 -aVsuyo -p1788 -aVestara -p1789 -aVestemos -p1790 -aVtheir -p1791 -aV\u0432 -p1792 -aVabbia -p1793 -aVmill -p1794 -aVteilt -p1795 -aVblivit -p1796 -aVheb -p1797 -aVquando -p1798 -aVebbero -p1799 -aV\u0441\u0435\u0439\u0447\u0430\u0441 -p1800 -aV\u0431\u043e\u043b\u044c\u0448\u0435 -p1801 -aVherself -p1802 -aV\u0432\u0434\u0440\u0443\u0433 -p1803 -aVsinuun -p1804 -aVbei -p1805 -aVben -p1806 -aV\u0432\u0441\u0435\u0445 -p1807 -aVnr -p1808 -aVhouvemos -p1809 -aVseinem -p1810 -aVseinen -p1811 -aVelg -p1812 -aV\u0447\u0442\u043e\u0431 -p1813 -aV\u0431\u0435\u0437 -p1814 -aVheit -p1815 -aVeusses -p1816 -aVblir -p1817 -aVhave -p1818 -aVseiner -p1819 -aVseines -p1820 -aVmij -p1821 -aV\u043d\u0435\u0433\u043e -p1822 -aVmio -p1823 -aVmin -p1824 -aVmia -p1825 -aVmie -p1826 -aVwhrend -p1827 -aVmig -p1828 -aVisto -p1829 -aVwhich -p1830 -aVseront -p1831 -aVmille -p1832 -aVmir -p1833 -aVmit -p1834 -aVserons -p1835 -aVteria -p1836 -aV\u044d\u0442\u0443 -p1837 -aVeres -p1838 -aVtlt -p1839 -aVwho -p1840 -aVdetta -p1841 -aVnoina -p1842 -aVestivera -p1843 -aVmange -p1844 -aVsedan -p1845 -aVheiss -p1846 -aVnas\u0131l -p1847 -aVwhy -p1848 -aVmedan -p1849 -aVhouvessem -p1850 -aVvret -p1851 -aVdenn -p1852 -aVtuot -p1853 -aVkenelle -p1854 -aVmuy -p1855 -aVnagy -p1856 -aVismt -p1857 -aVniet -p1858 -aVsoyez -p1859 -aVhouvero -p1860 -aVsdant -p1861 -aVmoet -p1862 -aVhade -p1863 -aVshould -p1864 -aVsdana -p1865 -aV\u0442\u0435\u0431\u044f -p1866 -aVforem -p1867 -aVnoin -p1868 -aVestar -p1869 -aVayez -p1870 -aVjoina -p1871 -aVsinun -p1872 -aVdeine -p1873 -aVsinua -p1874 -aVavete -p1875 -aVwordt -p1876 -aVestarais -p1877 -aVsinut -p1878 -aVviel -p1879 -aVkeiner -p1880 -aVkeines -p1881 -aVket -p1882 -aVestar -p1883 -aVshe -p1884 -aVkeinem -p1885 -aVkeinen -p1886 -aVaquelas -p1887 -aVteramos -p1888 -aVvrt -p1889 -aVahol -p1890 -aVvre -p1891 -aVtengo -p1892 -aVmiei -p1893 -aVsehr -p1894 -aVhabrais -p1895 -aVfacciano -p1896 -aVfuese -p1897 -aVtuvieron -p1898 -aVnossas -p1899 -aVlui -p1900 -aVjos -p1901 -aVhaja -p1902 -aVfaremo -p1903 -aVavuti -p1904 -aVtambin -p1905 -aVavuto -p1906 -aVkez -p1907 -aVavuta -p1908 -aVavute -p1909 -aV\u0442\u0435\u043c -p1910 -aVett -p1911 -aV -p1912 -aVjoka -p1913 -aVszerint -p1914 -aVci -p1915 -aV\u043d\u0435\u0442 -p1916 -aVsit -p1917 -aVtes -p1918 -aVce 
-p1919 -aVhouver -p1920 -aV\u0436\u0435 -p1921 -aVblive -p1922 -aVniihin -p1923 -aVtiene -p1924 -aVsjlv -p1925 -aV\u0442\u043e\u0442 -p1926 -aVfarai -p1927 -aVests -p1928 -aVmg -p1929 -aVestn -p1930 -aVai -p1931 -aVtenidos -p1932 -aVestava -p1933 -aVigen -p1934 -aV\u0443\u0436 -p1935 -aVestis -p1936 -aVseras -p1937 -aVmeus -p1938 -aVsiamo -p1939 -aVile -p1940 -aV\u0431\u044b\u043b\u0430 -p1941 -aVours -p1942 -aVvort -p1943 -aVott -p1944 -aVestuvimos -p1945 -aVfacessero -p1946 -aVhubo -p1947 -aVtivssemos -p1948 -aVinn -p1949 -aVhube -p1950 -aVwill -p1951 -aVestiver -p1952 -aVkeilt -p1953 -aVniiksi -p1954 -aVvilkas -p1955 -aVnr -p1956 -aV\u043d\u0435\u0439 -p1957 -aVat -p1958 -aVkuka -p1959 -aVnous -p1960 -aVve -p1961 -aVvi -p1962 -aV\u0442\u043e\u043b\u044c\u043a\u043e -p1963 -aVnincs -p1964 -aVsitt -p1965 -aV\u0435\u043c\u0443 -p1966 -aVsar -p1967 -aVsar -p1968 -aVwelche -p1969 -aVnada -p1970 -aVmik -p1971 -aVnist -p1972 -aV\u0441\u0432\u043e\u044e -p1973 -aVveel -p1974 -aVessendo -p1975 -aVpelos -p1976 -aVestado -p1977 -aVsugli -p1978 -aVoff -p1979 -aVi -p1980 -aVavevate -p1981 -aVestuvisemos -p1982 -aVcon -p1983 -aVlesz -p1984 -aVmeiss -p1985 -aVweg -p1986 -aVok -p1987 -aVestaramos -p1988 -aVhabramos -p1989 -aVhouvermos -p1990 -aVdisse -p1991 -aVutan -p1992 -aVdess -p1993 -aVjonka -p1994 -aVayant -p1995 -aVmit -p1996 -aVhouvesse -p1997 -aVsokat -p1998 -aVemme -p1999 -aVhubimos -p2000 -aV\u0435\u0439 -p2001 -aV\u0435\u0435 -p2002 -aVteremos -p2003 -aVbecause -p2004 -aVmihin -p2005 -aVotras -p2006 -aV\u043d\u0430\u043a\u043e\u043d\u0435\u0446 -p2007 -aVwerden -p2008 -aVjotka -p2009 -aVcsak -p2010 -aVest -p2011 -aVdagl -p2012 -aVfanno -p2013 -aVdoen -p2014 -aVese -p2015 -aVdoes -p2016 -aVesa -p2017 -aVeso -p2018 -aVdesde -p2019 -aVteihin -p2020 -aVmitt -p2021 -aV\u043b\u0438 -p2022 -aVfordi -p2023 -aVayons -p2024 -aVhoss -p2025 -aVniille -p2026 -aVabout -p2027 -aVtambm -p2028 -aVanden -p2029 -aVestbamos -p2030 -aVonder -p2031 -aVander -p2032 -aVhier -p2033 -aVeinig -p2034 -aVem -p2035 -aVown -p2036 -aVsajt -p2037 -aVstanno -p2038 -aVmert -p2039 -aVpi -p2040 -aVdonde -p2041 -aVeine -p2042 -aVstaremo -p2043 -aVvai -p2044 -aVvan -p2045 -aVeens -p2046 -aVeivt -p2047 -aVayantes -p2048 -aVvad -p2049 -aVquante -p2050 -aVeussions -p2051 -aVquanta -p2052 -aVvar -p2053 -aVquanto -p2054 -aVquanti -p2055 -aVazzal -p2056 -aVfaranno -p2057 -aVbut -p2058 -aVkeness -p2059 -aVho -p2060 -aVha -p2061 -aVhe -p2062 -aVezrt -p2063 -aVnha -p2064 -aVdagli -p2065 -aVhouvramos -p2066 -aVj -p2067 -aVbelow -p2068 -aVfueseis -p2069 -aVsein -p2070 -aVhvad -p2071 -aVvuestros -p2072 -aVamelyek -p2073 -aVins -p2074 -aVvostri -p2075 -aVamelyet -p2076 -aVvostro -p2077 -aVind -p2078 -aVvostra -p2079 -aVvostre -p2080 -aVarra -p2081 -aVtendrn -p2082 -aVdeires -p2083 -aVsiano -p2084 -aVother -p2085 -aVseus -p2086 -aVsich -p2087 -aV\u0435\u0449\u0435 -p2088 -aVsarete -p2089 -aVogs -p2090 -aVutn -p2091 -aVagli -p2092 -aVhubieron -p2093 -aV\u043f\u043e\u0442\u043e\u043c -p2094 -aVh -p2095 -aVstai -p2096 -aVvuestras -p2097 -aVmist -p2098 -atp2099 -Rp2100 -. 
\ No newline at end of file diff --git a/shorttext/utils/textpreprocessing.py b/shorttext/utils/textpreprocessing.py deleted file mode 100644 index 6699001a..00000000 --- a/shorttext/utils/textpreprocessing.py +++ /dev/null @@ -1,95 +0,0 @@ -import re -import pickle -import os - -import spacy -from stemming.porter import stem - -# load stop words -this_dir, _ = os.path.split(__file__) -stopwordset = pickle.load(open(os.path.join(this_dir, 'stopwordset.pkl'), 'r')) - -# initialize spacy -class SpaCyNLPHolder: - def __init__(self): - self.nlp = None - - def getNLPInstance(self): - if self.nlp==None: - self.nlp = spacy.load('en') - return self.nlp -# prepare the singleton -spaCyNLPHolder = SpaCyNLPHolder() - -def spacy_tokenize(text): - """ Tokenize a sentence with spaCy. - - This works like `nltk.tokenize` which tokenize a sentence, but this runs faster. - This returns the strings of tokens. - - :param text: sentence to tokenize - :return: list of tokens - :type text: str - :rtype: list - """ - nlp = spaCyNLPHolder.getNLPInstance() # lazy loading - tokenizer = nlp(unicode(text)) - return map(str, [token for token in tokenizer]) - -def preprocess_text(text, pipeline): - """ Preprocess the text according to the given pipeline. - - Given the pipeline, which is a list of functions that process an - input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), - preprocess the text. - - :param text: text to be preprocessed - :param pipeline: a list of functions that convert a text to another text - :return: preprocessed text - :type text: str - :type pipeline: list - :rtype: str - """ - if len(pipeline)==0: - return text - else: - return preprocess_text(pipeline[0](text), pipeline[1:]) - -def text_preprocessor(pipeline): - """ Return the function that preprocesses text according to the pipeline. - - Given the pipeline, which is a list of functions that process an - input text to another text (e.g., stemming, lemmatizing, removing punctuations etc.), - return a function that preprocesses an input text outlined by the pipeline, essentially - a function that runs :func:`~preprocess_text` with the specified pipeline. - - :param pipeline: a list of functions that convert a text to another text - :return: a function that preprocesses text according to the pipeline - :type pipeline: list - :rtype: function - """ - return lambda text: preprocess_text(text, pipeline) - -def standard_text_preprocessor_1(): - """ Return a commonly used text preprocessor. - - Return a text preprocessor that is commonly used, with the following steps: - - - removing special characters, - - removing numerals, - - converting all alphabets to lower cases, - - removing stop words, and - - stemming the words (using Porter stemmer). - - This function calls :func:`~text_preprocessor`. - - :return: a function that preprocesses text according to the pipeline - :rtype: function - """ - pipeline = [lambda s: re.sub('[^\w\s]', '', s), - lambda s: re.sub('[\d]', '', s), - lambda s: s.lower(), - lambda s: ' '.join(filter(lambda s: not (s in stopwordset), spacy_tokenize(s))), - lambda s: ' '.join(map(stem, spacy_tokenize(s))) - ] - return text_preprocessor(pipeline) \ No newline at end of file diff --git a/shorttext/utils/wordembed.py b/shorttext/utils/wordembed.py deleted file mode 100644 index 6b865814..00000000 --- a/shorttext/utils/wordembed.py +++ /dev/null @@ -1,14 +0,0 @@ -import gensim - -def load_word2vec_model(path, binary=True): - """ Load a pre-trained Word2Vec model. 
- - :param path: path of the file of the pre-trained Word2Vec model - :param binary: whether the file is in binary format (Default: True) - :return: a pre-trained Word2Vec model - :type path: str - :type binary: bool - :rtype: gensim.models.keyedvectors.KeyedVectors - """ - return gensim.models.KeyedVectors.load_word2vec_format(path, binary=binary) - diff --git a/shorttext_tests.py b/shorttext_tests.py deleted file mode 100644 index a8601ae8..00000000 --- a/shorttext_tests.py +++ /dev/null @@ -1,11 +0,0 @@ -import unittest - -class SampleTest(unittest.TestCase): - def setUp(self): - self.sample_var = True - - def testSampleTestCase(self): - self.assertEqual(True, self.sample_var) - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/src/shorttext/__init__.py b/src/shorttext/__init__.py new file mode 100644 index 00000000..331cc8b3 --- /dev/null +++ b/src/shorttext/__init__.py @@ -0,0 +1,8 @@ + +from . import metrics +from . import classifiers +from . import data +from . import generators +from . import spell +from . import stack +from . import utils diff --git a/shorttext/classifiers/__init__.py b/src/shorttext/classifiers/__init__.py similarity index 80% rename from shorttext/classifiers/__init__.py rename to src/shorttext/classifiers/__init__.py index 653f8a05..00ba75be 100644 --- a/shorttext/classifiers/__init__.py +++ b/src/shorttext/classifiers/__init__.py @@ -4,10 +4,6 @@ from .embed import frameworks from .embed.sumvec import frameworks as sumvecframeworks -from .bow.topic.LatentTopicModeling import GensimTopicModeler, LDAModeler, LSIModeler, RPModeler -from .bow.topic.LatentTopicModeling import AutoencodingTopicModeler, load_autoencoder_topic -from .bow.topic.LatentTopicModeling import load_gensimtopicmodel - from .bow.topic.TopicVectorDistanceClassification import TopicVecCosineDistanceClassifier as TopicVectorCosineDistanceClassifier from .bow.topic.TopicVectorDistanceClassification import train_autoencoder_cosineClassifier, train_gensimtopicvec_cosineClassifier from .bow.topic.TopicVectorDistanceClassification import load_autoencoder_cosineClassifier, load_gensimtopicvec_cosineClassifier diff --git a/src/shorttext/classifiers/base.py b/src/shorttext/classifiers/base.py new file mode 100644 index 00000000..f2374024 --- /dev/null +++ b/src/shorttext/classifiers/base.py @@ -0,0 +1,18 @@ + +from abc import ABC, abstractmethod + + +class AbstractScorer(ABC): + """Abstract base class for scoring classifiers.""" + + @abstractmethod + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores. + + Args: + shorttext: Input text to classify. + + Returns: + Dictionary mapping class labels to scores. 
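+
+        Example:
+            A minimal concrete subclass, sketched only to illustrate the
+            contract (the class name and scores are hypothetical):
+
+            >>> class UniformScorer(AbstractScorer):
+            ...     def score(self, shorttext: str) -> dict[str, float]:
+            ...         return {"pos": 0.5, "neg": 0.5}
+            >>> UniformScorer().score("any text")
+            {'pos': 0.5, 'neg': 0.5}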
+ """ + raise NotImplementedError() diff --git a/shorttext/classifiers/bow/__init__.py b/src/shorttext/classifiers/bow/__init__.py similarity index 100% rename from shorttext/classifiers/bow/__init__.py rename to src/shorttext/classifiers/bow/__init__.py diff --git a/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py b/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py new file mode 100644 index 00000000..5b644bae --- /dev/null +++ b/src/shorttext/classifiers/bow/maxent/MaxEntClassification.py @@ -0,0 +1,214 @@ + +from typing import Literal, Optional + +import sparse +import orjson +from tensorflow.keras import Model, Sequential +from tensorflow.keras.layers import Dense +from tensorflow.keras.regularizers import l2 + +from ....utils import kerasmodel_io as kerasio +from ....utils import tokenize +from ....utils import classification_exceptions as e +from ....utils.compactmodel_io import CompactIOMachine +from ....utils.dtm import convert_classdict_to_xy +from ...base import AbstractScorer + + +def logistic_framework( + nb_features: int, + nb_outputs: int, + l2reg: float = 0.01, + bias_l2reg: float = 0.01, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" +) -> Model: + """Create a maximum entropy classifier neural network. + + Args: + nb_features: Number of input features. + nb_outputs: Number of output classes. + l2reg: L2 regularization coefficient. Default: 0.01. + bias_l2reg: L2 regularization for bias. Default: 0.01. + optimizer: Optimizer. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: adam. + + Returns: + Keras Sequential model for maximum entropy classification. + """ + kmodel = Sequential() + kmodel.add(Dense(units=nb_outputs, + activation='softmax', + input_shape=(nb_features,), + kernel_regularizer=l2(l2reg), + bias_regularizer=l2(bias_l2reg)) + ) + kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer) + return kmodel + + +class MaxEntClassifier(AbstractScorer, CompactIOMachine): + """Maximum entropy classifier. + + A classifier that implements the principle of maximum entropy + for text categorization using bag-of-words features. + + Reference: + Adam L. Berger et al., "A Maximum Entropy Approach to Natural + Language Processing," Computational Linguistics 22(1): 39-72 (1996). + """ + + def __init__(self, preprocessor: Optional[callable] = None): + """Initialize the classifier. + + Args: + preprocessor: Text preprocessing function. Default: lowercase. + """ + CompactIOMachine.__init__( + self, + {'classifier': 'maxent'}, + 'maxent', + ['_classlabels.txt', '.json', '.weights.h5', '_labels2idx.json', '_tokens2idx.json'] + ) + + if preprocessor is None: + preprocessor = lambda s: s.lower() + + self.preprocess_func = preprocessor + self.trained = False + + def shorttext_to_vec(self, shorttext: str) -> sparse.SparseArray: + """Convert short text to sparse vector. + + Args: + shorttext: Input text. + + Returns: + Sparse vector representation. 
+ """ + tokens = tokenize(self.preprocess_func(shorttext)) + token_indices = [ + self.token2idx.get(token) + for token in tokens + if token in self.token2idx.keys() + ] + + vec = sparse.COO( + [[0]*len(token_indices), token_indices], + [1.0]*len(token_indices), + shape=(1, len(self.token2idx)) + ) + + return vec + + def train( + self, + classdict: dict[str, list[str]], + nb_epochs: int = 500, + l2reg: float = 0.01, + bias_l2reg: float = 0.01, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam" + ) -> None: + """Train the classifier. + + Args: + classdict: Training data. + nb_epochs: Number of training epochs. Default: 500. + l2reg: L2 regularization coefficient. Default: 0.01. + bias_l2reg: L2 regularization for bias. Default: 0.01. + optimizer: Optimizer. Default: adam. + """ + self.classlabels = sorted(classdict.keys()) + self.labels2idx = {label: idx for idx, label in enumerate(self.classlabels)} + + dtm_npdict_matrix, y = convert_classdict_to_xy( + classdict, self.labels2idx, preprocess_func=self.preprocess_func, tokenize_func=tokenize + ) + self.token2idx = { + token: idx + for idx, token in enumerate(dtm_npdict_matrix._lists_keystrings[1]) + } + + kmodel = logistic_framework( + dtm_npdict_matrix.dimension_sizes[1], + len(self.classlabels), + l2reg=l2reg, + bias_l2reg=bias_l2reg, + optimizer=optimizer + ) + kmodel.fit(dtm_npdict_matrix.to_numpy(), y.todense(), epochs=nb_epochs) + + self.model = kmodel + self.trained = True + + def savemodel(self, nameprefix: str) -> None: + """Save the trained model to files. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If not trained. + """ + if not self.trained: + raise e.ModelNotTrainedException() + + kerasio.save_model(nameprefix, self.model) + open(nameprefix+'_tokens2idx.json', 'wb').write(orjson.dumps(self.token2idx)) + open(nameprefix+'_classlabels.txt', 'w').write('\n'.join(self.classlabels)) + open(nameprefix+'_labels2idx.json', 'wb').write(orjson.dumps(self.labels2idx)) + + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model from files. + + Args: + nameprefix: Prefix for input files. + """ + self.model = kerasio.load_model(nameprefix) + self.token2idx = orjson.loads(open(nameprefix+"_tokens2idx.json", "rb").read()) + self.classlabels = [ + s.strip() + for s in open(nameprefix+'_classlabels.txt', 'r').readlines() + ] + self.labels2idx = orjson.loads(open(nameprefix+"_labels2idx.json", "rb").read()) + self.trained = True + + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all class labels. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + ModelNotTrainedException: If not trained. + """ + if not self.trained: + raise e.ModelNotTrainedException() + + vec = self.shorttext_to_vec(shorttext) + predictions = self.model.predict(vec.todense()) + + scoredict = { + classlabel: predictions[0][idx] + for idx, classlabel in enumerate(self.classlabels) + } + return scoredict + + +def load_maxent_classifier(name: str, compact: bool=True) -> MaxEntClassifier: + """Load a MaxEntClassifier from file. + + Args: + name: Model name (compact) or file prefix (non-compact). + compact: Whether to load compact model. Default: True. + + Returns: + MaxEntClassifier instance. 
+ """ + classifier = MaxEntClassifier() + if compact: + classifier.load_compact_model(name) + else: + classifier.loadmodel(name) + return classifier \ No newline at end of file diff --git a/shorttext/classifiers/bow/maxent/__init__.py b/src/shorttext/classifiers/bow/maxent/__init__.py similarity index 100% rename from shorttext/classifiers/bow/maxent/__init__.py rename to src/shorttext/classifiers/bow/maxent/__init__.py diff --git a/src/shorttext/classifiers/bow/topic/SkLearnClassification.py b/src/shorttext/classifiers/bow/topic/SkLearnClassification.py new file mode 100644 index 00000000..a4cd0a5d --- /dev/null +++ b/src/shorttext/classifiers/bow/topic/SkLearnClassification.py @@ -0,0 +1,390 @@ + +from typing import Optional, Literal + +import numpy as np +import numpy.typing as npt +import joblib +import sklearn + +from ....utils import textpreprocessing as textpreprocess +from ....generators import load_autoencoder_topicmodel, load_gensimtopicmodel +from ....generators import LDAModeler, LSIModeler, RPModeler, AutoencodingTopicModeler +from ....generators import LatentTopicModeler +from ....utils import classification_exceptions as e +from ....utils import compactmodel_io as cio +from ...base import AbstractScorer + + +class TopicVectorSkLearnClassifier(AbstractScorer): + """Classifier using topic vectors with scikit-learn. + + Wraps any scikit-learn supervised learning algorithm and uses + topic vectors from LatentTopicModeler as features. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). + + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 + """ + + def __init__( + self, + topicmodeler: LatentTopicModeler, + sklearn_classifier: sklearn.base.BaseEstimator + ): + """Initialize the classifier. + + Args: + topicmodeler: A topic modeler instance. + sklearn_classifier: A scikit-learn classifier instance. + """ + self.topicmodeler = topicmodeler + self.classifier = sklearn_classifier + self.trained = False + + def train(self, classdict: dict[str, list[str]], *args, **kwargs) -> None: + """Train the classifier. + + Args: + classdict: Training data with class labels as keys and texts as values. + *args: Arguments passed to scikit-learn classifier fit(). + **kwargs: Arguments passed to scikit-learn classifier fit(). + + Raises: + ModelNotTrainedException: If topic modeler is not trained. + """ + x = [] + y = [] + self.classlabels = sorted(classdict.keys()) # classlabels must be sorted like the topic modelers + for classidx, classlabel in enumerate(self.classlabels): + topicvecs = [ + self.topicmodeler.retrieve_topicvec(shorttext) + for shorttext in classdict[classlabel] + ] + x += topicvecs + y += [classidx]*len(topicvecs) + self.classifier.fit(x, y, *args, **kwargs) + self.trained = True + + def getvector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. + + Args: + shorttext: Input text. + + Returns: + Topic vector representation. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise e.ModelNotTrainedException() + return self.topicmodeler.retrieve_topicvec(shorttext) + + def classify(self, shorttext: str) -> str: + """Classify short text into a class label. + + Args: + shorttext: Input text to classify. + + Returns: + Predicted class label. 
+ + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise e.ModelNotTrainedException() + topicvec = self.getvector(shorttext) + return self.classlabels[self.classifier.predict([topicvec])[0]] + + def score(self, shorttext: str) -> dict[str, float]: + """Compute classification scores for all classes. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise e.ModelNotTrainedException() + + topicvec = self.getvector(shorttext) + scoredict = { + classlabel: self.classifier.score([topicvec], [classidx]) + for classidx, classlabel in enumerate(self.classlabels) + } + return scoredict + + def savemodel(self, nameprefix: str) -> None: + """Save model to files. + + Saves the topic model, scikit-learn classifier, and class labels. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise e.ModelNotTrainedException() + self.topicmodeler.savemodel(nameprefix) + joblib.dump(self.classifier, nameprefix+'.pkl') + labelfile = open(nameprefix+'_classlabels.txt', 'w') + labelfile.write('\n'.join(self.classlabels)) + labelfile.close() + + def loadmodel(self, nameprefix: str) -> None: + """Load model from files. + + Args: + nameprefix: Prefix for input files. + """ + self.topicmodeler.loadmodel(nameprefix) + self.classifier = joblib.load(nameprefix+'.pkl') + labelfile = open(nameprefix+'_classlabels.txt', 'r') + self.classlabels = [s.strip() for s in labelfile.readlines()] + labelfile.close() + + def save_compact_model(self, name: str) -> None: + """Save model as compact file. + + Args: + name: Name of the compact model file. + + Raises: + ModelNotTrainedException: If model not trained. + """ + topicmodel_info = self.topicmodeler.get_info() + cio.save_compact_model( + name, + self.savemodel, + 'topic_sklearn', + topicmodel_info['suffices']+['.pkl', '_classlabels.txt'], + { + 'classifier': 'topic_sklearn', + 'topicmodel': topicmodel_info['classifier'] + } + ) + + def load_compact_model(self, name: str) -> None: + """Load model from compact file. + + Args: + name: Name of the compact model file. + """ + cio.load_compact_model( + name, + self.loadmodel, + 'topic_sklearn', + {'classifier': 'topic_sklearn', 'topicmodel': None} + ) + self.trained = True + + +def train_gensim_topicvec_sklearnclassifier( + classdict: dict[str, list[str]], + nb_topics: int, + sklearn_classifier: sklearn.base.BaseEstimator, + preprocessor: Optional[callable] = None, + topicmodel_algorithm: Literal["lda", "lsi", "rp"] = 'lda', + toweigh: bool = True, + normalize: bool = True, + gensim_paramdict: Optional[dict] = None, + sklearn_paramdict: Optional[dict] = None +) -> TopicVectorSkLearnClassifier: + """Train a classifier with gensim topic vectors and scikit-learn. + + Trains a topic model (LDA, LSI, or RP), then uses the topic vectors + as features to train a scikit-learn classifier. + + Args: + classdict: Training data. + nb_topics: Number of topics. + sklearn_classifier: Scikit-learn classifier instance (not trained). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + topicmodel_algorithm: Topic model algorithm. Default: lda. + toweigh: Apply tf-idf weighting. Default: True. + normalize: Normalize topic vectors. Default: True. + gensim_paramdict: Arguments for gensim topic model. + sklearn_paramdict: Arguments for scikit-learn classifier. 
+ + Returns: + Trained TopicVectorSkLearnClassifier. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). + + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 + """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + if gensim_paramdict is None: + gensim_paramdict = {} + if sklearn_paramdict is None: + sklearn_paramdict = {} + + modelerdict = {'lda': LDAModeler, 'lsi': LSIModeler, 'rp': RPModeler} + topicmodeler = modelerdict[topicmodel_algorithm]( + preprocessor=preprocessor, + toweigh=toweigh, + normalize=normalize + ) + topicmodeler.train(classdict, nb_topics, **gensim_paramdict) + + classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier) + classifier.train(classdict, **sklearn_paramdict) + + return classifier + + +def load_gensim_topicvec_sklearnclassifier( + name: str, + preprocessor: Optional[callable] = None, + compact: bool = True +) -> TopicVectorSkLearnClassifier: + """Load a classifier with gensim topic vectors from files. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Load compact model. Default: True. + + Returns: + TopicVectorSkLearnClassifier instance. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). + + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 + """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + + if compact: + modelerdict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler} + topicmodel_name = cio.get_model_config_field(name, 'topicmodel') + classifier = TopicVectorSkLearnClassifier(modelerdict[topicmodel_name](preprocessor=preprocessor), None) + classifier.load_compact_model(name) + classifier.trained = True + return classifier + else: + topicmodeler = load_gensimtopicmodel(name, preprocessor=preprocessor) + sklearn_classifier = joblib.load(name + '.pkl') + classifier = TopicVectorSkLearnClassifier(topicmodeler, sklearn_classifier) + classifier.trained = True + return classifier + + +def train_autoencoder_topic_sklearnclassifier( + classdict: dict[str, list[str]], + nb_topics: int, + sklearn_classifier: sklearn.base.BaseEstimator, + preprocessor: Optional[callable] = None, + normalize: bool = True, + keras_paramdict: Optional[dict] = None, + sklearn_paramdict: Optional[dict] = None +) -> TopicVectorSkLearnClassifier: + """Train a classifier with autoencoder topic vectors and scikit-learn. + + Trains an autoencoder topic model, then uses the encoded vectors + as features to train a scikit-learn classifier. + + Args: + classdict: Training data. + nb_topics: Number of encoding dimensions. + sklearn_classifier: Scikit-learn classifier instance (not trained). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + normalize: Normalize topic vectors. Default: True. + keras_paramdict: Arguments for Keras autoencoder training. 
+ sklearn_paramdict: Arguments for scikit-learn classifier. + + Returns: + Trained TopicVectorSkLearnClassifier. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). + + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 + """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + if keras_paramdict is None: + keras_paramdict = {} + if sklearn_paramdict is None: + sklearn_paramdict = {} + + autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, normalize=normalize) + autoencoder.train(classdict, nb_topics, **keras_paramdict) + + classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier) + classifier.train(classdict, **sklearn_paramdict) + + return classifier + + +def load_autoencoder_topic_sklearnclassifier( + name: str, + preprocessor: Optional[callable] = None, + compact: bool = True +) -> TopicVectorSkLearnClassifier: + """Load a classifier with autoencoder topic vectors from files. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Load compact model. Default: True. + + Returns: + TopicVectorSkLearnClassifier instance. + + Reference: + Xuan Hieu Phan et al., "A Hidden Topic-Based Framework toward + Building Applications with Short Web Documents," + IEEE Trans. Knowl. Data Eng. 23(7): 961-976 (2011). + + Xuan Hieu Phan et al., "Learning to Classify Short and Sparse + Text & Web with Hidden Topics from Large-scale Data Collections," + WWW 2008. + http://dl.acm.org/citation.cfm?id=1367510 + """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + + if compact: + classifier = TopicVectorSkLearnClassifier(AutoencodingTopicModeler(preprocessor=preprocessor), None) + classifier.load_compact_model(name) + classifier.trained = True + return classifier + else: + autoencoder = load_autoencoder_topicmodel(name, preprocessor=preprocessor) + sklearn_classifier = joblib.load(name + '.pkl') + classifier = TopicVectorSkLearnClassifier(autoencoder, sklearn_classifier) + classifier.trained = True + return classifier diff --git a/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py b/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py new file mode 100644 index 00000000..939ce0ed --- /dev/null +++ b/src/shorttext/classifiers/bow/topic/TopicVectorDistanceClassification.py @@ -0,0 +1,181 @@ + +from typing import Optional, Literal + +from ....generators import LatentTopicModeler, GensimTopicModeler, AutoencodingTopicModeler +from ....generators import load_autoencoder_topicmodel, load_gensimtopicmodel +from ...base import AbstractScorer + + +class TopicVecCosineDistanceClassifier(AbstractScorer): + """Classifier using cosine similarity with topic vectors. + + Classifies short text based on cosine similarity between topic vectors + of the input and class centroids. Topic vectors are generated by a + LatentTopicModeler. + """ + + def __init__(self, topicmodeler: LatentTopicModeler): + """Initialize the classifier. + + Args: + topicmodeler: A LatentTopicModeler instance. 
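+
+        Example:
+            A sketch where ``modeler`` stands for an already trained
+            LatentTopicModeler:
+
+            >>> clf = TopicVecCosineDistanceClassifier(modeler)
+            >>> clf.score('quantum physics')  # label -> cosine similarity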
+ """ + self.topicmodeler = topicmodeler + + def score(self, shorttext: str) -> dict[str, float]: + """Calculate cosine similarity to all class topic vectors. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to similarity scores. + """ + return self.topicmodeler.get_batch_cos_similarities(shorttext) + + def loadmodel(self, nameprefix: str) -> None: + """Load the topic model. + + Args: + nameprefix: Prefix for input files. + """ + self.topicmodeler.loadmodel(nameprefix) + + def savemodel(self, nameprefix: str) -> None: + """Save the topic model. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If model not trained. + """ + self.topicmodeler.savemodel(nameprefix) + + def load_compact_model(self, name: str) -> None: + """Load compact model. + + Args: + name: Name of the compact model file. + """ + self.topicmodeler.load_compact_model(name) + + def save_compact_model(self, name: str) -> None: + """Save compact model. + + Args: + name: Name of the compact model file. + """ + self.topicmodeler.save_compact_model(name) + + +def train_gensimtopicvec_cosineClassifier( + classdict: dict[str, list[str]], + nb_topics: int, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + algorithm: Literal["lda", "lsi", "rp"] = "lda", + toweigh: bool = True, + normalize: bool = True, + *args, **kwargs +) -> TopicVecCosineDistanceClassifier: + """Train a gensim topic model and return a cosine classifier. + + Args: + classdict: Training data. + nb_topics: Number of latent topics. + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + algorithm: Topic modeling algorithm. Options: lda, lsi, rp. Default: lda. + toweigh: Whether to apply tf-idf weighting. Default: True. + normalize: Whether to normalize topic vectors. Default: True. + *args: Additional arguments for gensim topic model. + **kwargs: Additional keyword arguments for gensim topic model. + + Returns: + TopicVecCosineDistanceClassifier instance. + """ + # train topic model + topicmodeler = GensimTopicModeler(preprocessor=preprocessor, + tokenizer=tokenizer, + algorithm=algorithm, + toweigh=toweigh, + normalize=normalize) + topicmodeler.train(classdict, nb_topics, *args, **kwargs) + + # cosine distance classifier + return TopicVecCosineDistanceClassifier(topicmodeler) + + +def load_gensimtopicvec_cosineClassifier( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool=True +) -> TopicVecCosineDistanceClassifier: + """Load a gensim topic model and return a cosine classifier. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Whether to load compact model. Default: True. + + Returns: + TopicVecCosineDistanceClassifier instance. + """ + topicmodeler = load_gensimtopicmodel( + name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact + ) + return TopicVecCosineDistanceClassifier(topicmodeler) + + +def train_autoencoder_cosineClassifier( + classdict: dict[str, list[str]], + nb_topics: int, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + normalize: bool = True, + *args, **kwargs +) -> TopicVecCosineDistanceClassifier: + """Train an autoencoder topic model and return a cosine classifier. + + Args: + classdict: Training data. + nb_topics: Number of topics (encoding dimensions). 
+ preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + normalize: Whether to normalize topic vectors. Default: True. + *args: Additional arguments for Keras model fitting. + **kwargs: Additional keyword arguments for Keras model fitting. + + Returns: + TopicVecCosineDistanceClassifier instance. + """ + # train the autoencoder + autoencoder = AutoencodingTopicModeler( + preprocessor=preprocessor, tokenizer=tokenizer, normalize=normalize + ) + autoencoder.train(classdict, nb_topics, *args, **kwargs) + + # cosine distance classifier + return TopicVecCosineDistanceClassifier(autoencoder) + + +def load_autoencoder_cosineClassifier( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool = True +) -> TopicVecCosineDistanceClassifier: + """Load an autoencoder topic model and return a cosine classifier. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + compact: Whether to load compact model. Default: True. + + Returns: + TopicVecCosineDistanceClassifier instance. + """ + autoencoder = load_autoencoder_topicmodel( + name, preprocessor=preprocessor, tokenizer=tokenizer, compact=compact + ) + return TopicVecCosineDistanceClassifier(autoencoder) diff --git a/shorttext/classifiers/bow/topic/__init__.py b/src/shorttext/classifiers/bow/topic/__init__.py similarity index 70% rename from shorttext/classifiers/bow/topic/__init__.py rename to src/shorttext/classifiers/bow/topic/__init__.py index af52dc94..6467258f 100644 --- a/shorttext/classifiers/bow/topic/__init__.py +++ b/src/shorttext/classifiers/bow/topic/__init__.py @@ -1,3 +1,3 @@ -from . import LatentTopicModeling + from . import TopicVectorDistanceClassification from . import SkLearnClassification \ No newline at end of file diff --git a/shorttext/classifiers/embed/__init__.py b/src/shorttext/classifiers/embed/__init__.py similarity index 100% rename from shorttext/classifiers/embed/__init__.py rename to src/shorttext/classifiers/embed/__init__.py diff --git a/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py b/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py new file mode 100644 index 00000000..4aff4a42 --- /dev/null +++ b/src/shorttext/classifiers/embed/nnlib/VarNNEmbedVecClassification.py @@ -0,0 +1,254 @@ + +import os +import warnings +from typing import Any, Optional, Annotated + +import numpy as np +import numpy.typing as npt +from gensim.models.keyedvectors import KeyedVectors +from tensorflow.keras.models import Model +import orjson + +from ....utils import kerasmodel_io as kerasio +from ....utils.classification_exceptions import ModelNotTrainedException +from ....utils import tokenize +from ....utils.compactmodel_io import CompactIOMachine +from ...base import AbstractScorer + + +class VarNNEmbeddedVecClassifier(AbstractScorer, CompactIOMachine): + """Neural network classifier for short text categorization. + + Wraps Keras neural network models for supervised short text classification. + Each token is converted to an embedded vector using a pre-trained word-embedding + model (e.g., Word2Vec). Sentences are represented as matrices (rank-2 or rank-3 arrays) + and processed by the neural network. + + The neural network model must be a Keras Sequential model with output dimension + matching the number of class labels. 
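+
+    Example:
+        Typical workflow, sketched under the assumption that ``wvmodel`` is a
+        loaded KeyedVectors model, ``kmodel`` a compiled Keras model (e.g. one
+        built by the frameworks module), and ``classdict`` maps class labels
+        to lists of short texts:
+
+        >>> clf = VarNNEmbeddedVecClassifier(wvmodel, maxlen=15)
+        >>> clf.train(classdict, kmodel, nb_epoch=10)
+        >>> clf.score('artificial intelligence')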
+ + Reference: + Pre-trained Word2Vec: https://code.google.com/archive/p/word2vec/ + Example models available in the frameworks module. + """ + + def __init__( + self, + wvmodel: KeyedVectors, + vecsize: Optional[int] = None, + maxlen: int = 15, + with_gensim: bool = False + ): + """Initialize the classifier. + + Args: + wvmodel: Word embedding model (e.g., Word2Vec). + vecsize: Vector size. Default: None (extracted from model). + maxlen: Maximum number of words per sentence. Default: 15. + with_gensim: Whether to use gensim format. Default: False. + """ + CompactIOMachine.__init__( + self, + {'classifier': 'nnlibvec'}, + 'nnlibvec', + ['_classlabels.txt', '.json', '.weights.h5', '_config.json'] + ) + self.wvmodel = wvmodel + self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize + self.maxlen = maxlen + self.with_gensim = False if not with_gensim else with_gensim + self.trained = False + + def convert_trainingdata_matrix( + self, + classdict: dict[str, list[str]] + ) -> tuple[list[str], Annotated[npt.NDArray[np.float64], "3D Array"], Annotated[npt.NDArray[np.int64], "2D Array"]]: + """Convert training data to neural network input format. + + Args: + classdict: Training data with class labels as keys and texts as values. + + Returns: + Tuple of (class_labels, embedded_vectors, labels_array). + """ + classlabels = sorted(classdict.keys()) # sort the class labels to ensure uniqueness + lblidx_dict = dict(zip(classlabels, range(len(classlabels)))) + + phrases = [] + indices = [] + for label in classlabels: + for shorttext in classdict[label]: + shorttext = shorttext if isinstance(shorttext, str) else '' + category_bucket = [0]*len(classlabels) + category_bucket[lblidx_dict[label]] = 1 + indices.append(category_bucket) + phrases.append(tokenize(shorttext)) + + train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize)) + for i in range(len(phrases)): + for j in range(min(self.maxlen, len(phrases[i]))): + train_embedvec[i, j, :] = self.word_to_embedvec(phrases[i][j]) + indices = np.array(indices, dtype=np.int_) + + return classlabels, train_embedvec, indices + + def train( + self, + classdict: dict[str, list[str]], + kerasmodel: Model, + nb_epoch: int = 10 + ): + """Train the classifier. + + Args: + classdict: Training data. + kerasmodel: Keras Sequential model. + nb_epoch: Number of training epochs. Default: 10. + + Raises: + ModelNotTrainedException: If model not loaded. + """ + self.classlabels, train_embedvec, indices = self.convert_trainingdata_matrix(classdict) + kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch) + self.model = kerasmodel + self.trained = True + + def savemodel(self, nameprefix: str) -> None: + """Save the trained model to files. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + kerasio.save_model(nameprefix, self.model) + open(nameprefix+'_classlabels.txt', 'w').write('\n'.join(self.classlabels)) + open(nameprefix + '_config.json', 'wb').write( + orjson.dumps( + {'with_gensim': False, 'maxlen': self.maxlen, 'vecsize': self.vecsize} + ) + ) + + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model from files. + + Args: + nameprefix: Prefix for input files. 
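+
+        Example:
+            With a hypothetical prefix, files matching the suffixes registered
+            in ``__init__`` (``.json``, ``.weights.h5``, ``_classlabels.txt``
+            and, for newer models, ``_config.json``) are expected on disk:
+
+            >>> classifier.loadmodel('nnlibvec_model')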
+ """ + self.model = kerasio.load_model(nameprefix) + self.classlabels = [line.strip() for line in open(nameprefix+'_classlabels.txt', 'r')] + + if os.path.exists(nameprefix+'_config.json'): + config = orjson.loads(open(nameprefix+'_config.json', 'rb').read()) + if 'maxlen' in config: + self.maxlen = config['maxlen'] + else: + self.maxlen = 15 + if 'vecsize' in config: + self.vecsize = config['vecsize'] + else: + self.vecsize = self.wvmodel.vector_size + if self.vecsize != self.wvmodel.vector_size: + warnings.warn( + f'Record vector size ({self.vecsize}) is not the same as that of the given word-embedding model ({self.wvmodel.vector_size})! ' + \ + f'Setting the vector size to be {self.wvmodel.vector_size}, but there may be run time error!' + ) + self.vecsize = self.wvmodel.vector_size + else: + self.maxlen = 15 + self.vecsize = self.wvmodel.vector_size + warnings.warn('Model files from old versions.') + + self.with_gensim = False + self.trained = True + + def word_to_embedvec(self, word: str) -> npt.NDArray[np.float64]: + """Convert a word to its embedding vector. + + Args: + word: Input word. + + Returns: + Embedding vector. Returns zeros if word not in vocabulary. + """ + return self.wvmodel[word].astype(np.float64) if word in self.wvmodel else np.zeros(self.vecsize) + + def shorttext_to_matrix( + self, + shorttext: str + ) -> Annotated[npt.NDArray[np.float64], "2D Array"]: + """Convert short text to embedding matrix. + + Args: + shorttext: Input text. + + Returns: + Matrix of shape (maxlen, vecsize) with embedding vectors. + """ + tokens = tokenize(shorttext) + matrix = np.zeros((self.maxlen, self.vecsize)) + for i in range(min(self.maxlen, len(tokens))): + matrix[i] = self.word_to_embedvec(tokens[i]) + return matrix + + def score( + self, + shorttext: str, + model_params: Optional[dict[str, Any]] = None + ) -> dict[str, float]: + """Calculate classification scores for all class labels. + + Args: + shorttext: Input text. + model_params: Additional parameters for model prediction. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + ModelNotTrainedException: If not trained. + """ + if model_params is None: + model_params = {} + + if not self.trained: + raise ModelNotTrainedException() + + matrix = np.array([self.shorttext_to_matrix(shorttext)]) + predictions = self.model.predict(matrix, **model_params) + + score_dict = { + classlabel: predictions[0, j] + for j, classlabel in enumerate(self.classlabels) + } + + return score_dict + + +def load_varnnlibvec_classifier( + wvmodel: KeyedVectors, + name: str, + compact: bool = True, + vecsize: Optional[int] = None +) -> VarNNEmbeddedVecClassifier: + """Load a VarNNEmbeddedVecClassifier from file. + + Args: + wvmodel: Word embedding model. + name: Model name (compact) or file prefix (non-compact). + compact: Whether to load compact model. Default: True. + vecsize: Vector size. Default: None. + + Returns: + VarNNEmbeddedVecClassifier instance. 
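+
+    Example (illustrative; the compact model file name is hypothetical):
+
+        >>> classifier = load_varnnlibvec_classifier(wvmodel, 'mymodel.bin')
+        >>> classifier.score('machine learning')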
+    """
+    classifier = VarNNEmbeddedVecClassifier(wvmodel, vecsize=vecsize)
+    if compact:
+        classifier.load_compact_model(name)
+    else:
+        classifier.loadmodel(name)
+    return classifier
diff --git a/shorttext/classifiers/embed/nnlib/__init__.py b/src/shorttext/classifiers/embed/nnlib/__init__.py
similarity index 100%
rename from shorttext/classifiers/embed/nnlib/__init__.py
rename to src/shorttext/classifiers/embed/nnlib/__init__.py
diff --git a/src/shorttext/classifiers/embed/nnlib/frameworks.py b/src/shorttext/classifiers/embed/nnlib/frameworks.py
new file mode 100644
index 00000000..929ae7d7
--- /dev/null
+++ b/src/shorttext/classifiers/embed/nnlib/frameworks.py
@@ -0,0 +1,195 @@
+
+from typing import Optional, Literal
+
+from gensim.models.keyedvectors import KeyedVectors
+from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Activation
+from tensorflow.keras.models import Sequential, Model
+from tensorflow.keras.regularizers import l2
+
+
+# Code updated for the Keras 2 API.
+# Keras 1 --> Keras 2: https://github.com/fchollet/keras/wiki/Keras-2.0-release-notes
+
+
+# Paper: Yoon Kim, "Convolutional Neural Networks for Sentence Classification," arXiv:1408.5882 (2014).
+# ref: https://gist.github.com/entron/b9bc61a74e7cadeb1fec
+# ref: http://cs231n.github.io/convolutional-networks/
+def CNNWordEmbed(
+    nb_labels: int,
+    wvmodel: Optional[KeyedVectors] = None,
+    nb_filters: int = 1200,
+    n_gram: int = 2,
+    maxlen: int = 15,
+    vecsize: int = 300,
+    cnn_dropout: float = 0.0,
+    final_activation: Literal["softmax", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax",
+    dense_wl2reg: float = 0.0,
+    dense_bl2reg: float = 0.0,
+    optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam"
+) -> Model:
+    """Create a CNN for word embeddings.
+
+    Args:
+        nb_labels: Number of class labels.
+        wvmodel: Word embedding model. If provided, vecsize is extracted from it.
+        nb_filters: Number of filters. Default: 1200.
+        n_gram: N-gram (window size). Default: 2.
+        maxlen: Maximum sentence length. Default: 15.
+        vecsize: Embedding vector size. Default: 300.
+        cnn_dropout: CNN dropout rate. Default: 0.0.
+        final_activation: Final layer activation. Default: softmax.
+        dense_wl2reg: L2 regularization for weights. Default: 0.0.
+        dense_bl2reg: L2 regularization for bias. Default: 0.0.
+        optimizer: Optimizer. Default: adam.
+
+    Returns:
+        Keras Sequential model.
+
+    Reference:
+        Yoon Kim, "Convolutional Neural Networks for Sentence Classification,"
+        EMNLP 2014 (arXiv:1408.5882). 
+        https://arxiv.org/abs/1408.5882
+    """
+    if wvmodel is not None:
+        vecsize = wvmodel.vector_size
+
+    model = Sequential()
+    model.add(Conv1D(filters=nb_filters,
+                     kernel_size=n_gram,
+                     padding='valid',
+                     activation='relu',
+                     input_shape=(maxlen, vecsize)))
+    if cnn_dropout > 0.0:
+        model.add(Dropout(cnn_dropout))
+    model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1))
+    model.add(Flatten())
+    model.add(Dense(nb_labels, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg)))
+    model.add(Activation(final_activation))
+    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+
+    return model
+
+
+def DoubleCNNWordEmbed(
+    nb_labels: int,
+    wvmodel: Optional[KeyedVectors] = None,
+    nb_filters_1: int = 1200,
+    nb_filters_2: int = 600,
+    n_gram: int = 2,
+    filter_length_2: int = 10,
+    maxlen: int = 15,
+    vecsize: int = 300,
+    cnn_dropout_1: float = 0.0,
+    cnn_dropout_2: float = 0.0,
+    final_activation: Literal["softmax", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax",
+    dense_wl2reg: float = 0.0,
+    dense_bl2reg: float = 0.0,
+    optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam"
+) -> Model:
+    """Create a double-layer CNN for word embeddings.
+
+    Args:
+        nb_labels: Number of class labels.
+        wvmodel: Word embedding model. If provided, vecsize is extracted from it.
+        nb_filters_1: Filters for first layer. Default: 1200.
+        nb_filters_2: Filters for second layer. Default: 600.
+        n_gram: N-gram for first layer. Default: 2.
+        filter_length_2: Window size for second layer. Default: 10.
+        maxlen: Maximum sentence length. Default: 15.
+        vecsize: Embedding vector size. Default: 300.
+        cnn_dropout_1: Dropout for first layer. Default: 0.0.
+        cnn_dropout_2: Dropout for second layer. Default: 0.0.
+        final_activation: Final layer activation. Default: softmax.
+        dense_wl2reg: L2 regularization for weights. Default: 0.0.
+        dense_bl2reg: L2 regularization for bias. Default: 0.0.
+        optimizer: Optimizer. Default: adam.
+
+    Returns:
+        Keras Sequential model.
+    """
+    if wvmodel is not None:
+        vecsize = wvmodel.vector_size
+
+    model = Sequential()
+    model.add(Conv1D(filters=nb_filters_1,
+                     kernel_size=n_gram,
+                     padding='valid',
+                     activation='relu',
+                     input_shape=(maxlen, vecsize)))
+    if cnn_dropout_1 > 0.0:
+        model.add(Dropout(cnn_dropout_1))
+    model.add(Conv1D(filters=nb_filters_2,
+                     kernel_size=filter_length_2,
+                     padding='valid',
+                     activation='relu'))
+    if cnn_dropout_2 > 0.0:
+        model.add(Dropout(cnn_dropout_2))
+    model.add(MaxPooling1D(pool_size=maxlen - n_gram - filter_length_2 + 1))
+    model.add(Flatten())
+    model.add(Dense(nb_labels, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg)))
+    model.add(Activation(final_activation))
+    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
+
+    return model
+
+
+def CLSTMWordEmbed(
+    nb_labels: int,
+    wvmodel: Optional[KeyedVectors] = None,
+    nb_filters: int = 1200,
+    n_gram: int = 2,
+    maxlen: int = 15,
+    vecsize: int = 300,
+    cnn_dropout: float = 0.0,
+    nb_rnnoutdim: int = 1200,
+    rnn_dropout: float = 0.2,
+    final_activation: Literal["softmax", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax",
+    dense_wl2reg: float = 0.0,
+    dense_bl2reg: float = 0.0,
+    optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam"
+) -> Model:
+    """Create a C-LSTM model for word embeddings.
+
+    Args:
+        nb_labels: Number of class labels.
+        wvmodel: Word embedding model. 
If provided, vecsize is extracted from it. + nb_filters: Number of CNN filters. Default: 1200. + n_gram: N-gram (window size). Default: 2. + maxlen: Maximum sentence length. Default: 15. + vecsize: Embedding vector size. Default: 300. + cnn_dropout: CNN dropout rate. Default: 0.0. + nb_rnnoutdim: LSTM output dimension. Default: 1200. + rnn_dropout: LSTM dropout rate. Default: 0.2. + final_activation: Final layer activation. Default: softmax. + dense_wl2reg: L2 regularization for weights. Default: 0.0. + dense_bl2reg: L2 regularization for bias. Default: 0.0. + optimizer: Optimizer. Default: adam. + + Returns: + Keras Sequential model. + + Reference: + Chunting Zhou et al., "A C-LSTM Neural Network for Text Classification," + arXiv:1511.08630 (2015). + https://arxiv.org/abs/1511.08630 + """ + if wvmodel is not None: + vecsize = wvmodel.vector_size + + model = Sequential() + model.add(Conv1D(filters=nb_filters, + kernel_size=n_gram, + padding='valid', + activation='relu', + input_shape=(maxlen, vecsize))) + if cnn_dropout > 0.0: + model.add(Dropout(cnn_dropout)) + model.add(MaxPooling1D(pool_size=maxlen - n_gram + 1)) + model.add(LSTM(nb_rnnoutdim)) + if rnn_dropout > 0.0: + model.add(Dropout(rnn_dropout)) + model.add(Dense(nb_labels, kernel_regularizer=l2(dense_wl2reg), bias_regularizer=l2(dense_bl2reg))) + model.add(Activation(final_activation)) + model.compile(loss='categorical_crossentropy', optimizer=optimizer) + + return model diff --git a/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py b/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py new file mode 100644 index 00000000..72b90cbd --- /dev/null +++ b/src/shorttext/classifiers/embed/sumvec/SumEmbedVecClassification.py @@ -0,0 +1,157 @@ + +import pickle +from collections import defaultdict +from typing import Optional, Annotated + +import numpy as np +import numpy.typing as npt +from gensim.models.keyedvectors import KeyedVectors +from loguru import logger + +from ....utils.classification_exceptions import ModelNotTrainedException +from ....utils import shorttext_to_avgvec +from ....utils.compactmodel_io import CompactIOMachine +from ....utils.compute import cosine_similarity + + +class SumEmbeddedVecClassifier(CompactIOMachine): + """Classifier using summed word embeddings. + + Each class is represented as the sum of word embeddings for its + training sentences, normalized to a unit vector. Prediction uses + cosine similarity between the input vector and class centroids. + + Reference: + Pre-trained Word2Vec: https://code.google.com/archive/p/word2vec/ + """ + + def __init__( + self, + wvmodel: KeyedVectors, + vecsize: Optional[int] = None, + simfcn: Optional[callable] = None + ): + """Initialize the classifier. + + Args: + wvmodel: Word embedding model (e.g., Word2Vec). + vecsize: Vector size. Default: None (extracted from model). + simfcn: Similarity function. Default: cosine_similarity. + """ + CompactIOMachine.__init__( + self, + {'classifier': 'sumvec'}, + 'sumvec', + ['_embedvecdict.pkl'] + ) + self.wvmodel = wvmodel + self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize + self.simfcn = simfcn if simfcn is not None else cosine_similarity + self.trained = False + + def train(self, classdict: dict[str, list[str]]) -> None: + """Train the classifier. + + Args: + classdict: Training data with class labels as keys and texts as values. + + Raises: + ModelNotTrainedException: If not trained or loaded. 
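+
+        Example (illustrative; uses the bundled example data and an assumed
+        loaded word-embedding model ``wvmodel``):
+
+            >>> from shorttext.data import subjectkeywords
+            >>> classifier = SumEmbeddedVecClassifier(wvmodel)
+            >>> classifier.train(subjectkeywords())
+            >>> classifier.score('linear algebra')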
+ """ + self.addvec = defaultdict(lambda : np.zeros(self.vecsize)) + for classtype in classdict: + self.addvec[classtype] = np.sum( + [ + self.shorttext_to_embedvec(shorttext) + for shorttext in classdict[classtype] + ], + axis=0 + ) + self.addvec[classtype] /= np.linalg.norm(self.addvec[classtype]) + self.addvec = dict(self.addvec) + self.trained = True + + def savemodel(self, nameprefix: str) -> None: + """Save the trained model. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + pickle.dump(self.addvec, open(nameprefix+'_embedvecdict.pkl', 'wb')) + + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model. + + Args: + nameprefix: Prefix for input files. + """ + self.addvec = pickle.load(open(nameprefix+'_embedvecdict.pkl', 'rb')) + self.trained = True + + def shorttext_to_embedvec( + self, + shorttext: str + ) -> Annotated[npt.NDArray[np.float64], "1D Array"]: + """Convert short text to embedding vector. + + Args: + shorttext: Input text. + + Returns: + Normalized embedding vector. + """ + return shorttext_to_avgvec(shorttext, self.wvmodel) + + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all class labels. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + ModelNotTrainedException: If not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + vec = self.shorttext_to_embedvec(shorttext) + scoredict = {} + for classtype, addvec in self.addvec.items(): + try: + scoredict[classtype] = self.simfcn(vec, addvec) + except ValueError: + scoredict[classtype] = np.nan + return scoredict + + +def load_sumword2vec_classifier( + wvmodel: KeyedVectors, + name: str, + compact: bool = True, + vecsize: Optional[int] = None +) -> SumEmbeddedVecClassifier: + """Load a SumEmbeddedVecClassifier from file. + + Args: + wvmodel: Word embedding model. + name: Model name (compact) or prefix (non-compact). + compact: Whether to load compact model. Default: True. + vecsize: Vector size. Default: None. + + Returns: + SumEmbeddedVecClassifier instance. + """ + classifier = SumEmbeddedVecClassifier(wvmodel, vecsize=vecsize) + if compact: + classifier.load_compact_model(name) + else: + classifier.loadmodel(name) + return classifier diff --git a/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py b/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py new file mode 100644 index 00000000..a89a53d1 --- /dev/null +++ b/src/shorttext/classifiers/embed/sumvec/VarNNSumEmbedVecClassification.py @@ -0,0 +1,222 @@ + +from typing import Optional, Annotated + +import numpy as np +import numpy.typing as npt +from gensim.models.keyedvectors import KeyedVectors +from tensorflow.keras.models import Model + +from ....utils import kerasmodel_io as kerasio +from ....utils.classification_exceptions import ModelNotTrainedException +from ....utils.textpreprocessing import tokenize +from ....utils.compactmodel_io import CompactIOMachine +from ...base import AbstractScorer + + +class VarNNSumEmbeddedVecClassifier(AbstractScorer, CompactIOMachine): + """Neural network classifier using summed embeddings. + + Wraps Keras neural network models for supervised short text classification. + Each token is converted to an embedded vector using a pre-trained word-embedding + model. 
The sentence embedding is the sum of token embeddings, normalized to + a unit vector. + + The neural network model must be a Keras Sequential model with output dimension + matching the number of class labels. + + Reference: + Pre-trained Word2Vec: https://code.google.com/archive/p/word2vec/ + Example models available in the frameworks module. + """ + + def __init__( + self, + wvmodel: KeyedVectors, + vecsize: Optional[int] = None, + maxlen: int = 15 + ): + """Initialize the classifier. + + Args: + wvmodel: Word embedding model (e.g., Word2Vec). + vecsize: Vector size. Default: None (extracted from model). + maxlen: Maximum number of words per sentence. Default: 15. + """ + CompactIOMachine.__init__( + self, + {'classifier': 'sumnnlibvec'}, + 'sumnnlibvec', + ['_classlabels.txt', '.json', '.weights.h5'] + ) + self.wvmodel = wvmodel + self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize + self.maxlen = maxlen + self.trained = False + + def convert_traindata_embedvecs( + self, + classdict: dict[str, list[str]] + ) -> tuple[list[str], Annotated[npt.NDArray[np.float64], "2D Array"], Annotated[npt.NDArray[np.int64], "2D Array"]]: + """Convert training data to embedded vectors. + + Converts each short text into a normalized sum of word embeddings. + + Args: + classdict: Training data with class labels as keys and texts as values. + + Returns: + Tuple of (class_labels, embedding_matrix, labels_array). + """ + classlabels = sorted(classdict.keys()) + lblidx_dict = dict(zip(classlabels, range(len(classlabels)))) + + indices = [] + embedvecs = [] + for classlabel in classlabels: + for shorttext in classdict[classlabel]: + embedvec = np.sum( + np.array([ + self.word_to_embedvec(token) + for token in tokenize(shorttext) + ]), + axis=0 + ) + norm = np.linalg.norm(embedvec) + if norm == 0: + continue + embedvec /= norm + embedvecs.append(embedvec) + category_bucket = [0]*len(classlabels) + category_bucket[lblidx_dict[classlabel]] = 1 + indices.append(category_bucket) + + indices = np.array(indices) + embedvecs = np.array(embedvecs) + return classlabels, embedvecs, indices + + def train( + self, + classdict: dict[str, list[str]], + kerasmodel: Model, + nb_epoch: int = 10 + ) -> None: + """Train the classifier. + + Args: + classdict: Training data. + kerasmodel: Keras Sequential model. + nb_epoch: Number of training epochs. Default: 10. + + Raises: + ModelNotTrainedException: If not trained or loaded. + """ + self.classlabels, train_embedvec, indices = self.convert_traindata_embedvecs(classdict) + kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch) + self.model = kerasmodel + self.trained = True + + def savemodel(self, nameprefix: str) -> None: + """Save the trained model to files. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + kerasio.save_model(nameprefix, self.model) + open(nameprefix+'_classlabels.txt', 'w').write('\n'.join(self.classlabels)) + + def loadmodel(self, nameprefix: str) -> None: + """Load a trained model from files. + + Args: + nameprefix: Prefix for input files. + """ + self.model = kerasio.load_model(nameprefix) + self.classlabels = [s.strip() for s in open(nameprefix+'_classlabels.txt', 'r')] + self.trained = True + + def word_to_embedvec(self, word: str) -> Annotated[npt.NDArray[np.float64], "1D Array"]: + """Convert a word to its embedding vector. + + Args: + word: Input word. + + Returns: + Embedding vector. 
Returns zeros if word not in vocabulary.
+        """
+        return self.wvmodel[word].astype(np.float64) if word in self.wvmodel else np.zeros(self.vecsize)
+
+    def shorttext_to_embedvec(self, shorttext: str) -> Annotated[npt.NDArray[np.float64], "1D Array"]:
+        """Convert short text to embedding vector.
+
+        Sums token embeddings and normalizes to unit vector.
+
+        Args:
+            shorttext: Input text.
+
+        Returns:
+            Normalized embedding vector.
+        """
+        # sum the token vectors along axis 0 to obtain a single embedding vector
+        vec = np.sum([
+            self.wvmodel[token].astype(np.float64)
+            for token in tokenize(shorttext)
+            if token in self.wvmodel
+        ], axis=0)
+        norm = np.linalg.norm(vec)
+        if norm != 0:
+            vec /= norm
+        return vec
+
+    def score(self, shorttext: str) -> dict[str, float]:
+        """Calculate classification scores for all class labels.
+
+        Args:
+            shorttext: Input text.
+
+        Returns:
+            Dictionary mapping class labels to scores.
+
+        Raises:
+            ModelNotTrainedException: If not trained.
+        """
+        if not self.trained:
+            raise ModelNotTrainedException()
+
+        embedvec = np.array(self.shorttext_to_embedvec(shorttext))
+        predictions = self.model.predict(np.array([embedvec]))
+
+        scoredict = {
+            classlabel: predictions[0, idx]
+            for idx, classlabel in enumerate(self.classlabels)
+        }
+        return scoredict
+
+
+def load_varnnsumvec_classifier(
+    wvmodel: KeyedVectors,
+    name: str,
+    compact: bool = True,
+    vecsize: Optional[int] = None
+) -> VarNNSumEmbeddedVecClassifier:
+    """Load a VarNNSumEmbeddedVecClassifier from file.
+
+    Args:
+        wvmodel: Word embedding model.
+        name: Model name (compact) or file prefix (non-compact).
+        compact: Whether to load compact model. Default: True.
+        vecsize: Vector size. Default: None.
+
+    Returns:
+        VarNNSumEmbeddedVecClassifier instance.
+    """
+    classifier = VarNNSumEmbeddedVecClassifier(wvmodel, vecsize=vecsize)
+    if compact:
+        classifier.load_compact_model(name)
+    else:
+        classifier.loadmodel(name)
+    return classifier
diff --git a/shorttext/classifiers/embed/sumvec/__init__.py b/src/shorttext/classifiers/embed/sumvec/__init__.py
similarity index 100%
rename from shorttext/classifiers/embed/sumvec/__init__.py
rename to src/shorttext/classifiers/embed/sumvec/__init__.py
diff --git a/src/shorttext/classifiers/embed/sumvec/frameworks.py b/src/shorttext/classifiers/embed/sumvec/frameworks.py
new file mode 100644
index 00000000..1502058f
--- /dev/null
+++ b/src/shorttext/classifiers/embed/sumvec/frameworks.py
@@ -0,0 +1,62 @@
+
+from typing import Optional, Literal
+
+from tensorflow.keras.layers import Dense, Activation
+from tensorflow.keras.models import Sequential, Model
+from tensorflow.keras.regularizers import l2
+
+from ....utils.classification_exceptions import UnequalArrayLengthsException
+
+
+def DenseWordEmbed(
+    nb_labels: int,
+    dense_nb_nodes: Optional[list[int]] = None,
+    dense_actfcn: Optional[list[Literal["softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"]]] = None,
+    vecsize: int = 300,
+    reg_coef: float = 0.1,
+    final_activiation: Literal["softmax", "softplus", "softsign", "relu", "tanh", "sigmoid", "hard_sigmoid", "linear"] = "softmax",
+    optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam"
+) -> Model:
+    """Create a dense neural network for embedding-based classification.
+
+    Args:
+        nb_labels: Number of class labels.
+        dense_nb_nodes: Nodes per layer. Default: [].
+        dense_actfcn: Activation functions per layer. Default: [].
+        vecsize: Embedding vector size. Default: 300.
+        reg_coef: L2 regularization coefficient. Default: 0.1.
+        final_activiation: Final layer activation. 
Default: softmax. + optimizer: Optimizer. Default: adam. + + Returns: + Keras Sequential model. + + Raises: + UnequalArrayLengthsException: If dense_nb_nodes and dense_actfcn have different lengths. + """ + if dense_nb_nodes is None: + dense_nb_nodes = [] + if dense_actfcn is None: + dense_actfcn = [] + + if len(dense_nb_nodes)!=len(dense_actfcn): + raise UnequalArrayLengthsException(dense_nb_nodes, dense_actfcn) + nb_layers = len(dense_nb_nodes) + + model = Sequential() + if nb_layers==0: + model.add(Dense(nb_labels, input_shape=(vecsize,), kernel_regularizer=l2(reg_coef))) + else: + model.add(Dense(dense_nb_nodes[0], + input_shape=(vecsize,), + activation=dense_actfcn[0], + kernel_regularizer=l2(reg_coef)) + ) + for nb_nodes, activation in zip(dense_nb_nodes[1:], dense_actfcn[1:]): + model.add(Dense(nb_nodes, activation=activation, kernel_regularizer=l2(reg_coef))) + model.add(Dense(nb_labels, kernel_regularizer=l2(reg_coef))) + + model.add(Activation(final_activiation)) + model.compile(loss="categorical_crossentropy", optimizer=optimizer) + + return model diff --git a/src/shorttext/cli/__init__.py b/src/shorttext/cli/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/shorttext/cli/categorization.py b/src/shorttext/cli/categorization.py new file mode 100644 index 00000000..d2817e1e --- /dev/null +++ b/src/shorttext/cli/categorization.py @@ -0,0 +1,109 @@ + +import os +from functools import partial +from argparse import ArgumentParser +from operator import itemgetter + +from loguru import logger + +from ..utils.compactmodel_io import get_model_classifier_name +from ..utils.classification_exceptions import AlgorithmNotExistException, WordEmbeddingModelNotExistException +from ..utils import load_word2vec_model, load_fasttext_model, load_poincare_model +from ..smartload import smartload_compact_model +from ..classifiers import TopicVectorCosineDistanceClassifier + + +# configs +allowed_classifiers = [ + 'ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder', + 'topic_sklearn', 'nnlibvec', 'sumvec', 'maxent' +] +needembedded_classifiers = ['nnlibvec', 'sumvec'] +topicmodels = ['ldatopic', 'lsitopic', 'rptopic', 'kerasautoencoder'] + + +# lazy functions for loading word embedding model +load_word2vec_nonbinary_model = partial(load_word2vec_model, binary=False) +load_poincare_binary_model = partial(load_poincare_model, binary=True) + +typedict = { + 'word2vec': load_word2vec_model, + 'word2vec_nonbinary': load_word2vec_nonbinary_model, + 'fasttext': load_fasttext_model, + 'poincare': load_poincare_model, + 'poincare_binary': load_poincare_binary_model +} + + +def get_argparser() -> ArgumentParser: + """Get argument parser for short text categorization CLI. + + Returns: + ArgumentParser for command line arguments. + """ + parser = ArgumentParser( + description='Perform prediction on short text with a given trained model.' + ) + parser.add_argument('model_filepath', help='Path of the trained (compact) model.') + parser.add_argument('--wv', default='', help='Path of the pre-trained Word2Vec model.') + parser.add_argument('--vecsize', default=300, type=int, help='Vector dimensions. (Default: 300)') + parser.add_argument('--topn', type=int, default=10, help='Number of top results to show.') + parser.add_argument('--inputtext', default=None, help='Single input text for classification. 
If omitted, will enter console mode.')
+    parser.add_argument('--type', default='word2vec', choices=typedict.keys(),
+                        help='Type of word-embedding model (default: word2vec)')
+    return parser
+
+
+# main block
+def main():
+    # argument parsing
+    args = get_argparser().parse_args()
+
+    # check if the model file is given
+    if not os.path.exists(args.model_filepath):
+        raise IOError(f'Model file "{args.model_filepath}" not found!')
+
+    # get the name of the classifier
+    logger.info('Retrieving classifier name...')
+    classifier_name = get_model_classifier_name(args.model_filepath)
+
+    if classifier_name not in allowed_classifiers:
+        raise AlgorithmNotExistException(classifier_name)
+
+    # load the Word2Vec model if necessary
+    wvmodel = None
+    if classifier_name in needembedded_classifiers:
+        # check if the word embedding model is available
+        if not os.path.exists(args.wv):
+            raise WordEmbeddingModelNotExistException(args.wv)
+        # if there, load it
+        logger.info(f'Loading word-embedding model from {args.wv}...')
+        wvmodel = typedict[args.type](args.wv)
+
+    # load the classifier
+    logger.info('Initializing the classifier...')
+    if classifier_name in topicmodels:
+        topicmodel = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
+        classifier = TopicVectorCosineDistanceClassifier(topicmodel)
+    else:
+        classifier = smartload_compact_model(args.model_filepath, wvmodel, vecsize=args.vecsize)
+
+    # predict single input or run in console mode
+    if args.inputtext is not None:
+        if len(args.inputtext.strip()) == 0:
+            print('No input text provided.')
+            return
+        scoredict = classifier.score(args.inputtext)
+        for label, score in sorted(scoredict.items(), key=itemgetter(1), reverse=True)[:args.topn]:
+            print(f'{label} : {score:.4f}')
+    else:
+        # Console
+        print('Enter text to classify (empty input to quit):')
+        while True:
+            shorttext = input('text> ').strip()
+            if not shorttext:
+                break
+            scoredict = classifier.score(shorttext)
+            for label, score in sorted(scoredict.items(), key=itemgetter(1), reverse=True)[:args.topn]:
+                print(f'{label} : {score:.4f}')
+        print('Done.')
diff --git a/src/shorttext/cli/wordembedsim.py b/src/shorttext/cli/wordembedsim.py
new file mode 100644
index 00000000..f5f03039
--- /dev/null
+++ b/src/shorttext/cli/wordembedsim.py
@@ -0,0 +1,58 @@
+
+import argparse
+import time
+
+from scipy.spatial.distance import cosine
+
+from ..metrics.embedfuzzy import jaccardscore_sents
+from ..utils import tokenize, load_word2vec_model, load_fasttext_model, load_poincare_model
+from ..utils import shorttext_to_avgvec
+from ..metrics.wasserstein import word_mover_distance
+from ..metrics.dynprog.jaccard import soft_jaccard_score
+from ..utils.compute import cosine_similarity
+
+typedict = {
+    'word2vec': load_word2vec_model,
+    'fasttext': load_fasttext_model,
+    'poincare': load_poincare_model
+}
+
+
+def getargparser():
+    """Get argument parser for word embedding similarity CLI.
+
+    Returns:
+        ArgumentParser for command line arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description='Compute similarities between two short sentences using word embeddings.'
+    )
+    parser.add_argument('modelpath', help='Path of the Word2Vec model')
+    parser.add_argument('--type', default='word2vec',
+                        help='Type of word-embedding model (default: "word2vec"; other options: "fasttext", "poincare")')
+    return parser
+
+
+def main():
+    # argument parsing
+    args = getargparser().parse_args()
+
+    # preload tokenizer
+    tokenize('Mogu is cute.')
+
+    time0 = time.time()
+    print(f"Loading {args.type} model: {args.modelpath}")
+    wvmodel = typedict[args.type](args.modelpath)
+    time1 = time.time()
+    end = False
+    print(f"... 
loading time: {time1 - time0} seconds") + + while not end: + sent1 = input('sent1> ') + if len(sent1)==0: + end = True + else: + sent2 = input('sent2> ') + + # output results + print(f"Cosine Similarity = {cosine_similarity(shorttext_to_avgvec(sent1, wvmodel), shorttext_to_avgvec(sent2, wvmodel)):.4f}") + print(f"Word-embedding Jaccard Score Similarity = {jaccardscore_sents(sent1, sent2, wvmodel):.4f}") + print(f"Word Mover's Distance = {word_mover_distance(tokenize(sent1), tokenize(sent2), wvmodel):.4f}") + print(f"Soft Jaccard Score (edit distance) = {soft_jaccard_score(tokenize(sent1), tokenize(sent2)):.4f}") diff --git a/src/shorttext/data/__init__.py b/src/shorttext/data/__init__.py new file mode 100644 index 00000000..5a1efbff --- /dev/null +++ b/src/shorttext/data/__init__.py @@ -0,0 +1,2 @@ + +from .data_retrieval import subjectkeywords, nihreports, inaugural, retrieve_jsondata_as_dict, retrieve_csvdata_as_dict, yield_crossvalidation_classdicts diff --git a/src/shorttext/data/data_retrieval.py b/src/shorttext/data/data_retrieval.py new file mode 100644 index 00000000..efa1ecc3 --- /dev/null +++ b/src/shorttext/data/data_retrieval.py @@ -0,0 +1,255 @@ + +import random +from collections import defaultdict +import json +import os +from os import PathLike +from pathlib import Path +import zipfile +import sys +import csv +from urllib.request import urlretrieve +from io import TextIOWrapper +from typing import Generator + +import pandas as pd +import numpy as np +import orjson + + +def retrieve_csvdata_as_dict(filepath: str | PathLike) -> dict[str, list[str]]: + """Retrieve the training data in a CSV file. + + Reads a CSV file where the first column contains class labels and the second column + contains text data. Returns a dictionary mapping class labels to lists of + short texts. + + Args: + filepath: Path to the CSV training data file. + + Returns: + A dictionary with class labels as keys and lists of short texts as values. + + Reference: + Data format inspired by common text classification benchmarks. + """ + datafile = open(filepath, 'r') + reader = csv.reader(datafile) + headerread = False + shorttextdict = defaultdict(lambda: []) + for label, content in reader: + if headerread: + if isinstance(content, str): + shorttextdict[label] += [content] + else: + headerread = True + return dict(shorttextdict) + + +def retrieve_jsondata_as_dict(filepath: str | PathLike) -> dict: + """Retrieve the training data in a JSON file. + + Reads a JSON file where class labels are keys and lists of short texts + are values. Returns the corresponding dictionary. + + Args: + filepath: Path to the JSON training data file. + + Returns: + A dictionary with class labels as keys and lists of short texts as values. + """ + return orjson.loads(open(filepath, 'rb').read()) + + +def get_or_download_data( + filename: str, + origin: str, + asbytes: bool = False +) -> TextIOWrapper: + """Retrieve or download a data file. + + Checks if the file exists in the user's home directory under .shorttext. + If not present, downloads from the given origin URL. + + Args: + filename: Name of the file to retrieve. + origin: URL to download the file from if not present locally. + asbytes: If True, opens the file in binary mode. Default is False. + + Returns: + A file object (text or binary mode depending on asbytes). 
+ """ + # determine path + homedir = os.path.expanduser('~') + datadir = os.path.join(homedir, '.shorttext') + if not os.path.exists(datadir): + os.makedirs(datadir) + + targetfilepath = os.path.join(datadir, filename) + # download if not exist + if not os.path.exists(os.path.join(datadir, filename)): + print('Downloading...', file=sys.stderr) + print(f'Source: {origin}', file=sys.stderr) + print(f'Target: {targetfilepath}', file=sys.stderr) + try: + urlretrieve(origin, targetfilepath) + except: + print('Failure to download file!', file=sys.stderr) + print(sys.exc_info(), file=sys.stderr) + os.remove(targetfilepath) + + # return + return open(targetfilepath, 'rb' if asbytes else 'r') + + +def subjectkeywords() -> dict[str, list[str]]: + """Return an example dataset of subjects with keywords. + + Returns a small example dataset with three subjects and their + corresponding keywords, in the training input format. + + Returns: + A dictionary with subject labels as keys and lists of keywords as values. + """ + parentdir = Path(__file__).parent + return retrieve_csvdata_as_dict(parentdir / "shorttext_exampledata.csv") + + +def inaugural() -> dict[str, list[str]]: + """Return the Inaugural Addresses of US Presidents. + + Returns an example dataset containing the Inaugural Addresses of all + Presidents of the United States from George Washington to Barack Obama. + + Each key is formatted as "year-lastname" and the value is a list of + sentences from the address. + + Returns: + A dictionary with president identifiers as keys and lists of sentences as values. + + Reference: + https://www.presidency.us/kisa_exec/inaugural.html + """ + zfile = zipfile.ZipFile( + get_or_download_data( + "USInaugural.zip", + "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/USInaugural.zip", + asbytes=True + ) + ) + address_jsonstr = zfile.open("addresses.json").read() + zfile.close() + return json.loads(address_jsonstr.decode('utf-8')) + + +def nihreports(txt_col='PROJECT_TITLE', label_col='FUNDING_ICs', sample_size=512): + """Return an example dataset sampled from NIH RePORT. + + Returns an example dataset from NIH (National Institutes of Health) + RePORT (Research Portfolio Online Reporting Tools) website. + + Args: + txt_col: Column for text data. Options: 'PROJECT_TITLE' or 'ABSTRACT_TEXT'. + Default: 'PROJECT_TITLE'. + label_col: Column for labels. Options: 'FUNDING_ICs' or 'IC_NAME'. + Default: 'FUNDING_ICs'. + sample_size: Number of samples to return. Set to None for all rows. Default: 512. + + Returns: + A dictionary with IC identifiers as keys and lists of text data as values. + + Reference: + https://exporter.nih.gov/ExPORTER_Catalog.aspx + Dataset adapted from the R package textmineR: + https://cran.r-project.org/web/packages/textmineR/index.html + """ + # validation + # txt_col = 'PROJECT_TITLE' or 'ABSTRACT_TEXT' + # label_col = 'FUNDING_ICs' or 'IC_NAME' + if not (txt_col in ['PROJECT_TITLE', 'ABSTRACT_TEXT']): + raise KeyError(f'Undefined text column: {txt_col}. Must be PROJECT_TITLE or ABSTRACT_TEXT.') + if not (label_col in ['FUNDING_ICs', 'IC_NAME']): + raise KeyError(f'Undefined label column: {label_col}. 
Must be FUNDING_ICs or IC_NAME.') + + zfile = zipfile.ZipFile(get_or_download_data('nih_full.csv.zip', + 'https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/nih_full.csv.zip', + asbytes=True), + 'r', + zipfile.ZIP_DEFLATED) + nih = pd.read_csv(zfile.open('nih_full.csv'), na_filter=False, usecols=[label_col, txt_col], encoding='cp437') + zfile.close() + nb_data = len(nih) + sample_size = nb_data if sample_size==None else min(nb_data, sample_size) + + classdict = defaultdict(lambda : []) + + for rowidx in np.random.randint(nb_data, size=min(nb_data, sample_size)): + label = nih.iloc[rowidx, nih.columns.get_loc(label_col)] + if label_col=='FUNDING_ICs': + if label=='': + label = 'OTHER' + else: + endpos = label.index(':') + label = label[:endpos] + classdict[label] += [nih.iloc[rowidx, nih.columns.get_loc(txt_col)]] + + return dict(classdict) + + +def merge_cv_dicts(dicts: list[dict[str, list[str]]]) -> dict[str, list[str]]: + """Merge multiple training data dictionaries. + + Combines multiple data dictionaries in the training data format + into a single dictionary. + + Args: + dicts: List of dictionaries to merge, each with class labels + as keys and lists of texts as values. + + Returns: + A merged dictionary with all class labels and texts combined. + """ + # NOTE: this is not a usualy Python dict merge. It does specialized merging. + mdict = defaultdict(lambda : []) + for thisdict in dicts: + for label in thisdict: + mdict[label] += thisdict[label] + return dict(mdict) + + +def yield_crossvalidation_classdicts( + classdict: dict[str, list[str]], + nb_partitions: int, + shuffle: bool = False +) -> Generator[tuple[dict[str, list[str]], dict[str, list[str]]], None, None]: + """Yield training and test data partitions for cross-validation. + + Partitions the training data into multiple sets. Each iteration yields + a (test_dict, train_dict) pair where one partition is used as test + data and the remaining partitions are combined as training data. + + Args: + classdict: Training data dictionary with class labels as keys + and lists of texts as values. + nb_partitions: Number of partitions to create. + shuffle: Whether to shuffle data before partitioning. Default: False. + + Yields: + Tuples of (test_dict, train_dict) for each partition. 
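+
+    Example (an illustrative five-fold split on the bundled example data):
+
+        >>> from shorttext.data import subjectkeywords
+        >>> for testdict, traindict in yield_crossvalidation_classdicts(subjectkeywords(), 5):
+        ...     pass  # train on traindict, evaluate on testdict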
+    """
+    crossvaldicts = []
+    for _ in range(nb_partitions):
+        crossvaldicts.append(defaultdict(lambda: []))
+
+    for label in classdict:
+        nb_data = len(classdict[label])
+        # use a ceiling partition size so that every data point is assigned
+        partsize = int(np.ceil(nb_data / nb_partitions))
+        sentences = list(classdict[label])
+        if shuffle:
+            random.shuffle(sentences)
+        for i in range(nb_partitions):
+            crossvaldicts[i][label] += sentences[i * partsize:min(nb_data, (i + 1) * partsize)]
+    crossvaldicts = [dict(crossvaldict) for crossvaldict in crossvaldicts]
+
+    for i in range(nb_partitions):
+        testdict = crossvaldicts[i]
+        traindict = merge_cv_dicts([crossvaldicts[j] for j in range(nb_partitions) if j != i])
+        yield testdict, traindict
diff --git a/shorttext/data/shorttext_exampledata.csv b/src/shorttext/data/shorttext_exampledata.csv
similarity index 100%
rename from shorttext/data/shorttext_exampledata.csv
rename to src/shorttext/data/shorttext_exampledata.csv
diff --git a/src/shorttext/generators/__init__.py b/src/shorttext/generators/__init__.py
new file mode 100644
index 00000000..c1d9ae5b
--- /dev/null
+++ b/src/shorttext/generators/__init__.py
@@ -0,0 +1,9 @@
+from .bow.GensimTopicModeling import load_gensimtopicmodel
+from .bow.AutoEncodingTopicModeling import load_autoencoder_topicmodel
+
+from .bow.GensimTopicModeling import LatentTopicModeler, GensimTopicModeler, LDAModeler, LSIModeler, RPModeler
+from .bow.AutoEncodingTopicModeling import AutoencodingTopicModeler
+
+from .charbase.char2vec import SentenceToCharVecEncoder, initialize_SentenceToCharVecEncoder
+from .seq2seq.s2skeras import Seq2SeqWithKeras, load_seq2seq_model
+from .seq2seq.charbaseS2S import CharBasedSeq2SeqGenerator, loadCharBasedSeq2SeqGenerator
diff --git a/src/shorttext/generators/bow/AutoEncodingTopicModeling.py b/src/shorttext/generators/bow/AutoEncodingTopicModeling.py
new file mode 100644
index 00000000..3b833dbf
--- /dev/null
+++ b/src/shorttext/generators/bow/AutoEncodingTopicModeling.py
@@ -0,0 +1,309 @@
+
+import json
+import pickle
+from typing import Optional, Any
+from collections import Counter
+
+import numpy as np
+import numpy.typing as npt
+import sparse
+from tensorflow.keras import Input
+from tensorflow.keras import Model
+from tensorflow.keras.layers import Dense
+import orjson
+
+from .LatentTopicModeling import LatentTopicModeler
+from ...utils import kerasmodel_io as kerasio, textpreprocessing as textpreprocess
+from ...utils.compactmodel_io import CompactIOMachine
+from ...utils.classification_exceptions import ModelNotTrainedException
+from ...utils.dtm import generate_npdict_document_term_matrix, convert_classdict_to_corpus
+from ...utils.compute import cosine_similarity
+from ...schemas.models import AutoEncoderPackage
+
+
+autoencoder_suffices = ['_encoder.json', '_encoder.weights.h5', '_classtopicvecs.pkl',
+                        '_decoder.json', '_decoder.weights.h5', '_autoencoder.json', '_autoencoder.weights.h5',
+                        '.json']
+
+
+def get_autoencoder_models(
+    vector_size: int,
+    nb_latent_vector_size: int
+) -> AutoEncoderPackage:
+    """Create autoencoder model components.
+
+    Args:
+        vector_size: Size of input vectors.
+        nb_latent_vector_size: Size of the latent space (number of topics).
+
+    Returns:
+        AutoEncoderPackage containing autoencoder, encoder, and decoder models. 
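+
+    Example (illustrative; shows only the latent-dimension wiring):
+
+        >>> package = get_autoencoder_models(2000, 128)
+        >>> package.encoder.output_shape
+        (None, 128)
+        >>> package.decoder.output_shape
+        (None, 2000)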
+ """ + # define all the layers of the autoencoder + input_vec = Input(shape=(vector_size,)) + encoded = Dense(nb_latent_vector_size, activation='relu')(input_vec) + decoded = Dense(vector_size, activation='sigmoid')(encoded) + + # define the autoencoder model + autoencoder = Model(inputs=input_vec, outputs=decoded) + + # define the encoder + encoder = Model(inputs=input_vec, outputs=encoded) + + # define the decoder + encoded_input = Input(shape=(nb_latent_vector_size,)) + decoder_layer = autoencoder.layers[-1] + decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input)) + + # compile the autoencoder + autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy') + + return AutoEncoderPackage( + autoencoder=autoencoder, + encoder=encoder, + decoder=decoder + ) + + +class AutoencodingTopicModeler(LatentTopicModeler, CompactIOMachine): + """Topic modeler using autoencoder. + + Uses a Keras autoencoder to learn latent topic representations. + The encoded vectors serve as topic vectors for short text classification. + + Reference: + Francois Chollet, "Building Autoencoders in Keras," + https://blog.keras.io/building-autoencoders-in-keras.html + """ + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + normalize: bool = True + ): + CompactIOMachine.__init__(self, {'classifier': 'kerasautoencoder'}, 'kerasautoencoder', autoencoder_suffices) + LatentTopicModeler.__init__(self, preprocessor, tokenizer, normalize=normalize) + + def train(self, classdict: dict[str, list[str]], nb_topics: int, *args, **kwargs) -> None: + """Train the autoencoder topic model. + + Args: + classdict: Training data with class labels as keys and texts as values. + nb_topics: Number of latent topics (encoding dimensions). + *args: Arguments for Keras model fitting. + **kwargs: Keyword arguments for Keras model fitting. + """ + self.nb_topics = nb_topics + corpus, docids = convert_classdict_to_corpus(classdict, self.preprocess_func) + dtm_matrix = generate_npdict_document_term_matrix( + corpus, docids, tokenize_func=self.tokenize_func + ) + vecsize = dtm_matrix.dimension_sizes[1] + self.token2indices = dtm_matrix._keystrings_to_indices[1] + self.classlabels = sorted(classdict.keys()) + + autoencoder_package = get_autoencoder_models(vecsize, self.nb_topics) + autoencoder = autoencoder_package.autoencoder + encoder = autoencoder_package.encoder + decoder = autoencoder_package.decoder + + # process training data + embedvecs = dtm_matrix.to_numpy() + + # fit the model + autoencoder.fit(embedvecs, embedvecs, *args, **kwargs) + + # store the autoencoder models + self.autoencoder = autoencoder + self.encoder = encoder + self.decoder = decoder + + # flag setting + self.trained = True + + # classes topic vector precomputation + self.classtopicvecs = {} + for label in classdict: + self.classtopicvecs[label] = self.precalculate_liststr_topicvec(classdict[label]) + + def retrieve_bow(self, shorttext: str) -> list[tuple[int, int]]: + """Get bag-of-words representation. + + Args: + shorttext: Input text. + + Returns: + List of (token_index, count) tuples. + """ + tokens_freq = Counter(self.tokenize_func(self.preprocess_func(shorttext))) + return [ + (self.token2indices[token], freq) + for token, freq in tokens_freq.items() + if token in self.token2indices.keys() + ] + + def retrieve_bow_vector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get bag-of-words vector. + + Args: + shorttext: Input text. 
+ + Returns: + BOW vector (normalized if normalize=True). + """ + bow = self.retrieve_bow(shorttext) + if len(bow) > 0: + vec = sparse.COO( + [[0]*len(bow), [id for id, val in bow]], + [val for id, val in bow], + shape=(1, len(self.token2indices)) + ).todense()[0] + else: + vec = np.ones(len(self.token2indices)) + if self.normalize: + vec = vec.astype(np.float64) / np.linalg.norm(vec) + return vec + + def retrieve_topicvec(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. + + Args: + shorttext: Input text. + + Returns: + Encoded vector representation. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + bow_vector = self.retrieve_bow_vector(shorttext) + encoded_vec = self.encoder.predict(np.expand_dims(bow_vector, axis=0))[0] + if self.normalize: + encoded_vec /= np.linalg.norm(encoded_vec) + return encoded_vec.astype(np.float64) + + def precalculate_liststr_topicvec(self, shorttexts: list[str]) -> npt.NDArray[np.float64]: + """Calculate average topic vector for a list of texts. + + Used during training to compute class centroids. + + Args: + shorttexts: List of texts. + + Returns: + Average topic vector (normalized). + + Raises: + ModelNotTrainedException: If model not trained. + """ + sumvec = sum([self.retrieve_topicvec(shorttext) for shorttext in shorttexts]) + sumvec /= np.linalg.norm(sumvec) + return sumvec + + def get_batch_cos_similarities(self, shorttext: str) -> dict[str, float]: + """Get cosine similarities to all class centroids. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to similarity scores. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + simdict = {} + for label, classtopicvec in self.classtopicvecs.items(): + simdict[label] = cosine_similarity( + classtopicvec, self.retrieve_topicvec(shorttext) + ) + return simdict + + def savemodel(self, nameprefix: str, save_complete_autoencoder: bool=True) -> None: + """Save the autoencoder model to files. + + Saves encoder, optional decoder, and autoencoder weights along with + configuration parameters. + + Args: + nameprefix: Prefix for output files. + save_complete_autoencoder: Whether to save decoder and complete autoencoder. Default: True. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + parameters = {} + parameters['nb_topics'] = self.nb_topics + parameters['classlabels'] = self.classlabels + parameters['tokens2indices'] = self.token2indices + open(nameprefix + '.json', 'wb').write(orjson.dumps(parameters)) + kerasio.save_model(nameprefix+'_encoder', self.encoder) + if save_complete_autoencoder: + kerasio.save_model(nameprefix+'_decoder', self.decoder) + kerasio.save_model(nameprefix+'_autoencoder', self.autoencoder) + pickle.dump(self.classtopicvecs, open(nameprefix+'_classtopicvecs.pkl', 'wb')) + + def loadmodel(self, nameprefix: str, load_incomplete: bool=False) -> None: + """Load the autoencoder model from files. + + Args: + nameprefix: Prefix for input files. + load_incomplete: If True, only load encoder (for models from v0.2.1). Default: False. + + Raises: + ModelNotTrainedException: If loading fails. 
+ """ + # load the JSON file (parameters) + parameters = json.load(open(nameprefix+'.json', 'r')) + self.nb_topics = parameters['nb_topics'] + self.classlabels = parameters['classlabels'] + self.token2indices = parameters['tokens2indices'] + self.encoder = kerasio.load_model(nameprefix+'_encoder') + self.classtopicvecs = pickle.load(open(nameprefix+'_classtopicvecs.pkl', 'rb')) + if not load_incomplete: + self.decoder = kerasio.load_model(nameprefix+'_decoder') + self.autoencoder = kerasio.load_model(nameprefix+'_autoencoder') + self.trained = True + + def get_info(self) -> dict[str, Any]: + """Get model metadata. + + Returns: + Dictionary with model information. + """ + return CompactIOMachine.get_info(self) + + +def load_autoencoder_topicmodel( + name: str, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + compact: bool=True +) -> AutoencodingTopicModeler: + """Load an autoencoder topic model from files. + + Args: + name: Model name (compact) or file prefix (non-compact). + preprocessor: Text preprocessing function. + compact: Whether to load compact model. Default: True. + + Returns: + An AutoencodingTopicModeler instance. + """ + if preprocessor is None: + preprocessor = textpreprocess.standard_text_preprocessor_1() + + autoencoder = AutoencodingTopicModeler(preprocessor=preprocessor, tokenizer=tokenizer) + if compact: + autoencoder.load_compact_model(name) + else: + autoencoder.loadmodel(name) + return autoencoder diff --git a/src/shorttext/generators/bow/GensimTopicModeling.py b/src/shorttext/generators/bow/GensimTopicModeling.py new file mode 100644 index 00000000..ffd04c26 --- /dev/null +++ b/src/shorttext/generators/bow/GensimTopicModeling.py @@ -0,0 +1,385 @@ + +from typing import Optional, Literal, Any + +import gensim +import numpy as np +import numpy.typing as npt +from gensim.corpora import Dictionary +from gensim.models import TfidfModel, LdaModel, LsiModel, RpModel +from gensim.similarities import MatrixSimilarity +import orjson + +from ...utils.classification_exceptions import ModelNotTrainedException +from ...utils.compactmodel_io import CompactIOMachine, get_model_classifier_name +from ...utils import gensim_corpora as gc +from .LatentTopicModeling import LatentTopicModeler + + +gensim_topic_model_dict = {'lda': LdaModel, 'lsi': LsiModel, 'rp': RpModel} + + +class GensimTopicModeler(LatentTopicModeler): + """Topic modeler using gensim implementations. + + Supports LDA (Latent Dirichlet Allocation), LSI (Latent Semantic Indexing), + and Random Projections (RP) for topic modeling. + + Note: + For compact model I/O, use LDAModeler or LSIModeler instead. + """ + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + algorithm: Literal["lda", "lsi", "rp"] = "lda", + toweigh: bool = True, + normalize: bool = True + ): + """Initialize the topic modeler. + + Args: + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + algorithm: Topic modeling algorithm. Options: 'lda', 'lsi', 'rp'. Default: 'lda'. + toweigh: Whether to apply tf-idf weighting. Default: True. + normalize: Whether to normalize topic vectors. Default: True. + """ + LatentTopicModeler.__init__( + self, preprocessor=preprocessor, tokenizer=tokenizer, normalize=normalize + ) + self.algorithm = algorithm + self.toweigh = toweigh + + def generate_corpus(self, classdict: dict[str, list[str]]) -> None: + """Generate gensim dictionary and corpus. + + Args: + classdict: Training data. 
+ """ + self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora( + classdict, + preprocess_and_tokenize=lambda sent: self.tokenize_func(self.preprocess_func(sent)) + ) + + def train(self, classdict: dict[str, list[str]], nb_topics: int, *args, **kwargs) -> None: + """Train the topic modeler. + + Args: + classdict: Training data with class labels as keys and texts as values. + nb_topics: Number of latent topics. + *args: Arguments for the gensim topic model. + **kwargs: Keyword arguments for the gensim topic model. + """ + self.nb_topics = nb_topics + self.generate_corpus(classdict) + if self.toweigh: + self.tfidf = TfidfModel(self.corpus) + normcorpus = self.tfidf[self.corpus] + else: + self.tfidf = None + normcorpus = self.corpus + + self.topicmodel = gensim_topic_model_dict[self.algorithm]( + normcorpus, num_topics=self.nb_topics, *args, **kwargs + ) + self.matsim = MatrixSimilarity(self.topicmodel[normcorpus]) + + # change the flag + self.trained = True + + def update(self, additional_classdict: dict[str, list[str]]) -> None: + """Update model with additional data. + + Warning: Does not support adding new class labels or new vocabulary. + For comprehensive updates, retrain the model. + + Args: + additional_classdict: Additional training data. + """ + # cannot use this way, as we want to update the corpus with existing words + self.corpus, newcorpus = gc.update_corpus_labels( + self.dictionary, + self.corpus, + additional_classdict, + preprocess_and_tokenize=lambda sent: self.tokenize_func(self.preprocess_func(sent)) + ) + self.topicmodel.update(newcorpus) + + def retrieve_bow(self, shorttext: str) -> list[tuple[int, int]]: + """Get bag-of-words representation. + + Args: + shorttext: Input text. + + Returns: + List of (word_id, count) tuples. + """ + return self.dictionary.doc2bow(self.tokenize_func(self.preprocess_func(shorttext))) + + def retrieve_bow_vector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get bag-of-words vector. + + Args: + shorttext: Input text. + + Returns: + BOW vector. + """ + bow = self.retrieve_bow(shorttext) + if len(bow) > 0: + vec = np.zeros(len(self.dictionary)) + for id, val in bow: + vec[id] = val + else: + vec = np.ones(len(self.dictionary)) + if self.normalize: + vec /= np.linalg.norm(vec) + return vec + + def retrieve_corpus_topicdist(self, shorttext: str) -> list[tuple[int, int | float]]: + """Get topic distribution (corpus form). + + Args: + shorttext: Input text. + + Returns: + List of (topic_id, weight) tuples. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + bow = self.retrieve_bow(shorttext) + return self.topicmodel[self.tfidf[bow] if self.toweigh else bow] + + def retrieve_topicvec(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. + + Args: + shorttext: Input text. + + Returns: + Topic vector. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + topicdist = self.retrieve_corpus_topicdist(shorttext) + if len(topicdist) > 0: + topicvec = np.zeros(self.nb_topics) + for topicid, frac in topicdist: + topicvec[topicid] = frac + else: + topicvec = np.ones(self.nb_topics) + if self.normalize: + topicvec /= np.linalg.norm(topicvec) + return topicvec + + def get_batch_cos_similarities(self, shorttext: str) -> dict[str, float]: + """Get cosine similarities to all classes. + + Args: + shorttext: Input text. 
+ + Returns: + Dictionary mapping class labels to similarity scores. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + simdict = {} + similarities = self.matsim[self.retrieve_corpus_topicdist(shorttext)] + for label, similarity in zip(self.classlabels, similarities): + simdict[label] = float(similarity) + return simdict + + def loadmodel(self, nameprefix: str) -> None: + """Load topic model from files. + + Args: + nameprefix: Prefix for input files. + """ + # load the JSON file (parameters) + parameters = orjson.loads(open(nameprefix+'.json', 'rb').read()) + self.nb_topics = parameters['nb_topics'] + self.toweigh = parameters['toweigh'] + self.algorithm = parameters['algorithm'] + self.classlabels = parameters['classlabels'] + + # load the dictionary + self.dictionary = Dictionary.load(nameprefix+'.gensimdict') + + # load the topic model + self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel') + + # load the similarity matrix + self.matsim = MatrixSimilarity.load(nameprefix+'.gensimmat') + + # load the tf-idf modek + if self.toweigh: + self.tfidf = TfidfModel.load(nameprefix+'.gensimtfidf') + + # flag + self.trained = True + + def savemodel(self, nameprefix: str) -> None: + """Save topic model to files. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + parameters = {} + parameters['nb_topics'] = self.nb_topics + parameters['toweigh'] = self.toweigh + parameters['algorithm'] = self.algorithm + parameters['classlabels'] = self.classlabels + open(nameprefix+".json", "wb").write(orjson.dumps(parameters)) + + self.dictionary.save(nameprefix+'.gensimdict') + self.topicmodel.save(nameprefix+'.gensimmodel') + self.matsim.save(nameprefix+'.gensimmat') + if self.toweigh: + self.tfidf.save(nameprefix+'.gensimtfidf') + + def get_info(self) -> dict[str, Any]: + return {} + + +lda_suffices = [ + '.json', '.gensimdict', '.gensimmodel.state', '.gensimtfidf', '.gensimmodel', + '.gensimmat', '.gensimmodel.expElogbeta.npy', '.gensimmodel.id2word' +] + + +class LDAModeler(GensimTopicModeler, CompactIOMachine): + """LDA topic modeler with compact I/O support.""" + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + toweigh: bool = True, + normalize: bool = True + ): + GensimTopicModeler.__init__( + self, + preprocessor=preprocessor, + tokenizer=tokenizer, + algorithm="lda", + toweigh=toweigh, + normalize=normalize + ) + CompactIOMachine.__init__( + self, {'classifier': 'ldatopic'}, 'ldatopic', lda_suffices + ) + + def get_info(self) -> dict[str, Any]: + return CompactIOMachine.get_info(self) + + +lsi_suffices = ['.json', '.gensimdict', '.gensimtfidf', '.gensimmodel.projection', + '.gensimmodel', '.gensimmat'] + +class LSIModeler(GensimTopicModeler, CompactIOMachine): + """LSI topic modeler with compact I/O support.""" + + def __init__( + self, + preprocessor: Optional[callable] = None, + tokenizer: Optional[callable] = None, + toweigh: bool = True, + normalize: bool = True + ): + GensimTopicModeler.__init__( + self, + preprocessor=preprocessor, + tokenizer=tokenizer, + algorithm="lsi", + toweigh=toweigh, + normalize=normalize + ) + CompactIOMachine.__init__( + self, {'classifier': 'lsitopic'}, 'lsitopic', lsi_suffices + ) + + def get_info(self) -> dict[str, Any]: + return CompactIOMachine.get_info(self) + + 
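+# Illustrative usage of the modelers above (a sketch; ``classdict`` is assumed
+# to be training data in the ``{label: [texts]}`` format):
+#
+#     modeler = LDAModeler()
+#     modeler.train(classdict, 8)                         # fit an 8-topic LDA model
+#     modeler.retrieve_topicvec('bioinformatics')         # topic vector of a short text
+#     modeler.get_batch_cos_similarities('bioinformatics')
+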
+rp_suffices = ['.json', '.gensimtfidf', '.gensimmodel', '.gensimmat', '.gensimdict']
+
+class RPModeler(GensimTopicModeler, CompactIOMachine):
+    """Random Projection topic modeler with compact I/O support."""
+
+    def __init__(
+            self,
+            preprocessor: Optional[callable] = None,
+            tokenizer: Optional[callable] = None,
+            toweigh: bool = True,
+            normalize: bool = True
+    ):
+        GensimTopicModeler.__init__(
+            self,
+            preprocessor=preprocessor,
+            tokenizer=tokenizer,
+            algorithm="rp",
+            toweigh=toweigh,
+            normalize=normalize
+        )
+        CompactIOMachine.__init__(
+            self, {'classifier': 'rptopic'}, 'rptopic', rp_suffices
+        )
+
+    def get_info(self) -> dict[str, Any]:
+        return CompactIOMachine.get_info(self)
+
+
+def load_gensimtopicmodel(
+        name: str,
+        preprocessor: Optional[callable] = None,
+        tokenizer: Optional[callable] = None,
+        compact: bool = True
+) -> GensimTopicModeler:
+    """Load a gensim topic model from files.
+
+    Args:
+        name: Model name (compact) or file prefix (non-compact).
+        preprocessor: Text preprocessing function.
+        tokenizer: Tokenization function.
+        compact: Whether to load compact model. Default: True.
+
+    Returns:
+        A topic modeler instance.
+    """
+    if compact:
+        modeler_dict = {'ldatopic': LDAModeler, 'lsitopic': LSIModeler, 'rptopic': RPModeler}
+        classifier_name = str(get_model_classifier_name(name))
+        if classifier_name not in modeler_dict.keys():
+            raise ValueError(f"Unknown classifier name: {classifier_name}")
+
+        topic_modeler = modeler_dict[classifier_name](preprocessor=preprocessor, tokenizer=tokenizer)
+        topic_modeler.load_compact_model(name)
+    else:
+        modeler_dict = {'lda': LDAModeler, 'lsi': LSIModeler, 'rp': RPModeler}
+
+        config_info = orjson.loads(open(name+".json", "rb").read())
+        algorithm_name = config_info.get("algorithm")
+        if algorithm_name is None:
+            raise ValueError("No classifier name!")
+        if algorithm_name not in modeler_dict.keys():
+            raise ValueError(f"Unknown classifier name: {algorithm_name}")
+
+        topic_modeler = modeler_dict[algorithm_name](preprocessor=preprocessor, tokenizer=tokenizer)
+        topic_modeler.loadmodel(name)
+
+    return topic_modeler
diff --git a/src/shorttext/generators/bow/LatentTopicModeling.py b/src/shorttext/generators/bow/LatentTopicModeling.py
new file mode 100644
index 00000000..66af42ef
--- /dev/null
+++ b/src/shorttext/generators/bow/LatentTopicModeling.py
@@ -0,0 +1,161 @@
+
+from abc import ABC, abstractmethod
+from typing import Optional, Any
+
+import numpy as np
+import numpy.typing as npt
+
+from ...utils import textpreprocessing as textpreprocess, classification_exceptions as e
+from ...utils.textpreprocessing import tokenize
+
+
+# abstract class
+class LatentTopicModeler(ABC):
+    """Abstract base class for topic modelers.
+
+    Provides interface for converting short texts to topic vector
+    representations using various topic modeling algorithms.
+    """
+
+    def __init__(
+            self,
+            preprocessor: Optional[callable] = None,
+            tokenizer: Optional[callable] = None,
+            normalize: bool = True
+    ):
+        """Initialize the topic modeler.
+
+        Args:
+            preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1.
+            tokenizer: Tokenization function. Default: tokenize.
+            normalize: Whether to normalize output vectors. Default: True.
+ """ + if preprocessor is None: + self.preprocess_func = textpreprocess.standard_text_preprocessor_1() + else: + self.preprocess_func = preprocessor + if tokenizer is None: + self.tokenize_func = tokenize + else: + self.tokenize_func = tokenizer + + self.normalize = normalize + self.trained = False + + @abstractmethod + def train(self, classdict: dict[str, list[str]], nb_topics: int, *args, **kwargs) -> None: + """Train the topic modeler. + + Args: + classdict: Training data with class labels as keys and texts as values. + nb_topics: Number of latent topics. + *args: Additional arguments for the training algorithm. + **kwargs: Additional keyword arguments. + + Raises: + NotImplementedError: This is an abstract method. + """ + raise NotImplemented() + + @abstractmethod + def retrieve_bow(self, shorttext: str) -> list[tuple[int, int]]: + """Get bag-of-words representation. + + Args: + shorttext: Input text. + + Returns: + List of (word_id, count) tuples. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def retrieve_bow_vector(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get bag-of-words vector. + + Args: + shorttext: Input text. + + Returns: + BOW vector. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def retrieve_topicvec(self, shorttext: str) -> npt.NDArray[np.float64]: + """Get topic vector for short text. + + Args: + shorttext: Input text. + + Returns: + Topic vector. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def get_batch_cos_similarities(self, shorttext: str) -> dict[str, float]: + """Get cosine similarities to all classes. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to similarity scores. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + def __getitem__(self, shorttext) -> npt.NDArray[np.float64]: + """Get topic vector for text (shortcut for retrieve_topicvec).""" + return self.retrieve_topicvec(shorttext) + + def __contains__(self, shorttext): + """Check if model is trained.""" + if not self.trained: + raise e.ModelNotTrainedException() + return True + + @abstractmethod + def loadmodel(self, nameprefix: str): + """Load model from files. + + Args: + nameprefix: Prefix for input files. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def savemodel(self, nameprefix: str): + """Save model to files. + + Args: + nameprefix: Prefix for output files. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def get_info(self) -> dict[str, Any]: + """Get model metadata. + + Returns: + Dictionary with model information. + """ + raise NotImplemented() diff --git a/shorttext/generators/bow/__init__.py b/src/shorttext/generators/bow/__init__.py similarity index 67% rename from shorttext/generators/bow/__init__.py rename to src/shorttext/generators/bow/__init__.py index 5aad70c0..705c2c41 100644 --- a/shorttext/generators/bow/__init__.py +++ b/src/shorttext/generators/bow/__init__.py @@ -1,3 +1,4 @@ + from . import AutoEncodingTopicModeling from . import GensimTopicModeling -from . import LatentTopicModeling \ No newline at end of file +from . 
import LatentTopicModeling diff --git a/src/shorttext/generators/charbase/__init__.py b/src/shorttext/generators/charbase/__init__.py new file mode 100644 index 00000000..02f47f7c --- /dev/null +++ b/src/shorttext/generators/charbase/__init__.py @@ -0,0 +1,3 @@ + +from . import char2vec + diff --git a/src/shorttext/generators/charbase/char2vec.py b/src/shorttext/generators/charbase/char2vec.py new file mode 100644 index 00000000..c62a43a6 --- /dev/null +++ b/src/shorttext/generators/charbase/char2vec.py @@ -0,0 +1,146 @@ + +from functools import partial +from os import PathLike + +import numpy as np +import numpy.typing as npt +from scipy.sparse import csc_matrix +from gensim.corpora import Dictionary +from sklearn.preprocessing import OneHotEncoder +from deprecation import deprecated + +from ...utils.misc import textfile_generator + + +class SentenceToCharVecEncoder: + """One-hot encoder for character-level text representations. + + Converts sentences into one-hot encoded vectors at the character + level. Useful for character-level sequence models. + + Reference: + General architecture inspired by char-RNN and related models. + """ + + def __init__(self, dictionary: Dictionary, signalchar: str='\n'): + """Initialize the character vector encoder. + + Args: + dictionary: Gensim Dictionary mapping characters to indices. + signalchar: Signal character for sequence markers. Default: '\\n'. + """ + self.dictionary = dictionary + self.signalchar = signalchar + numchars = len(self.dictionary) + self.onehot_encoder = OneHotEncoder() + self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1))) + + def calculate_prelim_vec(self, sent: str) -> npt.NDArray[np.float64]: + """Convert sentence to one-hot character vectors. + + Args: + sent: Input sentence. + + Returns: + One-hot encoded sparse matrix where each row represents + a character's encoding. + """ + return self.onehot_encoder.transform( + np.array([self.dictionary.token2id[c] for c in sent]).reshape((len(sent), 1)) + ).astype(np.float64) + + def encode_sentence( + self, + sent: str, + maxlen: int, + startsig: bool = False, + endsig=False + ) -> csc_matrix: + """Encode a sentence to a sparse character vector matrix. + + Args: + sent: Input sentence to encode. + maxlen: Maximum length of the encoded sequence. + startsig: Whether to prepend signal character. Default: False. + endsig: Whether to append signal character. Default: False. + + Returns: + Sparse matrix representing the sentence with shape + (maxlen + startsig + endsig, num_chars). + """ + cor_sent = (self.signalchar if startsig else '') + sent[:min(maxlen, len(sent))] + (self.signalchar if endsig else '') + sent_vec = self.calculate_prelim_vec(cor_sent).tocsc() + if sent_vec.shape[0] == maxlen + startsig + endsig: + return sent_vec + else: + return csc_matrix((sent_vec.data, sent_vec.indices, sent_vec.indptr), + shape=(maxlen + startsig + endsig, sent_vec.shape[1]), + dtype=np.float64) + + def encode_sentences( + self, + sentences: list[str], + maxlen: int, + sparse: bool = True, + startsig: bool = False, + endsig: bool = False + ) -> list[npt.NDArray[np.float64]] | npt.NDArray[np.float64]: + """Encode multiple sentences into character vectors. + + Args: + sentences: List of sentences to encode. + maxlen: Maximum length for each encoded sentence. + sparse: Whether to return sparse matrices. Default: True. + startsig: Whether to prepend signal character. Default: False. + endsig: Whether to append signal character. Default: False. 
+
+        Returns:
+            If sparse=True: list of sparse matrices.
+            If sparse=False: numpy array of shape (n_sentences, maxlen, num_chars).
+        """
+        encode_sent_func = partial(self.encode_sentence, startsig=startsig, endsig=endsig, maxlen=maxlen)
+        list_encoded_sentences_map = map(encode_sent_func, sentences)
+        if sparse:
+            return list(list_encoded_sentences_map)
+        else:
+            return np.array([sparsevec.toarray() for sparsevec in list_encoded_sentences_map])
+
+    def __len__(self) -> int:
+        """Return the number of unique characters in the dictionary."""
+        return len(self.dictionary)
+
+
+def initialize_SentenceToCharVecEncoder(
+        textfile: str | PathLike,
+        encoding: str | None = None
+) -> SentenceToCharVecEncoder:
+    """Create a SentenceToCharVecEncoder from a text file.
+
+    Builds a character dictionary from the given text file and returns
+    an encoder instance.
+
+    Args:
+        textfile: Path to the text file for building the character dictionary.
+        encoding: Encoding of the text file. Default: None.
+
+    Returns:
+        A SentenceToCharVecEncoder instance.
+    """
+    dictionary = Dictionary(
+        map(
+            lambda line: [c for c in line],
+            textfile_generator(textfile, encoding=encoding)
+        )
+    )
+    return SentenceToCharVecEncoder(dictionary)
+
+
+@deprecated(deprecated_in="4.0.0", removed_in="5.0.0")
+def initSentenceToCharVecEncoder(
+        textfile: str | PathLike,
+        encoding: str | None = None
+) -> SentenceToCharVecEncoder:
+    """
+    Deprecated. Use initialize_SentenceToCharVecEncoder instead.
+    """
+    return initialize_SentenceToCharVecEncoder(textfile, encoding=encoding)
\ No newline at end of file
diff --git a/src/shorttext/generators/seq2seq/__init__.py b/src/shorttext/generators/seq2seq/__init__.py
new file mode 100644
index 00000000..40c864ad
--- /dev/null
+++ b/src/shorttext/generators/seq2seq/__init__.py
@@ -0,0 +1,3 @@
+
+from . import s2skeras
+from . import charbaseS2S
diff --git a/src/shorttext/generators/seq2seq/charbaseS2S.py b/src/shorttext/generators/seq2seq/charbaseS2S.py
new file mode 100644
index 00000000..ccaf26a0
--- /dev/null
+++ b/src/shorttext/generators/seq2seq/charbaseS2S.py
@@ -0,0 +1,212 @@
+
+from typing import Literal
+from os import PathLike
+
+import numpy as np
+import numpy.typing as npt
+import gensim
+import orjson
+
+from .s2skeras import Seq2SeqWithKeras, load_seq2seq_model, kerasseq2seq_suffices
+from ..charbase.char2vec import SentenceToCharVecEncoder
+from ...utils.compactmodel_io import CompactIOMachine
+
+
+charbases2s_suffices = kerasseq2seq_suffices + ['_dictionary.dict', '_charbases2s.json']
+
+
+class CharBasedSeq2SeqGenerator(CompactIOMachine):
+    """Character-based sequence-to-sequence model.
+
+    Implements seq2seq at the character level. Uses Seq2SeqWithKeras internally.
+
+    Reference:
+        Oriol Vinyals, Quoc Le, "A Neural Conversational Model," arXiv:1506.05869 (2015).
+        https://arxiv.org/abs/1506.05869
+    """
+    def __init__(
+            self,
+            sent2charvec_encoder: SentenceToCharVecEncoder,
+            latent_dim: int,
+            maxlen: int
+    ):
+        """Initialize the generator.
+
+        Args:
+            sent2charvec_encoder: Character encoder.
+            latent_dim: Number of latent dimensions.
+            maxlen: Maximum length of a sentence.
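+
+        Example (a sketch; ``corpus.txt`` is a hypothetical plain-text file used
+        to build the character dictionary):
+
+            from shorttext.generators.charbase.char2vec import initialize_SentenceToCharVecEncoder
+
+            encoder = initialize_SentenceToCharVecEncoder('corpus.txt')
+            generator = CharBasedSeq2SeqGenerator(encoder, latent_dim=256, maxlen=120)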
+ """ + super().__init__( + {'classifier': 'charbases2s'}, + 'charbases2s', + charbases2s_suffices + ) + self.compiled = False + if sent2charvec_encoder != None: + self.sent2charvec_encoder = sent2charvec_encoder + self.dictionary = self.sent2charvec_encoder.dictionary + self.nbelem = len(self.dictionary) + self.latent_dim = latent_dim + self.maxlen = maxlen + self.s2sgenerator = Seq2SeqWithKeras(self.nbelem, self.latent_dim) + + def compile( + self, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'rmsprop', + loss: str = 'categorical_crossentropy' + ) -> None: + """Compile the Keras model. + + Args: + optimizer: Optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: rmsprop. + loss: Loss function from tensorflow.keras. Default: 'categorical_crossentropy'. + """ + if not self.compiled: + self.s2sgenerator.prepare_model() + self.s2sgenerator.compile(optimizer=optimizer, loss=loss) + self.compiled = True + + def prepare_trainingdata( + self, + txtseq: str + ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]]: + """Transform text to numerical vector format. + + Args: + txtseq: Input text. + + Returns: + Tuple of (encoder_input, decoder_input, decoder_output) as rank-3 tensors. + """ + encoder_input = self.sent2charvec_encoder.encode_sentences(txtseq[:-1], startsig=True, maxlen=self.maxlen, sparse=False) + decoder_input = self.sent2charvec_encoder.encode_sentences(txtseq[1:], startsig=True, maxlen=self.maxlen, sparse=False) + decoder_output = self.sent2charvec_encoder.encode_sentences(txtseq[1:], endsig=True, maxlen=self.maxlen, sparse=False) + return encoder_input, decoder_input, decoder_output + + def train( + self, + txtseq: str, + batch_size: int = 64, + epochs: int = 100, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'rmsprop', + loss: str = 'categorical_crossentropy' + ) -> None: + """Train the character-based seq2seq model. + + Args: + txtseq: Training text. + batch_size: Batch size. Default: 64. + epochs: Number of epochs. Default: 100. + optimizer: Optimizer for gradient descent. Default: rmsprop. + loss: Loss function from tensorflow.keras. Default: 'categorical_crossentropy'. + """ + encoder_input, decoder_input, decoder_output = self.prepare_trainingdata(txtseq) + self.compile(optimizer=optimizer, loss=loss) + self.s2sgenerator.fit(encoder_input, decoder_input, decoder_output, batch_size=batch_size, epochs=epochs) + + def decode(self, txtseq: str, stochastic: bool=True) -> str: + """Generate output text from input text. + + Args: + txtseq: Input text. + stochastic: Whether to use stochastic sampling. Default: True. + + Returns: + Generated output text. + """ + # Encode the input as state vectors. + inputvec = np.array([self.sent2charvec_encoder.encode_sentence(txtseq, maxlen=self.maxlen, endsig=True).toarray()]) + states_value = self.s2sgenerator.encoder_model.predict(inputvec) + + # Generate empty target sequence of length 1. + target_seq = np.zeros((1, 1, self.nbelem)) + # Populate the first character of target sequence with the start character. + target_seq[0, 0, self.dictionary.token2id['\n']] = 1. + + # Sampling loop for a batch of sequences + # (to simplify, here we assume a batch of size 1). 
+ stop_condition = False + decoded_txtseq = '' + while not stop_condition: + output_tokens, h, c = self.s2sgenerator.decoder_model.predict([target_seq] + states_value) + + # Sample a token + if stochastic: + sampled_token_index = np.random.choice(np.arange(output_tokens.shape[2]), + p=output_tokens[0, -1, :]) + else: + sampled_token_index = np.argmax(output_tokens[0, -1, :]) + sampled_char = self.dictionary[sampled_token_index] + decoded_txtseq += sampled_char + + # Exit condition: either hit max length + # or find stop character. + if (sampled_char == '\n' or len(decoded_txtseq) > self.maxlen): + stop_condition = True + + # Update the target sequence (of length 1). + target_seq = np.zeros((1, 1, self.nbelem)) + target_seq[0, 0, sampled_token_index] = 1. + + # Update states + states_value = [h, c] + + return decoded_txtseq + + def savemodel(self, prefix: str, final: bool=False) -> None: + """Save the trained model to files. + + For compact save, use save_compact_model instead. + + Args: + prefix: Prefix of the file path. + final: Whether the model is final (cannot be further trained). Default: False. + + Raises: + ModelNotTrainedException: If no trained model exists. + """ + self.s2sgenerator.savemodel(prefix, final=final) + self.dictionary.save(prefix+'_dictionary.dict') + open(prefix + '_charbases2s.json', 'wb').write( + orjson.dumps({ + 'maxlen': self.maxlen, 'latent_dim': self.latent_dim + }) + ) + + def loadmodel(self, prefix: str) -> None: + """Load a trained model from files. + + For compact load, use load_compact_model instead. + + Args: + prefix: Prefix of the file path. + """ + self.dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') + self.s2sgenerator = load_seq2seq_model(prefix, compact=False) + self.sent2charvec_encoder = SentenceToCharVecEncoder(self.dictionary) + self.nbelem = len(self.dictionary) + hyperparameters = orjson.loads(open(prefix+'_charbases2s.json', 'rb').read()) + self.latent_dim, self.maxlen = hyperparameters['latent_dim'], hyperparameters['maxlen'] + self.compiled = True + + +def loadCharBasedSeq2SeqGenerator( + path: str | PathLike, + compact: bool = True +) -> CharBasedSeq2SeqGenerator: + """Load a trained CharBasedSeq2SeqGenerator from file. + + Args: + path: Path of the model file. + compact: Whether to load a compact model. Default: True. + + Returns: + CharBasedSeq2SeqGenerator instance for seq2seq inference. 
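+
+    Example (a sketch; the file name is hypothetical):
+
+        s2s = loadCharBasedSeq2SeqGenerator('conversation_model.bin')
+        reply = s2s.decode('how are you')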
+ """ + seq2seqer = CharBasedSeq2SeqGenerator(None, 0, 0) + if compact: + seq2seqer.load_compact_model(path) + else: + seq2seqer.loadmodel(path) + return seq2seqer diff --git a/src/shorttext/generators/seq2seq/s2skeras.py b/src/shorttext/generators/seq2seq/s2skeras.py new file mode 100644 index 00000000..9b8a452b --- /dev/null +++ b/src/shorttext/generators/seq2seq/s2skeras.py @@ -0,0 +1,209 @@ + +from typing import Literal +from os import PathLike + +import numpy as np +import numpy.typing as npt +import orjson +from tensorflow.keras.models import load_model +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Input, LSTM, Dense +from deprecation import deprecated + +from ...utils.compactmodel_io import CompactIOMachine +from ...utils.classification_exceptions import ModelNotTrainedException + +# Reference: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html + +kerasseq2seq_suffices = ['.weights.h5', '.json', '_s2s_hyperparam.json', '_encoder.weights.h5', '_encoder.json', '_decoder.h5', '_decoder.weights.json'] + + +class Seq2SeqWithKeras(CompactIOMachine): + """Sequence-to-sequence (seq2seq) model using Keras. + + Implements encoder-decoder architecture for sequence generation tasks. + + Reference: + Ilya Sutskever, James Martens, Geoffrey Hinton, "Generating Text with Recurrent Neural Networks," + ICML (2011). https://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf + + Ilya Sutskever, Oriol Vinyals, Quoc V. Le, "Sequence to Sequence Learning with Neural Networks," + arXiv:1409.3215 (2014). https://arxiv.org/abs/1409.3215 + + Francois Chollet, "A ten-minute introduction to sequence-to-sequence learning in Keras," + The Keras Blog. https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html + + Aurelien Geron, Hands-On Machine Learning with Scikit-Learn and TensorFlow (Sebastopol, CA: O'Reilly Media, 2017). + """ + def __init__(self, vecsize: int, latent_dim: int): + """Initialize the model. + + Args: + vecsize: Vector size of the sequence. + latent_dim: Latent dimension in the RNN cell. + """ + super().__init__( + {'classifier': 'kerasseq2seq'}, + 'kerasseq2seq', + kerasseq2seq_suffices + ) + self.vecsize = vecsize + self.latent_dim = latent_dim + self.compiled = False + self.trained = False + + def prepare_model(self) -> None: + """Prepare the Keras model.""" + # Define an input sequence and process it. + encoder_inputs = Input(shape=(None, self.vecsize)) + encoder = LSTM(self.latent_dim, return_state=True) + encoder_outputs, state_h, state_c = encoder(encoder_inputs) + # We discard `encoder_outputs` and only keep the states. + encoder_states = [state_h, state_c] + + # Set up the decoder, using `encoder_states` as initial state. + decoder_inputs = Input(shape=(None, self.vecsize)) + # We set up our decoder to return full output sequences, + # and to return internal states as well. We don't use the + # return states in the training model, but we will use them in inference. 
+ decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True) + decoder_outputs, _, _ = decoder_lstm(decoder_inputs, + initial_state=encoder_states) + decoder_dense = Dense(self.vecsize, activation='softmax') + decoder_outputs = decoder_dense(decoder_outputs) + + # Define the model that will turn + # `encoder_input_data` & `decoder_input_data` into `decoder_target_data` + model = Model([encoder_inputs, decoder_inputs], decoder_outputs) + + # Define sampling models + encoder_model = Model(encoder_inputs, encoder_states) + + decoder_state_input_h = Input(shape=(self.latent_dim,)) + decoder_state_input_c = Input(shape=(self.latent_dim,)) + decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c] + decoder_outputs, state_h, state_c = decoder_lstm( + decoder_inputs, initial_state=decoder_states_inputs) + decoder_states = [state_h, state_c] + decoder_outputs = decoder_dense(decoder_outputs) + decoder_model = Model([decoder_inputs] + decoder_states_inputs, + [decoder_outputs] + decoder_states) + + self.model = model + self.encoder_model = encoder_model + self.decoder_model = decoder_model + + def compile( + self, + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = 'rmsprop', + loss: str = 'categorical_crossentropy' + ) -> None: + """Compile the Keras model. + + Args: + optimizer: Optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: rmsprop. + loss: Loss function from tensorflow.keras. Default: 'categorical_crossentropy'. + """ + self.model.compile(optimizer=optimizer, loss=loss) + self.compiled = True + + def fit( + self, + encoder_input: npt.NDArray[np.float64], + decoder_input: npt.NDArray[np.float64], + decoder_output: npt.NDArray[np.float64], + batch_size: int = 64, + epochs: int = 100 + ) -> None: + """Fit the seq2seq model. + + Args: + encoder_input: Encoder input, a rank-3 tensor. + decoder_input: Decoder input, a rank-3 tensor. + decoder_output: Decoder output, a rank-3 tensor. + batch_size: Batch size. Default: 64. + epochs: Number of epochs. Default: 100. + """ + self.model.fit([encoder_input, decoder_input], decoder_output, + batch_size=batch_size, + epochs=epochs) + self.trained = True + + def savemodel(self, prefix: str, final: bool=False) -> None: + """Save the trained model to files. + + For compact save, use save_compact_model instead. + + Args: + prefix: Prefix of the file path. + final: Whether the model is final (cannot be further trained). Default: False. + + Raises: + ModelNotTrainedException: If no trained model exists. + """ + if not self.trained: + raise ModelNotTrainedException() + + # save hyperparameters + open(prefix + '_s2s_hyperparam.json', 'wb').write( + orjson.dumps({'vecsize': self.vecsize, 'latent_dim': self.latent_dim}) + ) + + # save whole model + if final: + self.model.save_weights(prefix+'.weights.h5') + else: + self.model.save(prefix+'.weights.h5') + open(prefix+'.json', 'w').write(self.model.to_json()) + + # save encoder and decoder + if final: + self.encoder_model.save_weights(prefix+'_encoder.weights.h5') + self.decoder_model.save_weights(prefix + '_decoder.weights.h5') + else: + self.encoder_model.save(prefix + '_encoder.weights.h5') + self.decoder_model.save(prefix+'_decoder.weights.h5') + open(prefix+'_encoder.json', 'w').write(self.encoder_model.to_json()) + open(prefix+'_decoder.json', 'w').write(self.decoder_model.to_json()) + + def loadmodel(self, prefix: str) -> None: + """Load a trained model from files. 
+ + For compact load, use load_compact_model instead. + + Args: + prefix: Prefix of the file path. + """ + hyperparameters = orjson.loads(open(prefix+'_s2s_hyperparam.json', 'rb').read()) + self.vecsize, self.latent_dim = hyperparameters['vecsize'], hyperparameters['latent_dim'] + self.model = load_model(prefix+'.weights.h5') + self.encoder_model = load_model(prefix+'_encoder.weights.h5') + self.decoder_model = load_model(prefix+'_decoder.weights.h5') + self.trained = True + + +def load_seq2seq_model(path: str | PathLike, compact: bool=True) -> Seq2SeqWithKeras: + """Load a trained Seq2SeqWithKeras model from file. + + Args: + path: Path of the model file. + compact: Whether to load a compact model. Default: True. + + Returns: + Seq2SeqWithKeras instance for sequence-to-sequence inference. + """ + generator = Seq2SeqWithKeras(0, 0) + if compact: + generator.load_compact_model(path) + else: + generator.loadmodel(path) + generator.compiled = True + return generator + + +@deprecated(deprecated_in="4.0.0", removed_in="5.0.0") +def loadSeq2SeqWithKeras(path: str | PathLike, compact: bool=True) -> Seq2SeqWithKeras: + """ + Deprecated. Call load_seq2seq_model instead. + """ + return load_seq2seq_model(path, compact=compact) diff --git a/src/shorttext/metrics/__init__.py b/src/shorttext/metrics/__init__.py new file mode 100644 index 00000000..bfb2af18 --- /dev/null +++ b/src/shorttext/metrics/__init__.py @@ -0,0 +1,4 @@ + +from . import dynprog +from . import embedfuzzy +from . import wasserstein diff --git a/src/shorttext/metrics/dynprog/__init__.py b/src/shorttext/metrics/dynprog/__init__.py new file mode 100644 index 00000000..4bb67a12 --- /dev/null +++ b/src/shorttext/metrics/dynprog/__init__.py @@ -0,0 +1,4 @@ + +from . import dldist +from . import jaccard +from . import lcp diff --git a/src/shorttext/metrics/dynprog/dldist.py b/src/shorttext/metrics/dynprog/dldist.py new file mode 100644 index 00000000..462f517f --- /dev/null +++ b/src/shorttext/metrics/dynprog/dldist.py @@ -0,0 +1,45 @@ + +import numpy as np +import numba as nb + + +@nb.njit +def damerau_levenshtein(word1: str, word2: str) -> int: + """Calculate the Damerau-Levenshtein distance between two words. + + Computes the edit distance considering adjacent transpositions + (swapping two adjacent characters counts as one edit). + + Args: + word1: First word. + word2: Second word. + + Returns: + The Damerau-Levenshtein distance between the two words. 
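+
+    Example (expected values):
+        damerau_levenshtein('debug', 'deubg')     # -> 1, one adjacent transposition
+        damerau_levenshtein('kitten', 'sitting')  # -> 3, classic Levenshtein example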
+
+    Reference:
+        https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
+    """
+    len1 = len(word1)
+    len2 = len(word2)
+    matrix = np.zeros((len1+1, len2+1), dtype=np.int64)
+
+    for i in range(len1+1):
+        matrix[i, 0] = i
+    for j in range(len2+1):
+        matrix[0, j] = j
+
+    for i in range(1, len1+1):
+        for j in range(1, len2+1):
+            cost = 0 if word1[i-1] == word2[j-1] else 1
+            delcost = matrix[i-1, j] + 1
+            inscost = matrix[i, j-1] + 1
+            subcost = matrix[i-1, j-1] + cost
+            score = min(min(delcost, inscost), subcost)
+            if (i > 1) and (j > 1) and (word1[i-1] == word2[j-2]) and (word1[i-2] == word2[j-1]):
+                score = min(score, matrix[i-2, j-2] + cost)
+            matrix[i, j] = score
+
+    return matrix[len1, len2]
diff --git a/src/shorttext/metrics/dynprog/jaccard.py b/src/shorttext/metrics/dynprog/jaccard.py
new file mode 100644
index 00000000..e1091ac0
--- /dev/null
+++ b/src/shorttext/metrics/dynprog/jaccard.py
@@ -0,0 +1,83 @@
+
+from itertools import product
+
+from .dldist import damerau_levenshtein
+from .lcp import longest_common_prefix
+
+
+def similarity(word1: str, word2: str) -> float:
+    """Calculate similarity between two words.
+
+    Computes similarity as the maximum of:
+    - 1 - Damerau-Levenshtein distance / max length
+    - Longest common prefix length / max length
+
+    Args:
+        word1: First word.
+        word2: Second word.
+
+    Returns:
+        Similarity score between 0 and 1.
+
+    Reference:
+        Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen,
+        "Computer-Based Coding of Occupation Codes for Epidemiological Analyses,"
+        IEEE CBMS 2014, pp. 347-350.
+        http://ieeexplore.ieee.org/abstract/document/6881904/
+    """
+    maxlen = max(len(word1), len(word2))
+    editdistance = damerau_levenshtein(word1, word2)
+    lcp = longest_common_prefix(word1, word2)
+    return max(1. - float(editdistance)/maxlen, float(lcp)/maxlen)
+
+
+def soft_intersection_list(tokens1: list[str], tokens2: list[str]) -> set[tuple[tuple[str, str], float]]:
+    """Compute soft intersection between two token lists.
+
+    Finds the best matching pairs between tokens using similarity,
+    where each token can only match once.
+
+    Args:
+        tokens1: First list of tokens.
+        tokens2: Second list of tokens.
+
+    Returns:
+        Set of ((token1, token2), similarity) tuples representing matches.
+    """
+    intersected_list = [((token1, token2), similarity(token1, token2)) for token1, token2 in product(tokens1, tokens2)]
+    intersected_list = sorted(intersected_list, key=lambda item: item[1], reverse=True)
+
+    included_list = set()
+    used_tokens1 = set()
+    used_tokens2 = set()
+    for (token1, token2), sim in intersected_list:
+        if (not (token1 in used_tokens1)) and (not (token2 in used_tokens2)):
+            included_list.add(((token1, token2), sim))
+            used_tokens1.add(token1)
+            used_tokens2.add(token2)
+
+    return included_list
+
+
+def soft_jaccard_score(tokens1: list[str], tokens2: list[str]) -> float:
+    """Compute soft Jaccard score between token lists.
+
+    Uses fuzzy matching based on edit distance and longest common prefix.
+
+    Args:
+        tokens1: First list of tokens.
+        tokens2: Second list of tokens.
+
+    Returns:
+        Soft Jaccard score between 0 and 1.
+
+    Reference:
+        Daniel E. Russ, Kwan-Yuet Ho, Calvin A. Johnson, Melissa C. Friesen,
+        "Computer-Based Coding of Occupation Codes for Epidemiological Analyses,"
+        IEEE CBMS 2014, pp. 347-350.
+ http://ieeexplore.ieee.org/abstract/document/6881904/ + """ + intersection_list = soft_intersection_list(tokens1, tokens2) + num_intersections = sum([item[1] for item in intersection_list]) + num_unions = len(tokens1) + len(tokens2) - num_intersections + return num_intersections / num_unions diff --git a/src/shorttext/metrics/dynprog/lcp.py b/src/shorttext/metrics/dynprog/lcp.py new file mode 100644 index 00000000..a91bee2e --- /dev/null +++ b/src/shorttext/metrics/dynprog/lcp.py @@ -0,0 +1,22 @@ + +import numba as nb + + +@nb.njit +def longest_common_prefix(word1: str, word2: str) -> int: + """Calculate the longest common prefix length of two strings. + + Args: + word1: First string. + word2: Second string. + + Returns: + Length of the longest common prefix. + """ + lcp = 0 + for i in range(min(len(word1), len(word2))): + if word1[i] == word2[i]: + lcp += 1 + else: + break + return lcp diff --git a/src/shorttext/metrics/embedfuzzy/__init__.py b/src/shorttext/metrics/embedfuzzy/__init__.py new file mode 100644 index 00000000..d1b9b4d9 --- /dev/null +++ b/src/shorttext/metrics/embedfuzzy/__init__.py @@ -0,0 +1,2 @@ + +from .jaccard import jaccardscore_sents \ No newline at end of file diff --git a/src/shorttext/metrics/embedfuzzy/jaccard.py b/src/shorttext/metrics/embedfuzzy/jaccard.py new file mode 100644 index 00000000..7d3c073c --- /dev/null +++ b/src/shorttext/metrics/embedfuzzy/jaccard.py @@ -0,0 +1,61 @@ + +from itertools import product +from typing import Optional + +import numpy as np +from gensim.models.keyedvectors import KeyedVectors + +from ...utils import tokenize +from ...utils.compute import cosine_similarity + + +def jaccardscore_sents( + sent1: str, + sent2: str, + wvmodel: KeyedVectors, + sim_words: Optional[callable] = None +) -> float: + """Compute Jaccard score between sentences using embeddings. + + Uses word embeddings to compute a fuzzy Jaccard score where + word similarity is measured via embedding cosine similarity. + + Args: + sent1: First sentence. + sent2: Second sentence. + wvmodel: Word embedding model. + sim_words: Similarity function for word vectors. Default: cosine. + + Returns: + Fuzzy Jaccard score between 0 and 1. + """ + if sim_words is None: + sim_words = cosine_similarity + + tokens1 = tokenize(sent1) + tokens2 = tokenize(sent2) + tokens1 = list(filter(lambda w: w in wvmodel, tokens1)) + tokens2 = list(filter(lambda w: w in wvmodel, tokens2)) + allowable1 = [True] * len(tokens1) + allowable2 = [True] * len(tokens2) + + simdict = {(i, j): sim_words(wvmodel[tokens1[i]].astype(np.float64), wvmodel[tokens2[j]].astype(np.float64)) + for i, j in product(range(len(tokens1)), range(len(tokens2)))} + + intersection = 0.0 + simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True) + for idxtuple, sim in simdictitems: + i, j = idxtuple + if allowable1[i] and allowable2[j]: + intersection += sim + allowable1[i] = False + allowable2[j] = False + + union = len(tokens1) + len(tokens2) - intersection + + if union > 0: + return intersection / union + elif intersection == 0: + return 1. 
+ else: + return np.inf diff --git a/src/shorttext/metrics/wasserstein/__init__.py b/src/shorttext/metrics/wasserstein/__init__.py new file mode 100755 index 00000000..d274bad0 --- /dev/null +++ b/src/shorttext/metrics/wasserstein/__init__.py @@ -0,0 +1,2 @@ + +from .wordmoverdist import word_mover_distance_linprog, word_mover_distance \ No newline at end of file diff --git a/src/shorttext/metrics/wasserstein/wordmoverdist.py b/src/shorttext/metrics/wasserstein/wordmoverdist.py new file mode 100644 index 00000000..75bb8caf --- /dev/null +++ b/src/shorttext/metrics/wasserstein/wordmoverdist.py @@ -0,0 +1,111 @@ + +from itertools import product +from typing import Optional +import warnings + +import numpy as np +from scipy.spatial.distance import euclidean +from scipy.sparse import csr_matrix +from scipy.optimize import linprog, OptimizeResult +from gensim.models.keyedvectors import KeyedVectors + +from ...utils.gensim_corpora import tokens_to_fracdict + + +def word_mover_distance_linprog( + first_sent_tokens: list[str], + second_sent_tokens: list[str], + wvmodel: KeyedVectors, + distancefunc: Optional[callable] = None +) -> OptimizeResult: + """Compute Word Mover's distance via linear programming. + + Uses scipy.optimize.linprog to compute the transport problem + for the Word Mover's Distance. + + Args: + first_sent_tokens: First list of tokens. + second_sent_tokens: Second list of tokens. + wvmodel: Word embedding model. + distancefunc: Distance function for word vectors. Default: Euclidean. + + Returns: + scipy.optimize.OptimizeResult containing the optimization result. + + Reference: + Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, + "From Word Embeddings to Document Distances," ICML 2015. + """ + if distancefunc is None: + distancefunc = euclidean + + nb_tokens_first_sent = len(first_sent_tokens) + nb_tokens_second_sent = len(second_sent_tokens) + + all_tokens = list(set(first_sent_tokens+second_sent_tokens)) + wordvecs = {token: wvmodel[token].astype(np.float64) for token in all_tokens} + + first_sent_buckets = tokens_to_fracdict(first_sent_tokens) + second_sent_buckets = tokens_to_fracdict(second_sent_tokens) + + collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j + + # assigning T + T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent) + for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)): + T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]], + wordvecs[second_sent_tokens[j]]) + + # assigning Aeq and beq + Aeq = csr_matrix( + (nb_tokens_first_sent+nb_tokens_second_sent, + nb_tokens_first_sent*nb_tokens_second_sent) + ) + beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent) + for i in range(nb_tokens_first_sent): + for j in range(nb_tokens_second_sent): + Aeq[i, collapsed_idx_func(i, j)] = 1. + beq[i] = first_sent_buckets[first_sent_tokens[i]] + for j in range(nb_tokens_second_sent): + for i in range(nb_tokens_first_sent): + Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1. + beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]] + + return linprog(T, A_eq=Aeq, b_eq=beq) + + +def word_mover_distance( + first_sent_tokens: list[str], + second_sent_tokens: list[str], + wvmodel: KeyedVectors, + distancefunc: Optional[callable] = None +) -> float: + """Compute Word Mover's distance between token lists. + + Uses word embeddings to compute the minimum transport cost + between words in two sentences. + + Args: + first_sent_tokens: First list of tokens. 
+ second_sent_tokens: Second list of tokens. + wvmodel: Word embedding model. + distancefunc: Distance function for word vectors. Default: Euclidean. + + Returns: + The Word Mover's distance (lower is more similar). + + Reference: + Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, + "From Word Embeddings to Document Distances," ICML 2015. + """ + if distancefunc is None: + distancefunc = euclidean + + linprog_result = word_mover_distance_linprog( + first_sent_tokens, + second_sent_tokens, + wvmodel, + distancefunc=distancefunc + ) + + return linprog_result['fun'] diff --git a/src/shorttext/schemas/__init__.py b/src/shorttext/schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/shorttext/schemas/models.py b/src/shorttext/schemas/models.py new file mode 100644 index 00000000..b85d3f13 --- /dev/null +++ b/src/shorttext/schemas/models.py @@ -0,0 +1,18 @@ + +from dataclasses import dataclass + +from tensorflow.keras import Model + + +@dataclass +class AutoEncoderPackage: + """Package containing autoencoder components. + + Attributes: + autoencoder: The full autoencoder model. + encoder: The encoder part of the autoencoder. + decoder: The decoder part of the autoencoder. + """ + autoencoder: Model + encoder: Model + decoder: Model diff --git a/src/shorttext/smartload.py b/src/shorttext/smartload.py new file mode 100644 index 00000000..61417cef --- /dev/null +++ b/src/shorttext/smartload.py @@ -0,0 +1,71 @@ + +from typing import Optional +from os import PathLike + +import gensim + +from .utils import standard_text_preprocessor_1 +from .utils import compactmodel_io as cio +from .utils import classification_exceptions as e +from .classifiers import load_varnnlibvec_classifier, load_sumword2vec_classifier +from .generators import load_autoencoder_topicmodel, load_gensimtopicmodel +from .generators import load_seq2seq_model, loadCharBasedSeq2SeqGenerator +from .classifiers import load_autoencoder_topic_sklearnclassifier, load_gensim_topicvec_sklearnclassifier +from .classifiers import load_maxent_classifier +from .utils.dtm import load_numpy_documentmatrixmatrix + + +def smartload_compact_model( + filename: str | PathLike, + wvmodel: Optional[gensim.models.keyedvectors.KeyedVectors], + preprocessor: Optional[callable] = None, + vecsize: Optional[int] = None +): + """Load a classifier or model from a compact file. + + Automatically detects the model type and loads the appropriate classifier. + Set wvmodel to None if no word embedding model is needed. + + Args: + filename: Path to the compact model file. + wvmodel: Word embedding model. Can be None for non-embedding models. + preprocessor: Text preprocessing function. Default: standard_text_preprocessor_1. + vecsize: Vector size. Default: None (extracted from model). + + Returns: + Appropriate classifier or model instance. + + Raises: + AlgorithmNotExistException: If model type is unknown. 
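+
+    Example (a sketch; the file name is hypothetical):
+
+        modeler = smartload_compact_model('sample_lda.bin', None)   # topic models need no wvmodel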
+ """ + if preprocessor is None: + preprocessor = standard_text_preprocessor_1() + + classifier_name = cio.get_model_classifier_name(filename) + match classifier_name: + case 'ldatopic' | 'lsitopic' | 'rptopic': + return load_gensimtopicmodel(filename, preprocessor=preprocessor, compact=True) + case 'kerasautoencoder': + return load_autoencoder_topicmodel(filename, preprocessor=preprocessor, compact=True) + case 'topic_sklearn': + topicmodel = cio.get_model_config_field(filename, 'topicmodel') + if topicmodel in ['ldatopic', 'lsitopic', 'rptopic']: + return load_gensim_topicvec_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) + elif topicmodel in ['kerasautoencoder']: + return load_autoencoder_topic_sklearnclassifier(filename, preprocessor=preprocessor, compact=True) + else: + raise e.AlgorithmNotExistException(topicmodel) + case 'nnlibvec': + return load_varnnlibvec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) + case 'sumvec': + return load_sumword2vec_classifier(wvmodel, filename, compact=True, vecsize=vecsize) + case 'maxent': + return load_maxent_classifier(filename, compact=True) + case 'kerasseq2seq': + return load_seq2seq_model(filename, compact=True) + case 'charbases2s': + return loadCharBasedSeq2SeqGenerator(filename, compact=True) + case "npdtm": + return load_numpy_documentmatrixmatrix(filename) + case _: + raise e.AlgorithmNotExistException(classifier_name) diff --git a/src/shorttext/spell/__init__.py b/src/shorttext/spell/__init__.py new file mode 100644 index 00000000..9515448a --- /dev/null +++ b/src/shorttext/spell/__init__.py @@ -0,0 +1,5 @@ + +from .basespellcorrector import SpellCorrector + +from .norvig import NorvigSpellCorrector + diff --git a/src/shorttext/spell/basespellcorrector.py b/src/shorttext/spell/basespellcorrector.py new file mode 100644 index 00000000..c9632c04 --- /dev/null +++ b/src/shorttext/spell/basespellcorrector.py @@ -0,0 +1,30 @@ + +from abc import ABC, abstractmethod + + +class SpellCorrector(ABC): + """Abstract base class for spell correctors. + + Defines the interface for spelling correction algorithms. + """ + + @abstractmethod + def train(self, text: str) -> None: + """Train the spell corrector on a corpus. + + Args: + text: Training text corpus. + """ + raise NotImplemented() + + @abstractmethod + def correct(self, word: str) -> str: + """Recommend a spelling correction for a word. + + Args: + word: Word to correct. + + Returns: + The corrected word. + """ + return word diff --git a/src/shorttext/spell/editor.py b/src/shorttext/spell/editor.py new file mode 100644 index 00000000..bdd9e151 --- /dev/null +++ b/src/shorttext/spell/editor.py @@ -0,0 +1,46 @@ + +from typing import Generator + +import numba as nb + + +@nb.njit +def compute_set_edits1(word: str) -> set[str]: + """Generate all single-edit distance words. + + Creates all possible words that are one edit (insert, delete, + transpose, replace) away from the input word. + + Args: + word: Input word. + + Returns: + Set of all possible single-edit variations. 
+ """ + letters = 'abcdefghijklmnopqrstuvwxyz' + + splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] + deletes = [L + R[1:] for L, R in splits if R] + transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] + replaces = [L + c + R[1:] for L, R in splits if R for c in letters] + inserts = [L + c + R for L, R in splits for c in letters] + + returned_set = set(deletes + transposes + replaces + inserts) + + return returned_set + + +@nb.njit +def compute_set_edits2(word: str) -> Generator[str, None, None]: + """Generate all double-edit distance words. + + Creates all possible words that are two edits away from the + input word by applying compute_set_edits1 to each result. + + Args: + word: Input word. + + Yields: + All possible double-edit variations. + """ + return (e2 for e1 in compute_set_edits1(word) for e2 in compute_set_edits1(e1)) diff --git a/src/shorttext/spell/norvig.py b/src/shorttext/spell/norvig.py new file mode 100644 index 00000000..48bc74c2 --- /dev/null +++ b/src/shorttext/spell/norvig.py @@ -0,0 +1,83 @@ + +# reference: https://norvig.com/spell-correct.html + +import re +from collections import Counter +from typing import Generator + +from . import SpellCorrector +from .editor import compute_set_edits1, compute_set_edits2 + + +class NorvigSpellCorrector(SpellCorrector): + """Spell corrector based on Peter Norvig's algorithm. + + Uses word frequency counts to suggest corrections for misspelled + words by finding edits that exist in the vocabulary. + + Reference: + https://norvig.com/spell-correct.html + """ + + def __init__(self): + """Initialize the spell corrector.""" + self.train('') + + def train(self, text: str) -> None: + """Train on a text corpus. + + Builds a word frequency dictionary from the input text. + + Args: + text: Training text corpus. + """ + self.words = re.findall('\\w+', text.lower()) + self.WORDS = Counter(self.words) + self.N = sum(self.WORDS.values()) + + def P(self, word: str) -> float: + """Compute word probability from the training corpus. + + Args: + word: Word to get probability for. + + Returns: + Probability of the word appearing in the corpus. + """ + return self.WORDS[word] / float(self.N) + + def correct(self, word: str) -> str: + """Recommend spelling correction for a word. + + Args: + word: Word to correct. + + Returns: + Most likely correction, or the original word if no better option. + """ + return max(self.candidates(word), key=self.P) + + def known(self, words: list[str]) -> set[str]: + """Filter words found in the training vocabulary. + + Args: + words: List of words to check. + + Returns: + Subset of words that appear in the training corpus. + """ + return set(w for w in words if w in self.WORDS) + + def candidates(self, word: str) -> Generator[str, None, None]: + """Generate spelling correction candidates. + + Checks exact match, then edits of distance 1 and 2. + + Args: + word: Word to find candidates for. + + Yields: + Viable correction candidates. 
+ """ + return (self.known([word]) or self.known(compute_set_edits1(word)) or self.known(compute_set_edits2(word)) or [word]) + diff --git a/shorttext/stack/__init__.py b/src/shorttext/stack/__init__.py similarity index 100% rename from shorttext/stack/__init__.py rename to src/shorttext/stack/__init__.py diff --git a/src/shorttext/stack/stacking.py b/src/shorttext/stack/stacking.py new file mode 100644 index 00000000..2a5c256b --- /dev/null +++ b/src/shorttext/stack/stacking.py @@ -0,0 +1,295 @@ + +import pickle +from abc import ABC, abstractmethod +from typing import Optional, Annotated, Generator, Literal + +import numpy as np +import numpy.typing as npt +from tensorflow.keras.layers import Dense, Reshape +from tensorflow.keras.models import Sequential +from tensorflow.keras.regularizers import l2 + +from ..utils.classification_exceptions import ModelNotTrainedException +from ..utils import kerasmodel_io as kerasio +from ..utils.compactmodel_io import CompactIOMachine +from ..classifiers.base import AbstractScorer + + +# abstract class +class StackedGeneralization(ABC): + """Abstract base class for stacked generalization. + + An intermediate model that takes output from other classifiers as input + features and performs another level of classification. + + The classifiers must have the :meth:`~score` method that takes a string as input. + + Reference: + David H. Wolpert, "Stacked Generalization," Neural Netw 5: 241-259 (1992). + + M. Paz Sesmero et al., "Generating ensembles of heterogeneous classifiers + using Stacked Generalization," WIREs Data Mining and Knowledge Discovery 5: 21-34 (2015). + """ + + def __init__( + self, + intermediate_classifiers: Optional[dict[str, AbstractScorer]] = None + ): + """Initialize the stacking class. + + Args: + intermediate_classifiers: Dictionary mapping names to classifier instances. + """ + self.classifiers = intermediate_classifiers if intermediate_classifiers is not None else {} + self.classlabels = [] + self.trained = False + + def register_classifiers(self) -> None: + """Register the intermediate classifiers. + + Must be called before training. + """ + self.classifier2idx = {} + self.idx2classifier = {} + for idx, key in enumerate(self.classifiers.keys()): + self.classifier2idx[key] = idx + self.idx2classifier[idx] = key + + def register_classlabels(self, labels: list[str]) -> None: + """Register output labels. + + Args: + labels: List of output class labels. + + Must be called before training. + """ + self.classlabels = labels + self.labels2idx = {classlabel: idx for idx, classlabel in enumerate(self.classlabels)} + + def add_classifier(self, name: str, classifier: AbstractScorer) -> None: + """Add a classifier to the stack. + + Args: + name: Name for the classifier (no spaces or special characters). + classifier: Classifier instance with a :meth:`~score` method. + """ + self.classifiers[name] = classifier + self.register_classifiers() + + def delete_classifier(self, name: str) -> None: + """Delete a classifier from the stack. + + Args: + name: Name of the classifier to delete. + + Raises: + KeyError: If classifier name not found. + """ + del self.classifiers[name] + self.register_classifiers() + + def translate_shorttext_intfeature_matrix( + self, + shorttext: str + ) -> Annotated[npt.NDArray[np.float64], "2D Array"]: + """Convert short text to feature matrix for stacking. + + Args: + shorttext: Input text. + + Returns: + Feature matrix of shape (n_classifiers, n_labels). 
+ """ + feature_matrix = np.zeros((len(self.classifier2idx), len(self.labels2idx))) + for key, idx in self.classifier2idx.items(): + classifier = self.classifiers[key] + scoredict = classifier.score(shorttext) + for label in scoredict: + feature_matrix[idx, self.labels2idx[label]] = scoredict[label] + return feature_matrix + + def convert_label_to_buckets( + self, + label: str + ) -> Annotated[npt.NDArray[np.int64], "1D Array"]: + """Convert label to one-hot bucket representation. + + Args: + label: Class label. + + Returns: + One-hot array with 1 at the label's position. + """ + buckets = np.zeros(len(self.labels2idx), dtype=np.int64) + buckets[self.labels2idx[label]] = 1 + return buckets + + def convert_traindata_matrix( + self, + classdict: dict[str, list[str]], + tobucket: bool = True + ) -> Generator[tuple[Annotated[npt.NDArray[np.float64], "2D Array"], Annotated[npt.NDArray[np.int64], "1D Array"]], None, None]: + """Yield training data matrices. + + Args: + classdict: Training data dictionary. + tobucket: Whether to convert labels to buckets. Default: True. + + Yields: + Tuples of (feature_matrix, label_array). + """ + for label, texts in classdict.items(): + y = self.convert_label_to_buckets(label) if tobucket else self.labels2idx[label] + for shorttext in texts: + x = self.translate_shorttext_intfeature_matrix(shorttext) + yield x, y + + @abstractmethod + def train(self, classdict: dict[str, list[str]], *args, **kwargs) -> None: + """Train the stacked generalization model. + + Args: + classdict: Training data. + *args: Additional arguments. + **kwargs: Additional keyword arguments. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + @abstractmethod + def score(self, shorttext: str, *args, **kwargs) -> dict[str, float]: + """Calculate classification scores for all labels. + + Args: + shorttext: Input text. + *args: Additional arguments. + **kwargs: Additional keyword arguments. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + NotImplementedError: Abstract method. + """ + raise NotImplemented() + + +class LogisticStackedGeneralization(StackedGeneralization, CompactIOMachine): + """Stacked generalization using logistic regression. + + Uses neural network with sigmoid output to combine predictions from + intermediate classifiers. + + Note: + Saves the stacked model but not the intermediate classifiers. + """ + + def __init__( + self, + intermediate_classifiers: Optional[dict[str, AbstractScorer]] = None, + ): + CompactIOMachine.__init__(self, + {'classifier': 'stacked_logistics'}, + 'stacked_logistics', + ['_stackedlogistics.pkl', '_stackedlogistics.weights.h5', '_stackedlogistics.json']) + StackedGeneralization.__init__(self, intermediate_classifiers=intermediate_classifiers) + + def train( + self, + classdict: dict[str, list[str]], + optimizer: Literal["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"] = "adam", + l2reg: float = 0.01, + bias_l2reg: float = 0.01, + nb_epoch: int = 1000 + ) -> None: + """Train the stacked generalization model. + + Args: + classdict: Training data. + optimizer: Optimizer for training. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. Default: adam. + l2reg: L2 regularization coefficient. Default: 0.01. + bias_l2reg: L2 regularization for bias. Default: 0.01. + nb_epoch: Number of training epochs. Default: 1000. 
+ """ + + # register + self.register_classifiers() + self.register_classlabels(sorted(classdict.keys())) # sorted the keys + + kmodel = Sequential() + kmodel.add(Reshape((len(self.classifier2idx) * len(self.labels2idx),), + input_shape=(len(self.classifier2idx), len(self.labels2idx)))) + kmodel.add(Dense(units=len(classdict), + activation='sigmoid', + kernel_regularizer=l2(l2reg), + bias_regularizer=l2(bias_l2reg)) + ) + kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer) + + Xy = [(xone, yone) for xone, yone in self.convert_traindata_matrix(classdict, tobucket=True)] + X = np.array([item[0] for item in Xy]) + y = np.array([item[1] for item in Xy]) + + kmodel.fit(X, y, epochs=nb_epoch) + + self.model = kmodel + self.trained = True + + def score(self, shorttext: str) -> dict[str, float]: + """Calculate classification scores for all labels. + + Args: + shorttext: Input text. + + Returns: + Dictionary mapping class labels to scores. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + input_matrix = self.translate_shorttext_intfeature_matrix(shorttext) + prediction = self.model.predict(np.array([input_matrix])) + + scoredict = {label: prediction[0][idx] for idx, label in enumerate(self.classlabels)} + + return scoredict + + def savemodel(self, nameprefix: str) -> None: + """Save the stacked model to files. + + Note: Intermediate classifiers are not saved. Save them separately. + + Args: + nameprefix: Prefix for output files. + + Raises: + ModelNotTrainedException: If model not trained. + """ + if not self.trained: + raise ModelNotTrainedException() + + stackedmodeldict = {'classifiers': self.classifier2idx, + 'classlabels': self.classlabels} + pickle.dump(stackedmodeldict, open(nameprefix+'_stackedlogistics.pkl', 'wb')) + kerasio.save_model(nameprefix+'_stackedlogistics', self.model) + + def loadmodel(self, nameprefix: str) -> None: + """Load the stacked model from files. + + Note: Intermediate classifiers are not loaded. Load them separately. + + Args: + nameprefix: Prefix for input files. + """ + stackedmodeldict = pickle.load(open(nameprefix+'_stackedlogistics.pkl', 'rb')) + self.register_classlabels(stackedmodeldict['classlabels']) + self.classifier2idx = stackedmodeldict['classifiers'] + self.idx2classifier = {val: key for key, val in self.classifier2idx.items()} + self.model = kerasio.load_model(nameprefix+'_stackedlogistics') + + self.trained = True diff --git a/src/shorttext/utils/__init__.py b/src/shorttext/utils/__init__.py new file mode 100644 index 00000000..c73208a7 --- /dev/null +++ b/src/shorttext/utils/__init__.py @@ -0,0 +1,17 @@ + +from . import misc +from . import kerasmodel_io +from . import classification_exceptions +from . import gensim_corpora +from . import textpreprocessing +from . import compactmodel_io +from . 
import dtm
+
+from .textpreprocessing import tokenize, stemword
+from .textpreprocessing import text_preprocessor, standard_text_preprocessor_1, standard_text_preprocessor_2
+
+from .wordembed import load_word2vec_model, load_fasttext_model, load_poincare_model, shorttext_to_avgvec
+from .wordembed import RESTfulKeyedVectors
+from .dtm import NumpyDocumentTermMatrix
+
+
diff --git a/src/shorttext/utils/classification_exceptions.py b/src/shorttext/utils/classification_exceptions.py
new file mode 100644
index 00000000..44a82010
--- /dev/null
+++ b/src/shorttext/utils/classification_exceptions.py
@@ -0,0 +1,49 @@
+
+from os import PathLike
+from pathlib import Path
+
+from deprecation import deprecated
+import numpy as np
+
+
+class ModelNotTrainedException(Exception):
+    """Exception raised when attempting to use an untrained model."""
+    def __init__(self):
+        self.message = 'Model not trained.'
+        super().__init__(self.message)
+
+
+class AlgorithmNotExistException(Exception):
+    """Exception raised when a requested algorithm is not available."""
+    def __init__(self, algoname: str):
+        self.message = f"Algorithm {algoname} does not exist."
+        super().__init__(self.message)
+
+
+class WordEmbeddingModelNotExistException(Exception):
+    """Exception raised when the word embedding model file is not found."""
+    def __init__(self, path: str | PathLike):
+        self.message = f"Given path of the word-embedding model does not exist: {path.as_posix() if isinstance(path, Path) else path}"
+        super().__init__(self.message)
+
+
+class UnequalArrayLengthsException(Exception):
+    """Exception raised when two arrays have unequal lengths."""
+    def __init__(self, arr1: np.ndarray | list, arr2: np.ndarray | list):
+        self.message = f"Unequal lengths: {len(arr1)} and {len(arr2)}"
+        super().__init__(self.message)
+
+
+@deprecated(deprecated_in="4.0.0", removed_in="5.0.0")
+class NotImplementedException(Exception):
+    """Exception raised when a method is not implemented."""
+    def __init__(self):
+        self.message = 'Method not implemented.'
+        super().__init__(self.message)
+
+
+class IncorrectClassificationModelFileException(Exception):
+    """Exception raised when model file doesn't match expected type."""
+    def __init__(self, expectedname: str, actualname: str):
+        self.message = f"Incorrect model (expected: {expectedname} ; actual: {actualname})"
+        super().__init__(self.message)
+
+
+class OperationNotDefinedException(Exception):
+    """Exception raised when an operation is not defined."""
+    def __init__(self, opname: str):
+        self.message = f"Operation {opname} not defined"
+        super().__init__(self.message)
diff --git a/src/shorttext/utils/compactmodel_io.py b/src/shorttext/utils/compactmodel_io.py
new file mode 100644
index 00000000..346c9030
--- /dev/null
+++ b/src/shorttext/utils/compactmodel_io.py
@@ -0,0 +1,206 @@
+"""
+This module contains general routines to zip all model files into one compact file.
+The model can be copied or transferred easily.
+
+The methods and decorators in this module are called by other codes. It is not
+recommended for developers to call them directly.
+"""
+
+from abc import ABC, abstractmethod
+from tempfile import mkdtemp
+import zipfile
+import json
+import os
+from os import PathLike
+from typing import Any, Self
+
+import orjson
+
+from . import classification_exceptions as e
+
+
+def removedir(dir: str) -> None:
+    """Remove all subdirectories and files under the specified path.
+
+    Args:
+        dir: Path of the directory to clean.
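+
+    Example (cleaning up a temporary directory created by mkdtemp):
+
+        tempdir = mkdtemp()
+        ...  # write model files into tempdir
+        removedir(tempdir)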
+    """
+    for filename in os.listdir(dir):
+        fullpath = os.path.join(dir, filename)
+        if os.path.isdir(fullpath):
+            removedir(fullpath)
+        else:
+            os.remove(fullpath)
+    os.rmdir(dir)
+
+
+def save_compact_model(
+        filename: str,
+        savefunc: callable,
+        prefix: str,
+        suffices: list[str],
+        infodict: dict[str, Any]
+) -> None:
+    """Save the model in one compact file by zipping all related files.
+
+    Args:
+        filename: Name of the output model file.
+        savefunc: Function that performs the saving action. Takes one argument (str) - the prefix.
+        prefix: Prefix of the names of the files related to the model.
+        suffices: List of file suffixes.
+        infodict: Dictionary with model information. Must contain the key 'classifier'.
+    """
+    # save the model files into a temporary directory
+    tempdir = mkdtemp()
+    savefunc(os.path.join(tempdir, prefix))
+
+    # zipping
+    with zipfile.ZipFile(filename, mode='w', allowZip64=True) as outputfile:
+        for suffix in suffices:
+            outputfile.write(os.path.join(tempdir, prefix+suffix), prefix+suffix)
+        outputfile.writestr('modelconfig.json', json.dumps(infodict))
+
+    # delete temporary files
+    removedir(tempdir)
+
+
+def load_compact_model(
+        filename: str,
+        loadfunc: callable,
+        prefix: str,
+        infodict: dict[str, Any]
+) -> Any:
+    """Load a model from a compact file.
+
+    Args:
+        filename: Name of the model file.
+        loadfunc: Function that performs the loading action. Takes one argument (str) - the prefix.
+        prefix: Prefix of the names of the files.
+        infodict: Dictionary with model information. Must contain the key 'classifier'.
+
+    Returns:
+        The loaded model instance.
+
+    Raises:
+        IncorrectClassificationModelFileException: If the classifier type in the file does not match.
+    """
+    # create temporary directory
+    tempdir = mkdtemp()
+
+    # unzipping
+    with zipfile.ZipFile(filename, mode='r') as inputfile:
+        inputfile.extractall(tempdir)
+
+    # check model config
+    with open(os.path.join(tempdir, 'modelconfig.json'), 'r') as configfile:
+        readinfodict = json.load(configfile)
+    if readinfodict['classifier'] != infodict['classifier']:
+        raise e.IncorrectClassificationModelFileException(
+            infodict['classifier'],
+            readinfodict['classifier']
+        )
+
+    # load the model
+    returnobj = loadfunc(os.path.join(tempdir, prefix))
+
+    # delete temporary files
+    removedir(tempdir)
+
+    return returnobj
+
+
+class CompactIOMachine(ABC):
+    """Base class that implements compact model I/O.
+
+    Replaces the original compactio decorator.
+    """
+
+    def __init__(
+            self,
+            infodict: dict[str, Any],
+            prefix: str,
+            suffices: list[str]
+    ):
+        """Initialize the compact I/O machine.
+
+        Args:
+            infodict: Dictionary with model information. Must contain 'classifier'.
+            prefix: Prefix for model file names.
+            suffices: List of file suffixes for the model files.
+        """
+        self.infodict = infodict
+        self.prefix = prefix
+        self.suffices = suffices
+
+    @abstractmethod
+    def savemodel(self, nameprefix: str) -> None:
+        """Save the model to files.
+
+        Args:
+            nameprefix: Prefix for model file paths.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def loadmodel(self, nameprefix: str) -> Self:
+        """Load the model from files.
+
+        Args:
+            nameprefix: Prefix for model file paths.
+        """
+        raise NotImplementedError()
+
+    def save_compact_model(self, filename: str, *args, **kwargs) -> None:
+        """Save the model in a compressed binary format.
+
+        Args:
+            filename: Name of the model file.
+            *args: Additional arguments.
+            **kwargs: Additional keyword arguments.
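+
+        Example (a hypothetical sketch, assuming ``clf`` is a trained model
+        that inherits from CompactIOMachine):
+
+            >>> clf.save_compact_model('mymodel.bin')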
+        """
+        save_compact_model(filename, self.savemodel, self.prefix, self.suffices, self.infodict, *args, **kwargs)
+
+    def load_compact_model(self, filename: str, *args, **kwargs) -> Self:
+        """Load the model from a compressed binary format.
+
+        Args:
+            filename: Name of the model file.
+            *args: Additional arguments.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            The loaded model instance.
+        """
+        return load_compact_model(filename, self.loadmodel, self.prefix, self.infodict, *args, **kwargs)
+
+    def get_info(self) -> dict[str, Any]:
+        """Get model metadata.
+
+        Returns:
+            Dictionary with classifier, prefix, and suffices.
+        """
+        return {'classifier': self.infodict['classifier'],
+                'prefix': self.prefix,
+                'suffices': self.suffices}
+
+
+def get_model_config_field(filename: str | PathLike, parameter: str) -> str:
+    """Get a configuration parameter from a compact model file.
+
+    Args:
+        filename: Path to the model file.
+        parameter: Parameter name to retrieve.
+
+    Returns:
+        The parameter value.
+    """
+    with zipfile.ZipFile(filename, mode='r') as inputfile:
+        readinfodict = json.load(inputfile.open("modelconfig.json", "r"))
+    return readinfodict[parameter]
+
+
+def get_model_classifier_name(filename: str | PathLike) -> str:
+    """Get the classifier name from a compact model file.
+
+    Args:
+        filename: Path to the model file.
+
+    Returns:
+        The classifier name.
+    """
+    return get_model_config_field(filename, 'classifier')
diff --git a/src/shorttext/utils/compute.py b/src/shorttext/utils/compute.py
new file mode 100644
index 00000000..1365cd14
--- /dev/null
+++ b/src/shorttext/utils/compute.py
@@ -0,0 +1,23 @@
+
+from typing import Annotated
+
+import numpy as np
+import numpy.typing as npt
+import numba as nb
+
+
+@nb.njit(nb.float64(nb.float64[::1], nb.float64[::1]))
+def cosine_similarity(
+        vec1: Annotated[npt.NDArray[np.float64], "1D array"],
+        vec2: Annotated[npt.NDArray[np.float64], "1D array"]
+) -> float:
+    """Compute cosine similarity between two vectors.
+
+    Args:
+        vec1: First vector.
+        vec2: Second vector.
+
+    Returns:
+        Cosine similarity score between -1 and 1.
+    """
+    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
diff --git a/src/shorttext/utils/dtm.py b/src/shorttext/utils/dtm.py
new file mode 100644
index 00000000..4e5162ea
--- /dev/null
+++ b/src/shorttext/utils/dtm.py
@@ -0,0 +1,389 @@
+
+from collections import Counter
+from os import PathLike
+from typing import Optional, Any, Self, Annotated
+
+import numpy as np
+import numpy.typing as npt
+import npdict
+import sparse
+
+from .classification_exceptions import UnequalArrayLengthsException
+from .compactmodel_io import CompactIOMachine
+from .textpreprocessing import advanced_text_tokenizer_1
+
+npdtm_suffices = ["_npdict.npy"]
+
+
+def _construct_sparse_coo_dtm_matrix(
+        sorted_token_list: list[str],
+        tokens_counters: list[list[tuple[str, int]]]
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.float64]]:
+    """Construct a sparse COO matrix for a document-term matrix.
+
+    Args:
+        sorted_token_list: Sorted list of tokens.
+        tokens_counters: List of (token, count) pairs for each document.
+
+    Returns:
+        Tuple of (x_coords, y_coords, data) for the sparse COO matrix.
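+
+    Example (a hypothetical toy input):
+
+        >>> xs, ys, vals = _construct_sparse_coo_dtm_matrix(
+        ...     ['apple', 'banana'],
+        ...     [[('apple', 2)], [('apple', 1), ('banana', 3)]])
+        >>> # doc 0 has 'apple' twice; doc 1 has 'apple' once and 'banana' three times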
+    """
+    token_index_map = {token: idx for idx, token in enumerate(sorted_token_list)}
+    ids_counters = [
+        {token_index_map[token]: counts for token, counts in counter}
+        for counter in tokens_counters
+    ]
+    docs_nbtokens = [len(counter) for counter in ids_counters]
+    nb_coo_data = sum(docs_nbtokens)
+    coordx_array = np.empty(nb_coo_data, dtype=np.int64)
+    coordy_array = np.empty(nb_coo_data, dtype=np.int64)
+    val_array = np.empty(nb_coo_data)
+
+    i = 0
+    for doc_id, counter in enumerate(ids_counters):
+        for tokenid, counts in counter.items():
+            coordx_array[i] = doc_id
+            coordy_array[i] = tokenid
+            val_array[i] = counts
+            i += 1
+
+    return coordx_array, coordy_array, val_array
+
+
+def generate_npdict_document_term_matrix(
+        corpus: list[str],
+        doc_ids: list[Any],
+        tokenize_func: callable
+) -> npdict.NumpyNDArrayWrappedDict:
+    """Generate a document-term matrix as a numpy dict.
+
+    Args:
+        corpus: List of documents.
+        doc_ids: List of document IDs.
+        tokenize_func: Tokenization function.
+
+    Returns:
+        NumpyNDArrayWrappedDict containing the document-term matrix.
+
+    Raises:
+        UnequalArrayLengthsException: If corpus and doc_ids have different lengths.
+    """
+    if len(corpus) != len(doc_ids):
+        raise UnequalArrayLengthsException(corpus, doc_ids)
+
+    # grab the tokens from each document in the corpus
+    doc_tokens = [tokenize_func(document) for document in corpus]
+    sorted_tokens_list = sorted({
+        token
+        for document in doc_tokens
+        for token in document
+    })
+    tokens_counters_tuples = [list(Counter(tokens).items()) for tokens in doc_tokens]
+    coord_x, coord_y, data = _construct_sparse_coo_dtm_matrix(
+        sorted_tokens_list, tokens_counters_tuples
+    )
+    npdtm = npdict.SparseArrayWrappedDict.from_sparsearray_given_keywords(
+        [doc_ids, sorted_tokens_list],
+        sparse.COO([coord_x, coord_y], data=data, shape=(len(doc_tokens), len(sorted_tokens_list)))
+    )
+    return npdtm
+
+
+def convert_classdict_to_corpus(
+        classdict: dict[str, list[str]],
+        preprocess_func: callable
+) -> tuple[list[str], list[str]]:
+    """Convert a class dictionary to a corpus and document IDs.
+
+    Args:
+        classdict: Training data with class labels as keys and texts as values.
+        preprocess_func: Text preprocessing function.
+
+    Returns:
+        Tuple of (corpus, doc_ids).
+    """
+    corpus = [
+        preprocess_func(datum)
+        for doc_under_class in classdict.values()
+        for datum in doc_under_class
+    ]
+    docids = [
+        f"{label}-{i}"
+        for label, doc_under_class in classdict.items()
+        for i in range(len(doc_under_class))
+    ]
+    return corpus, docids
+
+
+def convert_classdict_to_xy(
+        classdict: dict[str, list[str]],
+        labels2idx: dict[str, int],
+        preprocess_func: callable,
+        tokenize_func: callable
+) -> tuple[npdict.NumpyNDArrayWrappedDict, Annotated[sparse.SparseArray, "2D Array"]]:
+    """Convert a class dictionary to a feature matrix and labels.
+
+    Args:
+        classdict: Training data.
+        labels2idx: Mapping from labels to indices.
+        preprocess_func: Text preprocessing function.
+        tokenize_func: Tokenization function.
+
+    Returns:
+        Tuple of (document-term matrix, label matrix).
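+
+    Example (a hypothetical sketch with toy training data):
+
+        >>> classdict = {'greeting': ['hello there', 'hi'], 'farewell': ['bye now']}
+        >>> X, y = convert_classdict_to_xy(
+        ...     classdict, {'farewell': 0, 'greeting': 1},
+        ...     preprocess_func=str.lower, tokenize_func=str.split)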
+ """ + nbdata = sum(len(data) for data in classdict.values()) + nblabels = len(labels2idx) + + # making x + corpus, docids = convert_classdict_to_corpus(classdict, preprocess_func=preprocess_func) + dtm_npdict_matrix = generate_npdict_document_term_matrix(corpus, docids, tokenize_func) + + # making y + y = sparse.COO( + [ + list(range(nbdata)), + [ + labels2idx[label] + for label, doc_under_class in classdict.items() + for _ in doc_under_class + ] + ], + [1.]*nbdata, + shape=(nbdata, nblabels) + ) + + return dtm_npdict_matrix, y + + +def compute_document_frequency( + npdtm: npdict.NumpyNDArrayWrappedDict +) -> npt.NDArray[np.int32]: + """Compute document frequency for each token. + + Args: + npdtm: Document-term matrix. + + Returns: + Array of document frequencies for each token. + """ + if isinstance(npdtm, npdict.SparseArrayWrappedDict): + return np.sum(npdtm.to_coo() > 0, axis=0).todense() + else: + return np.sum(npdtm.to_numpy() > 0, axis=0) + + +def compute_tfidf_document_term_matrix( + npdtm: npdict.NumpyNDArrayWrappedDict, + sparse: bool=True +) -> npdict.NumpyNDArrayWrappedDict: + """Compute TF-IDF weighted document-term matrix. + + Args: + npdtm: Document-term matrix. + sparse: Whether to return sparse format. Default: True. + + Returns: + TF-IDF weighted document-term matrix. + """ + doc_frequencies = compute_document_frequency(npdtm) + nbdocs = npdtm.dimension_sizes[0] + if isinstance(npdtm, npdict.SparseArrayWrappedDict): + new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies) + return npdtm.generate_dict(new_dtm_sparray, dense=not sparse) + + new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies) + new_npdtm = npdtm.generate_dict(new_dtm_nparray) + if sparse: + return npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict( + new_npdtm, default_initial_value=0.0 + ) + else: + return new_npdtm + + +class NumpyDocumentTermMatrix(CompactIOMachine): + """Document-term matrix using numpy dict. + + Provides an interface for working with document-term matrices + with compact model I/O support. + """ + + def __init__( + self, + corpus: Optional[list[str]]=None, + docids: Optional[list[Any]]=None, + tfidf: bool=False, + tokenize_func: Optional[callable]=None + ): + """Initialize the document-term matrix. + + Args: + corpus: List of documents. + docids: List of document IDs. + tfidf: Whether to apply TF-IDF weighting. Default: False. + tokenize_func: Tokenization function. Default: advanced_text_tokenizer_1. + """ + super().__init__({'classifier': 'npdtm'}, 'npdtm', npdtm_suffices) + self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1() + + # generate DTM + if corpus is not None: + self.generate_dtm(corpus, docids=docids, tfidf=tfidf) + + def generate_dtm( + self, + corpus: list[str], + docids: Optional[list[Any]]=None, + tfidf: bool=False + ) -> None: + """Generate document-term matrix from corpus. + + Args: + corpus: List of documents. + docids: List of document IDs. + tfidf: Whether to apply TF-IDF weighting. Default: False. 
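+
+        Example (a hypothetical toy corpus):
+
+            >>> dtm = NumpyDocumentTermMatrix()
+            >>> dtm.generate_dtm(['the cat sat', 'the dog barked'], tfidf=True)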
+        """
+        # wrangle the document IDs, making sure they match the corpus in length
+        if docids is None:
+            doc_ids = [f"doc{i}" for i in range(len(corpus))]
+        else:
+            if len(docids) == len(corpus):
+                doc_ids = docids
+            elif len(docids) > len(corpus):
+                doc_ids = docids[:len(corpus)]
+            else:
+                doc_ids = docids + [f"doc{i}" for i in range(len(docids), len(corpus))]
+
+        self.npdtm = generate_npdict_document_term_matrix(corpus, doc_ids, self.tokenize_func)
+
+        if tfidf:
+            self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True)
+
+    def get_termfreq(self, docid: str, token: str) -> float:
+        """Get the term frequency for a document and token.
+
+        Args:
+            docid: Document ID.
+            token: Token.
+
+        Returns:
+            Term frequency.
+        """
+        return self.npdtm[docid, token]
+
+    def get_total_termfreq(self, token: str) -> float:
+        """Get the total frequency of a token across all documents.
+
+        Args:
+            token: Token.
+
+        Returns:
+            Total term frequency.
+        """
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
+            matrix = self.npdtm.to_coo()
+        else:
+            matrix = self.npdtm.to_numpy()
+        return np.sum(matrix[:, token_index])
+
+    def get_doc_frequency(self, token: str) -> int:
+        """Get the document frequency of a token.
+
+        Args:
+            token: Token.
+
+        Returns:
+            Number of documents containing the token.
+        """
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
+            freq_array = self.npdtm.to_coo()[:, token_index]
+        else:
+            freq_array = self.npdtm.to_numpy()[:, token_index]
+        return np.sum(freq_array > 0, axis=0)
+
+    def get_token_occurences(self, token: str) -> dict[str, float]:
+        """Get the occurrences of a token across all documents.
+
+        Args:
+            token: Token.
+
+        Returns:
+            Dictionary mapping document IDs to term frequencies.
+        """
+        return {
+            docid: self.npdtm[docid, token]
+            for docid in self.npdtm._lists_keystrings[0]
+        }
+
+    def get_doc_tokens(self, docid: str) -> dict[str, float]:
+        """Get the tokens of a specific document.
+
+        Args:
+            docid: Document ID.
+
+        Returns:
+            Dictionary mapping tokens to frequencies.
+        """
+        return {
+            token: self.npdtm[docid, token]
+            for token in self.npdtm._lists_keystrings[1]
+        }
+
+    def savemodel(self, nameprefix: str) -> None:
+        """Save the document-term matrix.
+
+        Args:
+            nameprefix: Prefix for the output file.
+        """
+        self.npdtm.save(nameprefix+"_npdict.npy")
+
+    def loadmodel(self, nameprefix: str) -> Self:
+        """Load the document-term matrix.
+
+        Args:
+            nameprefix: Prefix for the input file.
+
+        Returns:
+            This instance, with the matrix loaded.
+        """
+        self.npdtm = npdict.SparseArrayWrappedDict.load(nameprefix+"_npdict.npy")
+        return self
+
+    @property
+    def docids(self) -> list[str]:
+        """List of document IDs."""
+        return self.npdtm._lists_keystrings[0]
+
+    @property
+    def tokens(self) -> list[str]:
+        """List of tokens."""
+        return self.npdtm._lists_keystrings[1]
+
+    @property
+    def nbdocs(self) -> int:
+        """Number of documents."""
+        return len(self.docids)
+
+    @property
+    def nbtokens(self) -> int:
+        """Number of unique tokens."""
+        return len(self.tokens)
+
+
+def load_numpy_documentmatrixmatrix(filepath: str | PathLike) -> NumpyDocumentTermMatrix:
+    """Load a document-term matrix from a compact file.
+
+    Args:
+        filepath: Path to the compact model file.
+
+    Returns:
+        NumpyDocumentTermMatrix instance.
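+
+    Example (a hypothetical file name):
+
+        >>> npdtm = load_numpy_documentmatrixmatrix('dtm_model.bin')
+        >>> npdtm.nbdocs, npdtm.nbtokens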
+ """ + npdtm = NumpyDocumentTermMatrix() + npdtm.load_compact_model(filepath) + return npdtm + diff --git a/src/shorttext/utils/gensim_corpora.py b/src/shorttext/utils/gensim_corpora.py new file mode 100644 index 00000000..9d09471e --- /dev/null +++ b/src/shorttext/utils/gensim_corpora.py @@ -0,0 +1,110 @@ + +from collections import Counter +from typing import Optional + +import gensim +from deprecation import deprecated + +from .textpreprocessing import tokenize + + +def generate_gensim_corpora( + classdict: dict[str, list[str]], + preprocess_and_tokenize: Optional[callable] = None +) -> tuple[gensim.corpora.Dictionary, list[list[tuple[int, int]]], list[str]]: + """Generate gensim dictionary and corpus from training data. + + Args: + classdict: Training data with class labels as keys and lists of texts as values. + preprocess_and_tokenize: Function to preprocess and tokenize text. Default: tokenize. + + Returns: + Tuple of (dictionary, corpus, class_labels). + """ + if preprocess_and_tokenize is None: + preprocess_and_tokenize = tokenize + + classlabels = sorted(classdict.keys()) + doc = [preprocess_and_tokenize(' '.join(classdict[classlabel])) for classlabel in classlabels] + dictionary = gensim.corpora.Dictionary(doc) + corpus = [dictionary.doc2bow(doctokens) for doctokens in doc] + return dictionary, corpus, classlabels + + +@deprecated(deprecated_in="5.0.0", removed_in="6.0.0") +def save_corpus( + dictionary: gensim.corpora.Dictionary, + corpus: list[list[tuple[int, int]]], + prefix: str +) -> None: + """Save gensim corpus and dictionary to files. + + Args: + dictionary: Dictionary to save. + corpus: Corpus to save. + prefix: Prefix for output files. + + Note: + Deprecated since 5.0.0, will be removed in 6.0.0. + """ + dictionary.save(prefix+'_dictionary.dict') + gensim.corpora.MmCorpus.serialize(prefix+'_corpus.mm', corpus) + + +@deprecated(deprecated_in="5.0.0", removed_in="6.0.0") +def load_corpus(prefix: str) -> tuple[gensim.corpora.MmCorpus, gensim.corpora.Dictionary]: + """Load gensim corpus and dictionary from files. + + Args: + prefix: Prefix of files to load. + + Returns: + Tuple of (corpus, dictionary). + + Note: + Deprecated since 5.0.0, will be removed in 6.0.0. + """ + corpus = gensim.corpora.MmCorpus(prefix+'_corpus.mm') + dictionary = gensim.corpora.Dictionary.load(prefix+'_dictionary.dict') + return corpus, dictionary + + +def update_corpus_labels( + dictionary: gensim.corpora.Dictionary, + corpus: list[list[tuple[int, int]]], + newclassdict: dict[str, list[str]], + preprocess_and_tokenize: Optional[callable] = None +) -> tuple[list[list[tuple[int, int]]], list[list[tuple[int, int]]]]: + """Update corpus with additional training data. + + Args: + dictionary: Existing dictionary. + corpus: Existing corpus. + newclassdict: Additional training data. + preprocess_and_tokenize: Function to preprocess text. Default: tokenize. + + Returns: + Tuple of (updated_corpus, new_corpus). + """ + if preprocess_and_tokenize is None: + preprocess_and_tokenize = tokenize + + newdoc = [preprocess_and_tokenize(' '.join(newclassdict[classlabel])) for classlabel in sorted(newclassdict.keys())] + newcorpus = [dictionary.doc2bow(doctokens) for doctokens in newdoc] + corpus += newcorpus + + return corpus, newcorpus + + +def tokens_to_fracdict(tokens: list[str]) -> dict[str, float]: + """Convert tokens to normalized frequency dictionary. + + Args: + tokens: List of tokens. + + Returns: + Dictionary with tokens as keys and normalized frequencies as values. 
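+
+    Example:
+
+        >>> tokens_to_fracdict(['to', 'be', 'or', 'not', 'to', 'be'])
+        {'to': 0.3333333333333333, 'be': 0.3333333333333333, 'or': 0.16666666666666666, 'not': 0.16666666666666666}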
+    """
+    cntdict = Counter(tokens)
+    totalcnt = sum(cntdict.values())
+    return {token: cnt / totalcnt for token, cnt in cntdict.items()}
diff --git a/src/shorttext/utils/kerasmodel_io.py b/src/shorttext/utils/kerasmodel_io.py
new file mode 100644
index 00000000..cb523d4e
--- /dev/null
+++ b/src/shorttext/utils/kerasmodel_io.py
@@ -0,0 +1,28 @@
+import tensorflow
+from tensorflow.keras.models import model_from_json
+
+
+def save_model(nameprefix: str, model: tensorflow.keras.models.Model) -> None:
+    """Save a Keras model to files.
+
+    Args:
+        nameprefix: Prefix for output files.
+        model: Keras model to save.
+    """
+    model_json = model.to_json()
+    with open(nameprefix+'.json', 'w') as jsonfile:
+        jsonfile.write(model_json)
+    model.save_weights(nameprefix+'.weights.h5')
+
+
+def load_model(nameprefix: str) -> tensorflow.keras.models.Model:
+    """Load a Keras model from files.
+
+    Args:
+        nameprefix: Prefix for input files.
+
+    Returns:
+        Loaded Keras model.
+    """
+    with open(nameprefix+'.json', 'r') as jsonfile:
+        model = model_from_json(jsonfile.read())
+    model.load_weights(nameprefix+'.weights.h5')
+    return model
diff --git a/src/shorttext/utils/misc.py b/src/shorttext/utils/misc.py
new file mode 100644
index 00000000..ab95a82f
--- /dev/null
+++ b/src/shorttext/utils/misc.py
@@ -0,0 +1,48 @@
+
+from typing import Generator, Optional
+from io import TextIOWrapper
+
+
+def textfile_generator(
+        textfile: TextIOWrapper,
+        linebreak: bool = True,
+        encoding: Optional[str] = None
+) -> Generator[str, None, None]:
+    """Generator that yields lines from a text file.
+
+    Args:
+        textfile: File object to read lines from.
+        linebreak: Whether to append a line break to each yielded line. Default: True.
+        encoding: Encoding used to decode each line if the file is opened in binary mode. Default: None.
+
+    Yields:
+        Lines from the text file, stripped of surrounding whitespace.
+    """
+    for t in textfile:
+        if len(t) > 0:
+            if encoding is None:
+                yield t.strip() + ('\n' if linebreak else '')
+            else:
+                yield t.decode(encoding).strip() + ('\n' if linebreak else '')
+
+
+class SinglePoolExecutor:
+    """Wrapper for the Python map function.
+
+    Provides an interface similar to concurrent.futures.Executor.map,
+    but with a synchronous map implementation.
+    """
+
+    def map(self, func, *iterables):
+        """Apply a function to iterables element-wise.
+
+        Args:
+            func: Function to apply to each element.
+            iterables: One or more iterables to process.
+
+        Returns:
+            An iterator yielding the results.
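+
+        Example:
+
+            >>> executor = SinglePoolExecutor()
+            >>> list(executor.map(lambda x: x * x, [1, 2, 3]))
+            [1, 4, 9]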
+ """ + return map(func, *iterables) + return map(func, *iterables) diff --git a/src/shorttext/utils/nonneg_stopwords.txt b/src/shorttext/utils/nonneg_stopwords.txt new file mode 100644 index 00000000..8cd4eea2 --- /dev/null +++ b/src/shorttext/utils/nonneg_stopwords.txt @@ -0,0 +1,2778 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +only +own +same +so +than +too +very +s +t +can +will +just +don +should +now +d +ll +m +o +re +ve +y +ain +aren +couldn +didn +doesn +hadn +hasn +haven +isn +ma +mightn +mustn +needn +shan +shouldn +wasn +weren +won +wouldn +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi 
+mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unsere +unserem +unseren +unser +unseres +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. 
+ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando +ах +ох +эх +ай +эй +ой +тағы +тағыда +әрине +жоқ +сондай +осындай +осылай +солай +мұндай +бұндай +мен +сен +ол +біз +біздер +олар +сіз +сіздер +маған +оған +саған +біздің +сіздің +оның +бізге +сізге +оларға +біздерге +сіздерге +оларға +менімен +сенімен +онымен +бізбен +сізбен +олармен +біздермен +сіздермен +менің +сенің +біздің +сіздің +оның +біздердің +сіздердің +олардың +маған +саған +оған +менен +сенен +одан +бізден +сізден +олардан +біздерден +сіздерден +олардан +айтпақшы +сонымен +сондықтан +бұл +осы +сол +анау +мынау +сонау +осынау +ана +мына +сона +әні +міне +өй +үйт +бүйт +біреу +кейбіреу +кейбір +қайсыбір +әрбір +бірнеше +бірдеме +бірнеше +әркім +әрне +әрқайсы +әрқалай +әлдекім +әлдене +әлдеқайдан +әлденеше +әлдеқалай +әлдеқашан +алдақашан +еш +ешкім +ешбір +ештеме +дәнеңе +ешқашан +ешқандай +ешқайсы +емес +бәрі +барлық 
+барша +бар +күллі +бүкіл +түгел +өз +өзім +өзің +өзінің +өзіме +өзіне +өзімнің +өзі +өзге +менде +сенде +онда +менен +сенен онан +одан +ау +па +ей +әй +е +уа +уау +уай +я +пай +ә +о +оһо +ой +ие +аһа +ау +беу +мәссаған +бәрекелді +әттегенай +жаракімалла +масқарай +астапыралла +япырмай +ойпырмай +кәне +кәнеки +ал +әйда +кәні +міне +әні +сорап +қош-қош +пфша +пішә +құрау-құрау +шәйт +шек +моһ +тәк +құрау +құр +кә +кәһ +күшім +күшім +мышы +пырс +әукім +алақай +паһ-паһ +бәрекелді +ура +әттең +әттеген-ай +қап +түге +пішту +шіркін +алатау +пай-пай +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +арс +гүрс +дүрс +қорс +тарс +тырс +ырс +барқ +борт +күрт +кірт +морт +сарт +шырт +дүңк +күңк +қыңқ +мыңқ +маңқ +саңқ +шаңқ +шіңк +сыңқ +таңқ +тыңқ +ыңқ +болп +былп +жалп +желп +қолп +ірк +ырқ +сарт-сұрт +тарс-тұрс +арс-ұрс +жалт-жалт +жалт-жұлт +қалт-қалт +қалт-құлт +қаңқ-қаңқ +қаңқ-құңқ +шаңқ-шаңқ +шаңқ-шұңқ +арбаң-арбаң +бүгжең-бүгжең +арсалаң-арсалаң +ербелең-ербелең +батыр-бұтыр +далаң-далаң +тарбаң-тарбаң +қызараң-қызараң +қаңғыр-күңгір +қайқаң-құйқаң +митың-митың +салаң-сұлаң +ыржың-тыржың +бірақ +алайда +дегенмен +әйтпесе +әйткенмен +себебі +өйткені +сондықтан +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +арнайы +осындай +ғана +қана +тек +әншейін +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão 
+houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis 
+tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani diff --git a/src/shorttext/utils/stopwords.txt b/src/shorttext/utils/stopwords.txt new file mode 100644 index 00000000..4b21343d --- /dev/null +++ b/src/shorttext/utils/stopwords.txt @@ -0,0 +1,2781 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +no +nor +not +only +own +same +so +than +too +very +s +t +can +will +just +don +should +now +d +ll +m +o +re +ve +y +ain +aren +couldn +didn +doesn +hadn +hasn +haven +isn +ma +mightn +mustn +needn +shan +shouldn +wasn +weren +won +wouldn +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä 
+sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so 
+solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unsere +unserem +unseren +unser +unseres +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo 
+stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando +ах +ох +эх +ай +эй +ой +тағы +тағыда +әрине +жоқ +сондай +осындай +осылай +солай +мұндай +бұндай +мен +сен +ол +біз +біздер +олар +сіз +сіздер +маған +оған +саған +біздің +сіздің +оның +бізге +сізге +оларға +біздерге +сіздерге +оларға +менімен +сенімен +онымен +бізбен +сізбен +олармен +біздермен +сіздермен +менің +сенің +біздің +сіздің +оның +біздердің +сіздердің +олардың +маған +саған +оған +менен +сенен +одан +бізден +сізден +олардан +біздерден +сіздерден +олардан +айтпақшы +сонымен +сондықтан +бұл +осы +сол +анау +мынау +сонау +осынау +ана +мына +сона +әні +міне +өй +үйт +бүйт +біреу +кейбіреу +кейбір +қайсыбір +әрбір +бірнеше +бірдеме +бірнеше +әркім +әрне +әрқайсы +әрқалай +әлдекім +әлдене +әлдеқайдан +әлденеше +әлдеқалай +әлдеқашан +алдақашан +еш +ешкім +ешбір +ештеме +дәнеңе +ешқашан +ешқандай +ешқайсы +емес +бәрі +барлық +барша +бар +күллі +бүкіл +түгел +өз +өзім +өзің +өзінің +өзіме +өзіне +өзімнің +өзі +өзге +менде +сенде +онда +менен +сенен онан +одан +ау +па +ей +әй +е +уа +уау +уай +я +пай +ә +о +оһо +ой +ие +аһа +ау +беу +мәссаған +бәрекелді +әттегенай +жаракімалла +масқарай +астапыралла +япырмай +ойпырмай +кәне +кәнеки +ал +әйда +кәні +міне +әні +сорап +қош-қош +пфша +пішә +құрау-құрау +шәйт +шек +моһ +тәк +құрау +құр +кә +кәһ +күшім +күшім +мышы +пырс +әукім +алақай +паһ-паһ +бәрекелді +ура +әттең +әттеген-ай +қап +түге +пішту +шіркін +алатау +пай-пай +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +арс +гүрс +дүрс +қорс +тарс +тырс +ырс +барқ +борт +күрт +кірт +морт +сарт +шырт +дүңк +күңк +қыңқ +мыңқ +маңқ +саңқ +шаңқ +шіңк +сыңқ +таңқ +тыңқ +ыңқ +болп +былп +жалп +желп +қолп +ірк +ырқ +сарт-сұрт +тарс-тұрс +арс-ұрс +жалт-жалт +жалт-жұлт +қалт-қалт +қалт-құлт +қаңқ-қаңқ +қаңқ-құңқ +шаңқ-шаңқ +шаңқ-шұңқ +арбаң-арбаң +бүгжең-бүгжең +арсалаң-арсалаң +ербелең-ербелең +батыр-бұтыр +далаң-далаң +тарбаң-тарбаң +қызараң-қызараң +қаңғыр-күңгір +қайқаң-құйқаң +митың-митың +салаң-сұлаң +ыржың-тыржың +бірақ +алайда +дегенмен +әйтпесе +әйткенмен +себебі +өйткені +сондықтан +үшін +сайын +сияқты +туралы +арқылы +бойы +бойымен +шамалы +шақты +қаралы +ғұрлы +ғұрлым +гөрі +бері +кейін +соң +бұрын +бетер +қатар +бірге +қоса +шейін +дейін +қарай +таман +салым +тарта +жуық +таяу +арнайы +осындай +ғана +қана +тек +әншейін +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart +de +a +o +que +e 
+do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis 
+habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani diff --git a/src/shorttext/utils/textpreprocessing.py b/src/shorttext/utils/textpreprocessing.py new file mode 100644 index 00000000..d3b79e9d --- /dev/null +++ b/src/shorttext/utils/textpreprocessing.py @@ -0,0 +1,233 @@ + +import re +import os +import codecs +from typing import TextIO +from functools import partial + +import snowballstemmer + + +# tokenizer +def tokenize(s: str) -> list[str]: + """Tokenize a string by splitting on whitespace. + + Args: + s: Input string to tokenize. + + Returns: + List of tokens split by whitespace. + """ + return s.split(' ') + + +# stemmer +class StemmerSingleton: + """Singleton class for Porter stemmer. + + Provides a singleton instance of the snowball stemmer for English. + """ + + def __new__(cls): + if not hasattr(cls, 'instance'): + cls.instance = super(StemmerSingleton, cls).__new__(cls) + cls.stemmer = snowballstemmer.stemmer('english') + return cls.instance + + def __call__(cls, s: str) -> str: + """Stem a word using Porter stemmer. + + Args: + s: Word to stem. + + Returns: + Stemmed word. + """ + return cls.stemmer.stemWord(s) + + +def stemword(s: str) -> str: + """Stem a word using Porter stemmer. + + Args: + s: Word to stem. + + Returns: + Stemmed word. + """ + return StemmerSingleton()(s) + + +def preprocess_text(text: str, pipeline: list[callable]) -> str: + """Preprocess text according to a given pipeline. + + Applies a sequence of preprocessing functions to the input text. 
+    Each function in the pipeline transforms the text (e.g., stemming,
+    lemmatizing, removing punctuation).
+
+    Args:
+        text: Input text to preprocess.
+        pipeline: List of functions that each transform a text string to another text string.
+
+    Returns:
+        The preprocessed text after applying all pipeline functions.
+    """
+    return text if len(pipeline) == 0 else preprocess_text(pipeline[0](text), pipeline[1:])
+
+
+def tokenize_text(
+        text: str,
+        presplit_pipeline: list[callable],
+        primitive_tokenizer: callable,
+        postsplit_pipeline: list[callable],
+        stopwordsfile: TextIO
+) -> list[str]:
+    """Tokenize text with preprocessing pipelines.
+
+    Applies pre-split and post-split pipelines to tokenize text,
+    filtering out stopwords.
+
+    Args:
+        text: Input text to tokenize.
+        presplit_pipeline: List of functions to apply before tokenization.
+        primitive_tokenizer: Tokenizer function to split text into tokens.
+        postsplit_pipeline: List of functions to apply to each token after tokenization.
+        stopwordsfile: File containing stopwords to filter out.
+
+    Returns:
+        List of tokens after preprocessing and stopword filtering.
+    """
+    # load the stop words (note: this consumes the file object, so a callable
+    # that binds a file handle sees the stop words only on its first call)
+    stopwordset = set(stopword.strip() for stopword in stopwordsfile)
+
+    # apply the pre-split pipeline, tokenize, then the post-split pipeline
+    presplit_text = text
+    for func in presplit_pipeline:
+        presplit_text = func(presplit_text)
+    postsplit_tokens = primitive_tokenizer(presplit_text)
+    for func in postsplit_pipeline:
+        for i, token in enumerate(postsplit_tokens):
+            postsplit_tokens[i] = func(token)
+    # filter out the stop words
+    postsplit_tokens = [
+        token for token in postsplit_tokens
+        if token not in stopwordset
+    ]
+    return postsplit_tokens
+
+
+def text_preprocessor(pipeline: list[callable]) -> callable:
+    """Create a text preprocessor function from a pipeline.
+
+    Returns a function that applies the given pipeline to preprocess text.
+    This is a convenience function that wraps preprocess_text with
+    a fixed pipeline.
+
+    Args:
+        pipeline: List of functions that transform text to text.
+
+    Returns:
+        A callable that takes text and returns preprocessed text.
+    """
+    return partial(preprocess_text, pipeline=pipeline)
+
+
+def oldschool_standard_text_preprocessor(stopwordsfile: TextIO) -> callable:
+    """Create a standard text preprocessor.
+
+    Returns a text preprocessor with the following steps:
+    - Remove special characters
+    - Remove numerals
+    - Convert to lowercase
+    - Remove stop words
+    - Stem words using Porter stemmer
+
+    Args:
+        stopwordsfile: File object containing stopwords to filter.
+
+    Returns:
+        A callable that takes text and returns preprocessed text.
+    """
+    # load the stop words and close the file
+    stopwordset = set(stopword.strip() for stopword in stopwordsfile)
+    stopwordsfile.close()
+
+    # the pipeline
+    pipeline = [
+        lambda s: re.sub(r'[^\w\s]', '', s),    # remove special characters
+        lambda s: re.sub(r'[0-9]', '', s),      # remove numerals
+        lambda s: s.lower(),                    # convert to lowercase
+        lambda s: ' '.join(token for token in tokenize(s) if token not in stopwordset),
+        lambda s: ' '.join(stemword(token) for token in tokenize(s))
+    ]
+    return text_preprocessor(pipeline)
+
+
+def standard_text_preprocessor_1() -> callable:
+    """Create a standard text preprocessor using NLTK stopwords.
+
+    Returns a text preprocessor with the following steps:
+    - Remove special characters
+    - Remove numerals
+    - Convert to lowercase
+    - Remove stop words (NLTK list)
+    - Stem words using Porter stemmer
+
+    Returns:
+        A callable that takes text and returns preprocessed text.
+    """
+    # load stop words
+    this_dir, _ = os.path.split(__file__)
+    stopwordsfile = codecs.open(os.path.join(this_dir, 'stopwords.txt'), 'r', 'utf-8')
+
+    return oldschool_standard_text_preprocessor(stopwordsfile)
+
+
+def standard_text_preprocessor_2() -> callable:
+    """Create a standard text preprocessor with negation-aware stopwords.
+
+    Returns a text preprocessor with the following steps:
+    - Remove special characters
+    - Remove numerals
+    - Convert to lowercase
+    - Remove stop words (NLTK list minus negation terms)
+    - Stem words using Porter stemmer
+
+    Returns:
+        A callable that takes text and returns preprocessed text.
+    """
+    # load stop words
+    this_dir, _ = os.path.split(__file__)
+    stopwordsfile = codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
+
+    return oldschool_standard_text_preprocessor(stopwordsfile)
+
+
+def advanced_text_tokenizer_1() -> callable:
+    """Create an advanced text tokenizer.
+
+    Returns a tokenizer function that applies preprocessing steps:
+    - Remove special characters
+    - Remove numerals
+    - Convert to lowercase
+    - Stem tokens using Porter stemmer
+    - Filter out negation-aware stopwords
+
+    Returns:
+        A callable that takes text and returns a list of tokens.
+    """
+    presplit_pipeline = [
+        lambda s: re.sub(r'[^\w\s]', '', s),    # remove special characters
+        lambda s: re.sub(r'[0-9]', '', s),      # remove numerals
+        lambda s: s.lower()                     # convert to lowercase
+    ]
+    tokenizer = tokenize
+    postsplit_pipeline = [stemword]             # stem each token
+    this_dir, _ = os.path.split(__file__)
+    # note: the stop word file handle below is consumed on the first call of
+    # the returned function; re-create the tokenizer if it is needed again
+    return partial(
+        tokenize_text,
+        presplit_pipeline=presplit_pipeline,
+        primitive_tokenizer=tokenizer,
+        postsplit_pipeline=postsplit_pipeline,
+        stopwordsfile=codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
+    )
diff --git a/src/shorttext/utils/wordembed.py b/src/shorttext/utils/wordembed.py
new file mode 100644
index 00000000..8cf77c5e
--- /dev/null
+++ b/src/shorttext/utils/wordembed.py
@@ -0,0 +1,254 @@
+
+from os import PathLike
+from typing import Annotated, Optional, TextIO
+
+import numpy as np
+import numpy.typing as npt
+import gensim
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.models.fasttext import FastTextKeyedVectors
+from gensim.models.poincare import PoincareModel, PoincareKeyedVectors
+import requests
+
+from .textpreprocessing import tokenize
+
+
+def load_word2vec_model(
+        path: str | PathLike,
+        binary: bool = True
+) -> KeyedVectors:
+    """Load a pre-trained Word2Vec model.
+
+    Args:
+        path: Path to the Word2Vec model file.
+        binary: Whether the file is in binary format. Default: True.
+
+    Returns:
+        A KeyedVectors model containing word embeddings.
+    """
+    return KeyedVectors.load_word2vec_format(path, binary=binary)
+
+
+def load_fasttext_model(
+        path: str | PathLike,
+        encoding: str = 'utf-8'
+) -> FastTextKeyedVectors:
+    """Load a pre-trained FastText model.
+
+    Args:
+        path: Path to the FastText model file.
+        encoding: File encoding. Default: 'utf-8'.
+
+    Returns:
+        A FastTextKeyedVectors model.
+    """
+    return gensim.models.fasttext.load_facebook_vectors(path, encoding=encoding)
+
+
+def load_poincare_model(
+        path: str | PathLike,
+        word2vec_format: bool = True,
+        binary: bool = False
+) -> PoincareKeyedVectors:
+    """Load a Poincaré embedding model.
+
+    Args:
+        path: Path to the Poincaré model file.
+        word2vec_format: Whether to load from word2vec format. Default: True.
+        binary: Whether the file is binary. Default: False.
+
+    Returns:
+        A PoincareKeyedVectors model.
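+
+    Note: when word2vec_format is False, the path is assumed to point to a
+    full PoincareModel saved with gensim's model.save(), of which only the
+    keyed vectors (.kv) are returned.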
+    """
+    if word2vec_format:
+        return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
+    else:
+        return PoincareModel.load(path).kv
+
+
+def shorttext_to_avgvec(
+        shorttext: str,
+        wvmodel: KeyedVectors
+) -> Annotated[npt.NDArray[np.float64], "1D array"]:
+    """Convert short text to an averaged embedding vector.
+
+    Converts each token to its word embedding, sums the embeddings,
+    and normalizes the result to unit length.
+
+    Args:
+        shorttext: Input text.
+        wvmodel: Word embedding model.
+
+    Returns:
+        A normalized vector representation of the text.
+    """
+    vec = np.sum(
+        [
+            wvmodel[token].astype(np.float64)
+            if token in wvmodel
+            # an out-of-vocabulary token contributes a constant unit vector
+            else np.array([1.] * wvmodel.vector_size) / np.sqrt(wvmodel.vector_size)
+            for token in tokenize(shorttext)
+        ],
+        axis=0
+    )
+
+    # normalize (the zero vector is left untouched)
+    norm = np.linalg.norm(vec)
+    if norm != 0:
+        vec /= norm
+
+    return vec
+
+
+class RESTfulKeyedVectors(KeyedVectors):
+    """Remote word vector client via REST API.
+
+    Connects to a remote WordEmbedAPI service to access word
+    embeddings via HTTP requests.
+
+    Attributes:
+        url: Base URL of the API.
+        port: Port number for the API, stored as a string.
+    """
+
+    def __init__(self, url: str, port: str | int = '5000'):
+        """Initialize the client.
+
+        Args:
+            url: Base URL of the API (e.g., 'http://localhost').
+            port: Port number. Default: '5000'.
+        """
+        # the parent constructor is not called; all lookups are delegated
+        # to the remote service
+        self.url = url
+        # coerce to str so that an integer port does not break the URL
+        # concatenation in the methods below
+        self.port = str(port)
+
+    def closer_than(self, entity1: str, entity2: str) -> list | dict:
+        """Find words closer to entity1 than entity2 is.
+
+        Args:
+            entity1: First word.
+            entity2: Reference word.
+
+        Returns:
+            List of words closer to entity1 than entity2.
+        """
+        r = requests.post(self.url + ':' + self.port + '/closerthan',
+                          json={'entity1': entity1, 'entity2': entity2})
+        return r.json()
+
+    def distance(self, entity1: str, entity2: str) -> float:
+        """Compute the distance between two words.
+
+        Args:
+            entity1: First word.
+            entity2: Second word.
+
+        Returns:
+            Distance between the word vectors.
+        """
+        r = requests.post(self.url + ':' + self.port + '/distance',
+                          json={'entity1': entity1, 'entity2': entity2})
+        return r.json()['distance']
+
+    def distances(
+            self,
+            entity1: str,
+            other_entities: Optional[list[str]] = None
+    ) -> Annotated[npt.NDArray[np.float32], "1D array"]:
+        """Compute distances from one word to multiple words.
+
+        Args:
+            entity1: First word.
+            other_entities: List of words to compare against.
+
+        Returns:
+            Array of distances.
+        """
+        if other_entities is None:
+            other_entities = []
+
+        r = requests.post(self.url + ':' + self.port + '/distances',
+                          json={'entity1': entity1, 'other_entities': other_entities})
+        return np.array(r.json()['distances'], dtype=np.float32)
+
+    def get_vector(self, entity: str) -> Annotated[npt.NDArray[np.float64], "1D array"]:
+        """Get the word vector for a word.
+
+        Args:
+            entity: Word to get the vector for.
+
+        Returns:
+            Word embedding vector.
+
+        Raises:
+            KeyError: If the word is not in the vocabulary.
+        """
+        r = requests.post(self.url + ':' + self.port + '/get_vector', json={'token': entity})
+        returned_dict = r.json()
+        if 'vector' in returned_dict:
+            return np.array(returned_dict['vector'])
+        else:
+            raise KeyError(f'The token {entity} does not exist in the model.')
+
+    def most_similar(self, **kwargs) -> list[tuple[str, float]]:
+        """Find the most similar words.
+
+        Args:
+            **kwargs: Arguments passed to the API (e.g., positive, negative).
+
+        Returns:
+            List of (word, similarity) tuples.
+        """
+        r = requests.post(self.url + ':' + self.port + '/most_similar', json=kwargs)
+        return [tuple(pair) for pair in r.json()]
+
+    def most_similar_to_given(self, entity1: str, entities_list: list[str]) -> str:
+        """Find the word from a list most similar to a given word.
+
+        Args:
+            entity1: Reference word.
+            entities_list: List of candidate words.
+
+        Returns:
+            The word in entities_list most similar to entity1.
+        """
+        r = requests.post(self.url + ':' + self.port + '/most_similar_to_given',
+                          json={'entity1': entity1, 'entities_list': entities_list})
+        return r.json()['token']
+
+    def rank(self, entity1: str, entity2: str) -> int:
+        """Get the similarity rank between two words.
+
+        Args:
+            entity1: First word.
+            entity2: Second word.
+
+        Returns:
+            Rank of entity2 relative to entity1.
+        """
+        r = requests.post(self.url + ':' + self.port + '/rank',
+                          json={'entity1': entity1, 'entity2': entity2})
+        return r.json()['rank']
+
+    def save(self, fname_or_handle: TextIO, **kwargs) -> None:
+        """Save is not supported for remote vectors.
+
+        Raises:
+            IOError: Always, since remote vectors cannot be saved locally.
+        """
+        raise IOError('RESTfulKeyedVectors does not persist models to a file.')
+
+    def similarity(self, entity1: str, entity2: str) -> float:
+        """Compute the similarity between two words.
+
+        Args:
+            entity1: First word.
+            entity2: Second word.
+
+        Returns:
+            Similarity score between 0 and 1.
+        """
+        r = requests.post(self.url + ':' + self.port + '/similarity',
+                          json={'entity1': entity1, 'entity2': entity2})
+        return r.json()['similarity']
+
+# reference: https://radimrehurek.com/gensim/models/keyedvectors.html
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 00000000..828a6894
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1,3 @@
+"""
+This package contains automated unit tests for shorttext.
+"""
diff --git a/test/test_charonehot.py b/test/test_charonehot.py
new file mode 100644
index 00000000..59dbf9a1
--- /dev/null
+++ b/test/test_charonehot.py
@@ -0,0 +1,13 @@
+
+from urllib.request import urlopen
+
+import shorttext
+
+
+def test_onehot_bigtxt():
+    chartovec_encoder = shorttext.generators.initialize_SentenceToCharVecEncoder(
+        urlopen('http://norvig.com/big.txt'),
+        encoding='utf-8'
+    )
+    assert len(chartovec_encoder.dictionary) == 93
+    assert chartovec_encoder.signalchar == "\n"
diff --git a/test/test_compute.py b/test/test_compute.py
new file mode 100644
index 00000000..4377686f
--- /dev/null
+++ b/test/test_compute.py
@@ -0,0 +1,23 @@
+
+import numpy as np
+import pytest
+
+from shorttext.utils.compute import cosine_similarity
+
+
+def test_cosine_similarity_1():
+    vec1 = np.array([0.3, 0.7])
+    vec2 = np.array([-0.7, 0.3])
+    assert cosine_similarity(vec1, vec2) == pytest.approx(0.)
+
+
+def test_cosine_similarity_2():
+    vec1 = np.array([1., 1.])
+    vec2 = np.array([2.5, 2.5])
+    assert cosine_similarity(vec1, vec2) == pytest.approx(1.)
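+
+# the expected value in the next test follows from
+# cos([3, 3], [2, 0]) = 6 / (sqrt(18) * 2) = 1 / sqrt(2) = sqrt(0.5)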
+
+def test_cosine_similarity_3():
+    vec1 = np.array([3., 3.])
+    vec2 = np.array([2., 0.])
+    assert cosine_similarity(vec1, vec2) == pytest.approx(np.sqrt(0.5))
diff --git a/test/test_dtm.py b/test/test_dtm.py
new file mode 100644
index 00000000..813b6828
--- /dev/null
+++ b/test/test_dtm.py
@@ -0,0 +1,28 @@
+
+import pytest
+
+import shorttext
+from shorttext.utils import stemword
+from shorttext.utils.textpreprocessing import standard_text_preprocessor_1
+
+
+def test_inaugural():
+    # preparing the data
+    usprez = shorttext.data.inaugural()
+    docids = sorted(usprez.keys())
+    usprez = [' '.join(usprez[docid]) for docid in docids]
+
+    # defining the preprocessor
+    txtpreprocessor = standard_text_preprocessor_1()
+
+    # making the corpus
+    corpus = [txtpreprocessor(speech) for speech in usprez]
+
+    # making the DTM
+    dtm = shorttext.utils.NumpyDocumentTermMatrix(corpus, docids, tfidf=True)
+
+    # checking the results
+    assert dtm.get_token_occurences(stemword('change'))['2009-Obama'] == pytest.approx(0.9400072584914713)
+    assert dtm.nbdocs == 56
+    assert dtm.nbtokens == 5075
+    assert dtm.get_total_termfreq(stemword('government')) == pytest.approx(37.82606692473982)
diff --git a/test/test_fuzzylogic.py b/test/test_fuzzylogic.py
new file mode 100644
index 00000000..5602d5d7
--- /dev/null
+++ b/test/test_fuzzylogic.py
@@ -0,0 +1,32 @@
+
+import pytest
+
+from shorttext.metrics.dynprog.dldist import damerau_levenshtein
+from shorttext.metrics.dynprog.lcp import longest_common_prefix
+from shorttext.metrics.dynprog.jaccard import similarity as jaccard_similarity
+
+
+def test_similarity():
+    assert damerau_levenshtein('debug', 'deubg') == 1
+    assert damerau_levenshtein('intrdependence', 'interdpeendencae') == 3
+    assert longest_common_prefix('debug', 'debuag') == 4
+
+def test_dldistance_transposition():
+    assert damerau_levenshtein('independent', 'indeepndent') == 1
+    assert damerau_levenshtein('providence', 'porvidecne') == 2
+
+def test_dldistance_insertion():
+    assert damerau_levenshtein('algorithm', 'algorithms') == 1
+    assert damerau_levenshtein('algorithm', 'algoarithmm') == 2
+
+def test_dldistance_deletion():
+    assert damerau_levenshtein('algorithm', 'algoithm') == 1
+    assert damerau_levenshtein('algorithm', 'algorith') == 1
+    assert damerau_levenshtein('algorithm', 'algrihm') == 2
+
+def test_dldistance_correct():
+    assert damerau_levenshtein('python', 'python') == 0
+    assert damerau_levenshtein('sosad', 'sosad') == 0
+
+def test_dldistance_jaccard():
+    assert jaccard_similarity('diver', 'driver') == pytest.approx(5/6)
diff --git a/test/test_norvigspell.py b/test/test_norvigspell.py
new file mode 100644
index 00000000..57597285
--- /dev/null
+++ b/test/test_norvigspell.py
@@ -0,0 +1,14 @@
+
+from urllib.request import urlopen
+
+import shorttext
+
+
+def test_norvig():
+    text = urlopen('http://norvig.com/big.txt').read()
+    text = text.decode("utf-8")
+    speller = shorttext.spell.NorvigSpellCorrector()
+    speller.train(text)
+
+    assert speller.correct('apple') == 'apple'
+    assert speller.correct('appl') == 'apply'
diff --git a/test/test_stacking.py b/test/test_stacking.py
new file mode 100644
index 00000000..e8df9aec
--- /dev/null
+++ b/test/test_stacking.py
@@ -0,0 +1,137 @@
+
+from sklearn.svm import SVC
+from loguru import logger
+import pytest
+
+import shorttext
+from shorttext.stack import StackedGeneralization, LogisticStackedGeneralization
+from shorttext.smartload import smartload_compact_model
+from shorttext.classifiers import TopicVectorSkLearnClassifier, TopicVectorCosineDistanceClassifier, MaxEntClassifier
+from shorttext.generators import GensimTopicModeler, LDAModeler
+
+
+def training_stacking() -> tuple[MaxEntClassifier, GensimTopicModeler, TopicVectorSkLearnClassifier, StackedGeneralization]:
+    # loading NIH Reports
+    nihdict = shorttext.data.nihreports(sample_size=None)
+    nihdict = {'NCCAM': nihdict['NCCAM'], 'NCATS': nihdict['NCATS']}
+
+    # maximum entropy classifier
+    maxent_classifier = MaxEntClassifier()
+    maxent_classifier.train(nihdict, nb_epochs=100)
+    maxent_classifier.save_compact_model('./bio_maxent.bin')
+
+    # SVM + LDA
+    topicmodeler = LDAModeler()
+    topicmodeler.train(nihdict, 8)
+    topicdisclassifier = TopicVectorCosineDistanceClassifier(topicmodeler)
+    topicmodeler.save_compact_model('bio_lda.bin')
+    svm_classifier = TopicVectorSkLearnClassifier(topicmodeler, SVC())
+    svm_classifier.train(nihdict)
+    svm_classifier.save_compact_model('bio_svm.bin')
+
+    # logistic stacking
+    stacked_classifier = LogisticStackedGeneralization({
+        'maxent': maxent_classifier,
+        'svm': svm_classifier,
+        'topiccosine': topicdisclassifier
+    })
+    stacked_classifier.train(nihdict, nb_epoch=300)
+    stacked_classifier.save_compact_model('bio_logistics.bin')
+
+    return maxent_classifier, topicmodeler, svm_classifier, stacked_classifier
+
+
+def compare_two_dicts(dict1, dict2) -> None:
+    assert len(dict1) == len(dict2)
+    for classlabel in dict1:
+        assert classlabel in dict2
+        assert dict1[classlabel] == pytest.approx(dict2[classlabel], abs=1e-3)
+
+
+def test_studies() -> None:
+    # train
+    maxent_classifier, topicmodeler, svm_classifier, stacked_classifier = training_stacking()
+
+    # smartload
+    maxent_classifier2 = smartload_compact_model('bio_maxent.bin', None)
+    topicmodeler2 = smartload_compact_model('bio_lda.bin', None)
+    topicdisclassifier2 = TopicVectorCosineDistanceClassifier(topicmodeler2)
+    svm_classifier2 = smartload_compact_model('bio_svm.bin', None)
+    stacked_classifier2 = LogisticStackedGeneralization({
+        'maxent': maxent_classifier2,
+        'svm': svm_classifier2,
+        'topiccosine': topicdisclassifier2
+    })
+    stacked_classifier2.load_compact_model('bio_logistics.bin')
+
+    # compare
+    terms = ['stem cell', 'grant', 'system biology']
+    for term in terms:
+        logger.info(term)
+
+        logger.info('maximum entropy')
+        compare_two_dicts(maxent_classifier.score(term), maxent_classifier2.score(term))
+
+        # logger.info('LDA')
+        # compare_two_dicts(topicdisclassifier.score(term), topicdisclassifier2.score(term))
+        #
+        # logger.info('SVM')
+        # compare_two_dicts(svm_classifier.score(term), svm_classifier2.score(term))
+
+        logger.info('combined')
+        compare_two_dicts(stacked_classifier.score(term), stacked_classifier2.score(term))
+
+
+def test_svm() -> None:
+    # loading NIH Reports
+    nihdict = shorttext.data.nihreports(sample_size=None)
+    nihdict = {'NCCAM': nihdict['NCCAM'], 'NCATS': nihdict['NCATS']}
+
+    # SVM classifier on LDA topic vectors
+    topicmodeler = LDAModeler()
+    topicmodeler.train(nihdict, 16)
+    svm_classifier = TopicVectorSkLearnClassifier(topicmodeler, SVC())
+    svm_classifier.train(nihdict)
+
+    logger.info('before saving...')
+    logger.info('--'.join(svm_classifier.classlabels))
+    svm_classifier.save_compact_model('bio_svm2.bin')
+    logger.info('after saving...')
+    logger.info('--'.join(svm_classifier.classlabels))
+
+    # load
+    svm_classifier2 = smartload_compact_model('bio_svm2.bin', None)
+    logger.info('second classifier...')
+    logger.info(','.join(svm_classifier2.classlabels))
+    logger.info(','.join(svm_classifier2.topicmodeler.classlabels))
+
+    # compare
+    terms = ['stem cell', 'grant', 'system biology']
+    for term in terms:
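+        # log the topic vectors and the per-class scores from both the
+        # trained classifier and the reloaded one for manual comparison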
+        logger.info(term)
+        topicvec = svm_classifier.getvector(term)
+        topicvec2 = svm_classifier2.getvector(term)
+
+        logger.info(topicvec)
+        logger.info(topicvec2)
+
+        for idx, classlabel in enumerate(svm_classifier.classlabels):
+            logger.info(f"{idx} {classlabel}")
+            logger.info(svm_classifier.classifier.score([topicvec], [idx]))
+
+        for idx, classlabel in enumerate(svm_classifier2.classlabels):
+            logger.info(f"{idx} {classlabel}")
+            logger.info(svm_classifier2.classifier.score([topicvec2], [idx]))
+
+        logger.info({
+            classlabel: svm_classifier.classifier.score([topicvec], [idx])
+            for idx, classlabel in enumerate(svm_classifier.classlabels)
+        })
+        logger.info({
+            classlabel: svm_classifier2.classifier.score([topicvec2], [idx])
+            for idx, classlabel in enumerate(svm_classifier2.classlabels)
+        })
+
+    # for term in terms:
+    #     logger.info(term)
+    #     compare_two_dicts(svm_classifier.score(term), svm_classifier2.score(term))
diff --git a/test/test_textpreprocessing.py b/test/test_textpreprocessing.py
new file mode 100644
index 00000000..e829d715
--- /dev/null
+++ b/test/test_textpreprocessing.py
@@ -0,0 +1,15 @@
+
+import shorttext
+
+
+def test_textpreprocessing_standard_pipeline():
+    preprocessor = shorttext.utils.standard_text_preprocessor_1()
+    assert preprocessor('I love you.') == 'love'
+    assert preprocessor('Natural language processing and text mining on fire.') == 'natur languag process text mine fire'
+    assert preprocessor('I do not think.') == 'think'
+
+def test_textpreprocessing_standard_pipeline_stopwords():
+    preprocessor = shorttext.utils.standard_text_preprocessor_2()
+    assert preprocessor('I love you.') == 'love'
+    assert preprocessor('Natural language processing and text mining on fire.') == 'natur languag process text mine fire'
+    assert preprocessor('I do not think.') == 'not think'
diff --git a/test/test_topicmodeling.py b/test/test_topicmodeling.py
new file mode 100644
index 00000000..4ef3390f
--- /dev/null
+++ b/test/test_topicmodeling.py
@@ -0,0 +1,95 @@
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import pytest
+
+import shorttext
+
+
+def test_ldatopicmodel():
+    # load data
+    trainclassdict = shorttext.data.nihreports(sample_size=None)
+
+    # train the LDA model
+    topicmodeler = shorttext.generators.LDAModeler()
+    topicmodeler.train(trainclassdict, 128)
+
+    # retrieve topic vectors
+    topic_vector_1 = topicmodeler.retrieve_topicvec('stem cell research NIH cancer immunology')
+    assert not np.any(np.isnan(topic_vector_1))
+    assert np.linalg.norm(topic_vector_1) == pytest.approx(1.)
+
+    topic_vector_2 = topicmodeler.retrieve_topicvec('bioinformatics')
+    assert not np.any(np.isnan(topic_vector_2))
+    assert np.linalg.norm(topic_vector_2) == pytest.approx(1.)
+
+    topic_vector_3 = topicmodeler.retrieve_topicvec('linear algebra')
+    assert not np.any(np.isnan(topic_vector_3))
+    assert np.linalg.norm(topic_vector_3) == pytest.approx(1.)
+
+    # test I/O
+    topicmodeler.save_compact_model('nihlda128.bin')
+    topicmodeler2 = shorttext.generators.load_gensimtopicmodel('nihlda128.bin')
+    topic_vector_1a = topicmodeler2.retrieve_topicvec("stem cell research NIH cancer immunology")
+    assert not np.any(np.isnan(topic_vector_1a))
+    assert np.linalg.norm(topic_vector_1a) == pytest.approx(1.)
+    # np.testing.assert_array_almost_equal(topic_vector_1a, topic_vector_1)  # do not check this; LDA models are stochastic
+
+    # cosine similarity scorer
+    cos_classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(topicmodeler)
+    score_dict = cos_classifier.score("stem cell research NIH cancer immunology")
+    assert isinstance(score_dict, dict)
+    assert len(score_dict) == len(trainclassdict)
+
+    # scikit-learn classifier
+    sklearn_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(
+        topicmodeler, LogisticRegression()
+    )
+    sklearn_classifier.train(trainclassdict)
+    score_dict = sklearn_classifier.score("stem cell research NIH cancer immunology")
+    assert isinstance(score_dict, dict)
+
+
+def test_autoencoder():
+    # load data
+    subdict = shorttext.data.subjectkeywords()
+
+    # train the model
+    autoencoder = shorttext.generators.AutoencodingTopicModeler()
+    autoencoder.train(subdict, 8)
+
+    # retrieve the BOW vector
+    bow_vector = autoencoder.retrieve_bow_vector("critical race")
+    assert not np.any(np.isnan(bow_vector))
+    assert np.all(bow_vector == 1 / np.sqrt(len(autoencoder.token2indices)))
+
+    # retrieve topic vectors
+    topic_vector_1 = autoencoder.retrieve_topicvec("linear algebra")
+    assert not np.any(np.isnan(topic_vector_1))
+    assert np.linalg.norm(topic_vector_1) == pytest.approx(1.)
+    np.testing.assert_array_almost_equal(autoencoder["linear algebra"], topic_vector_1)
+
+    topic_vector_2 = autoencoder.retrieve_topicvec("path integral")
+    assert not np.any(np.isnan(topic_vector_2))
+    assert np.linalg.norm(topic_vector_2) == pytest.approx(1.)
+    np.testing.assert_array_almost_equal(autoencoder["path integral"], topic_vector_2)
+
+    topic_vector_3 = autoencoder.retrieve_topicvec("critical race")
+    assert not np.any(np.isnan(topic_vector_3))
+    assert np.linalg.norm(topic_vector_3) == pytest.approx(1.)
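+    # the same vector should also be retrievable via indexing (__getitem__)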
+    np.testing.assert_array_almost_equal(autoencoder["critical race"], topic_vector_3)
+
+    # cosine similarity scorer
+    cos_classifier = shorttext.classifiers.TopicVectorCosineDistanceClassifier(autoencoder)
+    score_dict = cos_classifier.score("stem cell research")
+    assert isinstance(score_dict, dict)
+    assert len(score_dict) == 3
+
+    # scikit-learn classifier
+    sklearn_classifier = shorttext.classifiers.TopicVectorSkLearnClassifier(
+        autoencoder, LogisticRegression()
+    )
+    sklearn_classifier.train(subdict)
+    score_dict = sklearn_classifier.score("path integral")
+    assert isinstance(score_dict, dict)
diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py
new file mode 100644
index 00000000..497c980e
--- /dev/null
+++ b/test/test_var_nn_embedded_vec_classifier.py
@@ -0,0 +1,117 @@
+
+import urllib.request
+from pathlib import Path
+
+from loguru import logger
+import pytest
+
+import shorttext
+
+
+# download the word2vec model used by all tests in this module
+link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
+filename = "test_w2v_model.bin"
+if not Path(filename).exists():
+    urllib.request.urlretrieve(link, filename)
+w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True)  # load the word2vec model
+trainclass_dict = shorttext.data.subjectkeywords()
+
+
+def compare_two_dicts(dict1, dict2) -> None:
+    assert len(dict1) == len(dict2)
+    for classlabel in dict1:
+        assert classlabel in dict2
+        assert dict1[classlabel] == pytest.approx(dict2[classlabel], abs=1e-3)
+
+
+def test_CNN_word_embed_without_gensim():
+    logger.info("Testing CNN...")
+    # create the keras model using the `CNNWordEmbed` class
+    logger.info("\tKeras model")
+    keras_model = shorttext.classifiers.frameworks.CNNWordEmbed(
+        wvmodel=w2v_model,
+        nb_labels=len(trainclass_dict.keys())
+    )
+
+    # create and train the classifier using the keras model constructed above
+    logger.info("\tTraining")
+    main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(w2v_model)
+    main_classifier.train(trainclass_dict, keras_model, nb_epoch=2)
+
+    # compute the classification scores
+    logger.info("\tTesting")
+    score_vals = main_classifier.score('artificial intelligence')
+    assert score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'] == pytest.approx(1.0)
+
+
+def test_double_CNN_word_embed_without_gensim():
+    logger.info("Testing DoubleCNN...")
+    # create the keras model using the `DoubleCNNWordEmbed` class
+    logger.info("\tKeras model")
+    keras_model = shorttext.classifiers.frameworks.DoubleCNNWordEmbed(
+        wvmodel=w2v_model,
+        nb_labels=len(trainclass_dict.keys())
+    )
+
+    # create and train the classifier using the keras model constructed above
+    logger.info("\tTraining")
+    main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(w2v_model)
+    main_classifier.train(trainclass_dict, keras_model, nb_epoch=2)
+
+    # compute the classification scores
+    logger.info("\tTesting")
+    score_vals = main_classifier.score('artificial intelligence')
+    assert score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'] == pytest.approx(1.0)
+
+
+def test_CLSTM_word_embed_without_gensim():
+    logger.info("Testing CLSTM...")
+    # create the keras model using the `CLSTMWordEmbed` class
+    logger.info("\tKeras model")
+    keras_model = shorttext.classifiers.frameworks.CLSTMWordEmbed(
+        wvmodel=w2v_model,
+        nb_labels=len(trainclass_dict.keys())
+    )
+
+    # create and train the classifier using the keras model constructed above
+    logger.info("\tTraining")
+    main_classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(w2v_model)
+    main_classifier.train(trainclass_dict, keras_model, nb_epoch=2)
+
+    # compute the classification scores
+    logger.info("\tTesting")
+    score_vals = main_classifier.score('artificial intelligence')
+    assert score_vals['mathematics'] + score_vals['physics'] + score_vals['theology'] == pytest.approx(1.0)
+
+
+def test_AA_sum_embed():
+    logger.info("Testing SumEmbed")
+    classifier = shorttext.classifiers.SumEmbeddedVecClassifier(w2v_model)
+    classdict = shorttext.data.subjectkeywords()
+    classifier.train(classdict)
+
+    # compare the scores against known values
+    compare_two_dicts(
+        classifier.score('linear algebra'),
+        {
+            'mathematics': 0.9044698253778962,
+            'physics': 0.7586816549044926,
+            'theology': 0.1817602793151848
+        }
+    )
+    compare_two_dicts(
+        classifier.score('learning'),
+        {
+            'mathematics': 0.9037142562255835,
+            'physics': 0.7588376500004107,
+            'theology': 0.18039468994239538
+        }
+    )
+    compare_two_dicts(
+        classifier.score('eschatology'),
+        {
+            'mathematics': 0.3658578123294476,
+            'physics': 0.5996711864493821,
+            'theology': 0.9694560847986978
+        }
+    )
diff --git a/test/test_wmd.py b/test/test_wmd.py
new file mode 100644
index 00000000..1edf88d9
--- /dev/null
+++ b/test/test_wmd.py
@@ -0,0 +1,33 @@
+
+import urllib.request
+from pathlib import Path
+
+import pytest
+
+from shorttext.metrics.wasserstein import word_mover_distance
+from shorttext.utils import load_word2vec_model
+
+
+# download the word2vec model used by the tests in this module
+link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
+filename = "test_w2v_model.bin"
+if not Path(filename).exists():
+    urllib.request.urlretrieve(link, filename)
+w2v_model = load_word2vec_model(filename, binary=True)  # load the word2vec model
+
+
+def test_word_mover_distance_1():
+    tokens1 = ['president', 'speaks']
+    tokens2 = ['president', 'talks']
+    known_answer = 0.19936788082122803
+    wdistance = word_mover_distance(tokens1, tokens2, w2v_model)
+    assert wdistance == pytest.approx(known_answer)
+
+
+def test_word_mover_distance_2():
+    tokens1 = ['fan', 'book']
+    tokens2 = ['apple', 'orange']
+    known_answer = 1.8019972145557404
+    wdistance = word_mover_distance(tokens1, tokens2, w2v_model)
+    assert wdistance == pytest.approx(known_answer)
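+
+
+# A usage sketch (not part of the test suite): word_mover_distance takes two
+# token lists and a gensim KeyedVectors model, and returns the cost of the
+# optimal transport plan matching one bag of word embeddings to the other,
+# e.g.
+#
+#     d = word_mover_distance(['doctor', 'speaks'], ['physician', 'talks'], w2v_model)
+#
+# Semantically related phrases yield small distances (as in the first test
+# above), while unrelated ones yield larger values (as in the second).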