diff --git a/.coveragerc b/.coveragerc index ca62439..b2ddda4 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,4 +2,4 @@ omit = */python?.?/* */site-packages/nose/* - *__init__* + *__init__* \ No newline at end of file diff --git a/.gitignore b/.gitignore index d906f69..6819ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ -<<<<<<< HEAD # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] +# pycharm generated +.idea/ + # visual studio generated bin/ obj/ @@ -62,8 +64,5 @@ docs/_build/ # PyBuilder target/ -||||||| merged common ancestors -======= # Google App Engine generated folder appengine-generated/ ->>>>>>> adaaddecc50208c18b08806f63f80f3342bd5e30 diff --git a/.travis.yml b/.travis.yml index 65ec28e..87a5c32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,11 +5,18 @@ python: - "3.4" - "3.5" sudo: false +# Enable 3.7 without globally enabling sudo and dist: xenial for other build jobs +matrix: + include: + - python: 3.7 + dist: xenial + sudo: true install: - pip install python-coveralls - pip install coveralls + - pip install cachetools script: nosetests tests --verbose --with-coverage after_success: - coveralls notifications: - email: false + email: false \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..500bc70 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.pylintEnabled": true +} \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1e79b12 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 Hanif Amal Robbani + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index deeaa80..6fc8dd0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # sastrawi Indonesian stemmer. Python port of PHP Sastrawi project. -[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=development&service=github)](https://coveralls.io/github/har07/sastrawi?branch=development) +[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=development&service=github)](https://coveralls.io/github/har07/sastrawi?branch=development) \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..35eb584 --- /dev/null +++ b/README.rst @@ -0,0 +1,63 @@ +Sastrawi +======== + +| Sastrawi is a simple Python library which allows you to reduce + inflected words in Indonesian Language (Bahasa Indonesia) to their + base form (`stem`_). +| This is Python port of the original `Sastrawi`_ project written in + PHP. + +|Build Status| +|Coverage Status| + +Installation +------------ + +Sastrawi can be installed via `pip`_, by running the following commands +in terminal/command prompt : ``pip install Sastrawi`` + +Example Usage +------------- + +Run the following commands in *Python interactive terminal* : + +.. code:: python + + # import Sastrawi package + from Sastrawi.Stemmer.StemmerFactory import StemmerFactory + + # create stemmer + factory = StemmerFactory() + stemmer = factory.create_stemmer() + + # stem + sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan' + output = stemmer.stem(sentence) + + print(output) + # ekonomi indonesia sedang dalam tumbuh yang bangga + + print(stemmer.stem('Mereka meniru-nirukannya')) + # mereka tiru + +Demo +--------- + +Live demo : https://pysastrawi-demo.appspot.com/ + +Repository : https://github.com/har07/pystastrawi-demo + +More Info +--------- + +- `Sastrawi PHP Repository page`_ + +.. _stem: http://en.wikipedia.org/wiki/Stemming +.. _Sastrawi: https://github.com/sastrawi/sastrawi +.. _pip: https://docs.python.org/3.6/installing/index.html +.. _Sastrawi PHP Repository page: https://github.com/sastrawi/sastrawi + +.. |Build Status| image:: https://travis-ci.org/har07/PySastrawi.svg?branch=master + :target: https://travis-ci.org/har07/PySastrawi +.. |Coverage Status| image:: https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github + :target: https://coveralls.io/github/har07/sastrawi?branch=master diff --git a/Sastrawi.sln b/Sastrawi.sln index 65c9db8..0866570 100644 --- a/Sastrawi.sln +++ b/Sastrawi.sln @@ -7,6 +7,17 @@ Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "Sastrawi", "src\Sastrawi\Sa EndProject Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "SastrawiTest", "tests\SastrawiTest.pyproj", "{69199BE5-44C5-45C3-8B82-62F14DA2B9F1}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{0302964A-E17E-468E-8365-21827A654692}" + ProjectSection(SolutionItems) = preProject + .coveragerc = .coveragerc + .gitignore = .gitignore + .travis.yml = .travis.yml + README.md = README.md + README.rst = README.rst + setup.cfg = setup.cfg + setup.py = setup.py + EndProjectSection +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU diff --git a/release.sh b/release.sh new file mode 100755 index 0000000..f8fc1d8 --- /dev/null +++ b/release.sh @@ -0,0 +1,9 @@ +# generate disribution package +python -m pip install --user --upgrade setuptools wheel +python setup.py sdist bdist_wheel + +# upload distribution package +python3 -m pip install --user --upgrade twine +twine upload --repository-url https://test.pypi.org/legacy/ dist/* + +twine upload dist/* diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c34b498 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[bdist_wheel] +# This flag says that the code is written to work on both Python 2 and Python +# 3. If at all possible, it is good practice to do this. If you cannot, you +# will need to generate wheels for each Python version that you support. +universal=1 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d657037 --- /dev/null +++ b/setup.py @@ -0,0 +1,115 @@ +"""A setuptools based setup module. +See: +https://packaging.python.org/en/latest/distributing.html +https://github.com/pypa/sampleproject +""" + +# To use a consistent encoding +from codecs import open +from os import path + +# Always prefer setuptools over distutils +from setuptools import setup, find_packages + +# Get the long description from the README file +here = path.abspath(path.dirname(__file__)) +with open(path.join(here, 'README.rst'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='PySastrawi', + + # Versions should comply with PEP440. For a discussion on single-sourcing + # the version across setup.py and the project code, see + # https://packaging.python.org/en/latest/single_source_version.html + version='1.2.0', + + description='Library for stemming Indonesian (Bahasa) text', + long_description='Library for stemming Indonesian (Bahasa) text', + + # The project's main homepage. + url='https://github.com/har07/PySastrawi', + + # Author details + author='Hanif Amal Robbani', + author_email='dev.har07@gmail.com', + + # Choose your license + license='MIT', + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # How mature is this project? Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + 'Development Status :: 4 - Beta', + + # Indicate who your project is intended for + 'Intended Audience :: Information Technology', + 'Intended Audience :: Science/Research', + 'Topic :: Text Processing :: Linguistic', + + # Pick your license as you wish (should match "license" above) + 'License :: OSI Approved :: MIT License', + + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + ], + + # What does your project relate to? + keywords='linguistic stemming indonesian bahasa', + + # You can just specify the packages manually here if your project is + # simple. Or you can use find_packages(). + packages=find_packages('src', exclude=['contrib', 'docs', 'tests']), + # packages=["Sastrawi"], + package_dir={'': 'src'}, + + # Alternatively, if you want to distribute just a my_module.py, uncomment + # this: + # py_modules=["my_module"], + + # List run-time dependencies here. These will be installed by pip when + # your project is installed. For an analysis of "install_requires" vs pip's + # requirements files see: + # https://packaging.python.org/en/latest/requirements.html + # install_requires=['peppercorn'], + + # List additional groups of dependencies here (e.g. development + # dependencies). You can install these using the following syntax, + # for example: + # $ pip install -e .[dev,test] + # extras_require={ + # 'dev': ['check-manifest'], + # 'test': ['coverage'], + # }, + + # If there are data files included in your packages that need to be + # installed, specify them here. If using Python 2.6 or less, then these + # have to be included in MANIFEST.in as well. + package_data={ + '': ['data/*.txt'], + }, + + # Although 'package_data' is the preferred approach, in some case you may + # need to place data files outside of your packages. See: + # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa + # In this case, 'data_file' will be installed into '/my_data' + # data_files=[('my_data', ['data/data_file'])], + + # To provide executable scripts, use entry points in preference to the + # "scripts" keyword. Entry points provide cross-platform support and allow + # pip to create the appropriate form of executable for the target platform. + # entry_points={ + # 'console_scripts': [ + # 'sample=sample:main', + # ], + # }, +) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 5819a29..814cf6b 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,10 +1,17 @@ -class ArrayDictionary(object): +from Sastrawi.Dictionary.DictionaryInterface import DictionaryInterface + +class ArrayDictionary(DictionaryInterface): """description of class""" def __init__(self, words=None): - self.words = [] - if words: + if words is None: + self.words = {} + elif type(words) is dict: + self.words = words + elif type(words) is list: self.add_words(words) + else: + self.words = {} def contains(self, word): return word in self.words @@ -14,16 +21,10 @@ def count(self): def add_words(self, words): """Add multiple words to the dictionary""" - for word in words: - self.add(word) + self.words = dict(zip(words,words)) def add(self, word): """Add a word to the dictionary""" if not word or word.strip() == '': return - self.words.append(word) - - - - - + self.words[word] = word \ No newline at end of file diff --git a/src/Sastrawi/Dictionary/DictionaryInterface.py b/src/Sastrawi/Dictionary/DictionaryInterface.py index 3b2d037..46433b6 100644 --- a/src/Sastrawi/Dictionary/DictionaryInterface.py +++ b/src/Sastrawi/Dictionary/DictionaryInterface.py @@ -1,8 +1,12 @@ -class DictionaryInterface(object): - """description of class""" - - def contains(self, word): - pass +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 +from abc import ABCMeta, abstractmethod +class DictionaryInterface: + """Interface definition of dictionary""" + __metaclass__ = ABCMeta + @abstractmethod + def contains(self, word): + pass \ No newline at end of file diff --git a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py index 68b6f12..6dd3497 100644 --- a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py +++ b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py @@ -1,10 +1,11 @@ import re + class DisambiguatorPrefixRule24(object): """Disambiguate Prefix Rule 24 Rule 24 : perCAerV -> per-CAerV where C != 'r' """ - + def disambiguate(self, word): """Disambiguate Prefix Rule 24 Rule 24 : perCAerV -> per-CAerV where C != 'r' diff --git a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py index 2715c45..e67fbbc 100644 --- a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py +++ b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py @@ -2,13 +2,19 @@ class DisambiguatorPrefixRule27(object): """Disambiguate Prefix Rule 27 - Rule 27 : pen{c|d|j|z} -> pen-{c|d|j|z} + Rule 27 modified by Prasasto Adi : pen{c|d|j|s|t|z} -> pen-{c|d|j|s|t|z} + in order to stem penstabilan, pentranskripsi + + Original CS Rule 27 was : pen{c|d|j|z} -> pen-{c|d|j|z} """ def disambiguate(self, word): """Disambiguate Prefix Rule 27 - Rule 27 : pen{c|d|j|z} -> pen-{c|d|j|z} + Rule 27 modified by Prasasto Adi : pen{c|d|j|s|t|z} -> pen-{c|d|j|s|t|z} + in order to stem penstabilan, pentranskripsi + + Original CS Rule 27 was : pen{c|d|j|z} -> pen-{c|d|j|z} """ - matches = re.match(r'^pen([cdjz])(.*)$', word) + matches = re.match(r'^pen([cdjstz])(.*)$', word) if matches: return matches.group(1) + matches.group(2) diff --git a/src/Sastrawi/Sastrawi.pyproj b/src/Sastrawi/Sastrawi.pyproj index 8dcee03..3ec3716 100644 --- a/src/Sastrawi/Sastrawi.pyproj +++ b/src/Sastrawi/Sastrawi.pyproj @@ -27,6 +27,7 @@ + @@ -246,6 +247,10 @@ Code + + + + \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Cache/ArrayCache.py b/src/Sastrawi/Stemmer/Cache/ArrayCache.py deleted file mode 100644 index 35ff2b3..0000000 --- a/src/Sastrawi/Stemmer/Cache/ArrayCache.py +++ /dev/null @@ -1,19 +0,0 @@ -from Sastrawi.Stemmer.Cache.CacheInterface import CacheInterface - -class ArrayCache(CacheInterface): - """description of class""" - - def __init__(self): - self.data = {} - - def set(self, key, value): - self.data[key] = value - - def get(self, key): - if key in self.data: - return self.data[key] - - def has(self, key): - return key in self.data - - diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py deleted file mode 100644 index cbed596..0000000 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ /dev/null @@ -1,13 +0,0 @@ -class CacheInterface(object): - """description of class""" - - def has(self, key): - pass - - def set(self, key, value): - pass - - def get(self, key): - pass - - diff --git a/src/Sastrawi/Stemmer/Cache/__init__.py b/src/Sastrawi/Stemmer/Cache/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py deleted file mode 100644 index 97258bc..0000000 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ /dev/null @@ -1,27 +0,0 @@ -#from Sastrawi.Stemmer.StemmerInterface import StemmerInterface -from Sastrawi.Stemmer.Filter import TextNormalizer - -class CachedStemmer(object): - """description of class""" - def __init__(self, cache, delegatedStemmer): - self.cache = cache - self.delegatedStemmer = delegatedStemmer - - def stem(self, text): - normalizedText = TextNormalizer.normalize_text(text) - - words = normalizedText.split(' ') - stems = [] - - for word in words: - if self.cache.has(word): - stems.append(self.cache.get(word)) - else: - stem = self.delegatedStemmer.stem(word) - self.cache.set(word, stem) - stems.append(stem) - - return ' '.join(stems) - - def get_cache(self): - return self.cache diff --git a/src/Sastrawi/Stemmer/Context/Context.py b/src/Sastrawi/Stemmer/Context/Context.py index 18223ee..6667714 100644 --- a/src/Sastrawi/Stemmer/Context/Context.py +++ b/src/Sastrawi/Stemmer/Context/Context.py @@ -160,5 +160,4 @@ def restore_prefix(self): for removal in self.removals: if removal.get_affix_type() == 'DP': - self.removals.remove(removal) - + self.removals.remove(removal) \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Context/ContextInterface.py b/src/Sastrawi/Stemmer/Context/ContextInterface.py index 180c6d0..5a3b7be 100644 --- a/src/Sastrawi/Stemmer/Context/ContextInterface.py +++ b/src/Sastrawi/Stemmer/Context/ContextInterface.py @@ -1,30 +1,40 @@ -class ContextInterface(object): - """description of class""" +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 +from abc import ABCMeta, abstractmethod + +class ContextInterface: + """description of abs class""" + __metaclass__ = ABCMeta + + @abstractmethod def getOriginalWord(self): pass + @abstractmethod def setCurrentWord(self, word): pass + @abstractmethod def getCurrentWord(self): pass + @abstractmethod def getDictionary(self): pass + @abstractmethod def stopProcess(self): pass + @abstractmethod def processIsStopped(self): pass + @abstractmethod def addRemoval(self, removal): pass + @abstractmethod def getRemovals(self): - pass - - - - + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Context/Removal.py b/src/Sastrawi/Stemmer/Context/Removal.py index 0cdfed8..e11c9c0 100644 --- a/src/Sastrawi/Stemmer/Context/Removal.py +++ b/src/Sastrawi/Stemmer/Context/Removal.py @@ -1,6 +1,6 @@ from Sastrawi.Stemmer.Context.RemovalInterface import RemovalInterface -class Removal(object): +class Removal(RemovalInterface): """description of class""" def __init__(self, visitor, subject, result, removedPart, affixType): diff --git a/src/Sastrawi/Stemmer/Context/RemovalInterface.py b/src/Sastrawi/Stemmer/Context/RemovalInterface.py index 93b6171..a94a18f 100644 --- a/src/Sastrawi/Stemmer/Context/RemovalInterface.py +++ b/src/Sastrawi/Stemmer/Context/RemovalInterface.py @@ -1,18 +1,29 @@ -class RemovalInterface(object): +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + +from abc import ABCMeta, abstractmethod + +class RemovalInterface: """description of class""" + __metaclass__ = ABCMeta + @abstractmethod def get_visitor(self): pass + @abstractmethod def get_subject(self): pass + @abstractmethod def get_result(self): pass + @abstractmethod def get_removed_part(self): pass + @abstractmethod def get_affix_type(self): pass diff --git a/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py b/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py index 0ce6b33..be4aae6 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py @@ -86,8 +86,8 @@ def init_visitors(self): self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule15a(), DisambiguatorPrefixRule15b()])) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule16()])) - disambiguators17 = [DisambiguatorPrefixRule17a(), DisambiguatorPrefixRule17b(), \ - DisambiguatorPrefixRule17c(), DisambiguatorPrefixRule17d()] + disambiguators17 = [DisambiguatorPrefixRule17a(), DisambiguatorPrefixRule17b(), DisambiguatorPrefixRule17c(), + DisambiguatorPrefixRule17d()] self.prefix_pisitors.append(PrefixDisambiguator(disambiguators17)) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule18a(), DisambiguatorPrefixRule18b()])) @@ -102,8 +102,7 @@ def init_visitors(self): self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule28a(), DisambiguatorPrefixRule28b()])) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule29()])) - disambiguators30 = [DisambiguatorPrefixRule30a(), DisambiguatorPrefixRule30b(), \ - DisambiguatorPrefixRule30c()] + disambiguators30 = [DisambiguatorPrefixRule30a(), DisambiguatorPrefixRule30b(), DisambiguatorPrefixRule30c()] self.prefix_pisitors.append(PrefixDisambiguator(disambiguators30)) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule31a(), DisambiguatorPrefixRule31b()])) diff --git a/src/Sastrawi/Stemmer/Filter/TextNormalizer.py b/src/Sastrawi/Stemmer/Filter/TextNormalizer.py index 2d72e27..8cad4a7 100644 --- a/src/Sastrawi/Stemmer/Filter/TextNormalizer.py +++ b/src/Sastrawi/Stemmer/Filter/TextNormalizer.py @@ -1,7 +1,7 @@ import re def normalize_text(text): - result = str.lower(text) + result = text.lower() #lower the text even unicode given result = re.sub(r'[^a-z0-9 -]', ' ', result, flags = re.IGNORECASE|re.MULTILINE) result = re.sub(r'( +)', ' ', result, flags = re.IGNORECASE|re.MULTILINE) diff --git a/src/Sastrawi/Stemmer/Stemmer.py b/src/Sastrawi/Stemmer/Stemmer.py index cb196c7..1f3135e 100644 --- a/src/Sastrawi/Stemmer/Stemmer.py +++ b/src/Sastrawi/Stemmer/Stemmer.py @@ -2,6 +2,7 @@ from Sastrawi.Stemmer.Context.Visitor.VisitorProvider import VisitorProvider from Sastrawi.Stemmer.Filter import TextNormalizer from Sastrawi.Stemmer.Context.Context import Context +from cachetools import cached, LRUCache class Stemmer(object): """Indonesian Stemmer. @@ -35,6 +36,16 @@ def stem_word(self, word): else: return self.stem_singular_word(word) + # Stemming word in Tokens + # @author Mufid Jamaluddin + def stem_tokens(self, tokens): + stemmed_tokens = [] + for token in tokens: + if not token or token.strip() == '': + continue + stemmed_tokens.append(self.stem_word(token)) + return stemmed_tokens + def is_plural(self, word): #-ku|-mu|-nya #nikmat-Ku, etc diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index f01f80f..dd16bb5 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -1,43 +1,37 @@ import os +from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer -from Sastrawi.Stemmer.CachedStemmer import CachedStemmer -from Sastrawi.Stemmer.Cache.ArrayCache import ArrayCache class StemmerFactory(object): """ Stemmer factory helps creating pre-configured stemmer """ - APC_KEY = 'sastrawi_cache_dictionary' def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ + if isDev: + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + else: + dictionary = self.get_prod_words_dictionary() - words = self.get_words(isDev) - dictionary = ArrayDictionary(words) stemmer = Stemmer(dictionary) - resultCache = ArrayCache() - cachedStemmer = CachedStemmer(resultCache, stemmer) - - return cachedStemmer + return stemmer - def get_words(self, isDev=False): - #if isDev or callable(getattr(self, 'apc_fetch')): - # words = self.getWordsFromFile() - #else: - # words = apc_fetch(self.APC_KEY) - # if not words: - # words = self.getWordsFromFile() - # apc_store(self.APC_KEY, words) - return self.get_words_from_file() + @cached(cache=LRUCache(maxsize=32)) + def get_prod_words_dictionary(self): + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + return dictionary def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) - dictionaryFile = current_dir + '/../../../data/kata-dasar.txt' + dictionaryFile = current_dir + '/data/kata-dasar.txt' + if not os.path.isfile(dictionaryFile): raise RuntimeError('Dictionary file is missing. It seems that your installation is corrupted.') - dictionaryContent = '' + text = '' with open(dictionaryFile, 'r') as f: - dictionaryContent = f.read() - - return dictionaryContent.split('\n') \ No newline at end of file + text = f.read() + return text.split('\n') \ No newline at end of file diff --git a/data/kata-dasar.original.txt b/src/Sastrawi/Stemmer/data/kata-dasar.original.txt similarity index 100% rename from data/kata-dasar.original.txt rename to src/Sastrawi/Stemmer/data/kata-dasar.original.txt diff --git a/data/kata-dasar.txt b/src/Sastrawi/Stemmer/data/kata-dasar.txt similarity index 99% rename from data/kata-dasar.txt rename to src/Sastrawi/Stemmer/data/kata-dasar.txt index f46c3bc..9ebe9fb 100644 --- a/data/kata-dasar.txt +++ b/src/Sastrawi/Stemmer/data/kata-dasar.txt @@ -29929,4 +29929,4 @@ zulmat zulu zurafah zuriah -zus +zus \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index b3b2f25..a5bbd3e 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -10,12 +10,12 @@ def get_dictionary(self): def remove(self, text): """Remove stop words.""" words = text.split(' ') - for word in words: - if self.dictionary.contains(word): - words.remove(word) - - return ' '.join(words) - - + stopped_words = [word for word in words if not self.dictionary.contains(word)] + return ' '.join(stopped_words) + # Remove Stopword in Tokens + # @author Mufid Jamaluddin + def remove_tokens(self, tokens): + clean_tokens = [token for token in tokens if not self.dictionary.contains(token)] + return clean_tokens \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 5b35049..668ed94 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -1,32 +1,34 @@ +import os +from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover class StopWordRemoverFactory(object): """description of class""" - def create_stop_word_remover(self): - stopWords = self.get_stop_words() - dictionary = ArrayDictionary(stopWords) - stopWordRemover = StopWordRemover(dictionary) + def create_stop_word_remover(self, isDev=False): + if isDev: + stopWords = self.get_stop_words() + dictionary = ArrayDictionary(stopWords) + else: + dictionary = self.get_prod_stop_word_dictionary() + stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - def get_stop_words(self): - return ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', - 'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'tidak', 'ini', 'karena', - 'kepada', 'oleh', 'saat', 'harus', 'sementara', 'setelah', 'belum', 'kami', 'sekitar', - 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', - 'itu', 'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', - 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'anda', - 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', - 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', - 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'sementara', 'tetapi', 'apakah', - 'kecuali', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya', 'tanpa', 'agak', 'boleh', - 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'tapi', 'ingin', - 'juga', 'nggak', 'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', - 'setiap', 'setidaknya', 'sesuatu', 'pasti', 'saja', 'toh', 'ya', 'walau', 'tolong', - 'tentu', 'amat', 'apalagi', 'bagaimanapun'] - + @cached(cache=LRUCache(maxsize=8)) + def get_prod_stop_word_dictionary(self): + stopWords = self.get_stop_words() + return ArrayDictionary(stopWords) + def get_stop_words(self): + current_dir = os.path.dirname(os.path.realpath(__file__)) + dictionaryFile = current_dir + '/data/stopword_tala_2003.txt' + if not os.path.isfile(dictionaryFile): + raise RuntimeError('Stopword file is missing. It seems that your installation is corrupted.') + text = '' + with open(dictionaryFile, 'r') as f: + text = f.read() + return text.split('\n') \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt b/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt new file mode 100644 index 0000000..bf88a45 --- /dev/null +++ b/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt @@ -0,0 +1,758 @@ +ada +adalah +adanya +adapun +agak +agaknya +agar +akan +akankah +akhir +akhiri +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +antara +antaranya +apa +apaan +apabila +apakah +apalagi +apatah +artinya +asal +asalkan +atas +atau +ataukah +ataupun +awal +awalnya +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagi +bagian +bahkan +bahwa +bahwasanya +baik +bakal +bakalan +balik +banyak +bapak +baru +bawah +beberapa +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +bekerja +belakang +belakangan +belum +belumlah +benar +benarkah +benarlah +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti +berawal +berbagai +berdatangan +beri +berikan +berikut +berikutnya +berjumlah +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan +berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermaksud +bermula +bersama +bersama-sama +bersiap +bersiap-siap +bertanya +bertanya-tanya +berturut +berturut-turut +bertutur +berujar +berupa +besar +betul +betulkah +biasa +biasanya +bila +bilakah +bisa +bisakah +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +bulan +bung +cara +caranya +cukup +cukupkah +cukuplah +cuma +dahulu +dalam +dan +dapat +dari +daripada +datang +dekat +demi +demikian +demikianlah +dengan +depan +di +dia +diakhiri +diakhirinya +dialah +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +didapat +didatangkan +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +dini +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +diri +dirinya +disampaikan +disebut +disebutkan +disebutkannya +disini +disinilah +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya +ditunjuknya +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +dong +dua +dulu +empat +enggak +enggaknya +entah +entahlah +guna +gunakan +hal +hampir +hanya +hanyalah +hari +harus +haruslah +harusnya +hendak +hendaklah +hendaknya +hingga +ia +ialah +ibarat +ibaratkan +ibaratnya +ibu +ikut +ingat +ingat-ingat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jadi +jadilah +jadinya +jangan +jangankan +janganlah +jauh +jawab +jawaban +jawabnya +jelas +jelaskan +jelaslah +jelasnya +jika +jikalau +juga +jumlah +jumlahnya +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +karena +karenanya +kasus +kata +katakan +katakanlah +katanya +ke +keadaan +kebetulan +kecil +kedua +keduanya +keinginan +kelamaan +kelihatan +kelihatannya +kelima +keluar +kembali +kemudian +kemungkinan +kemungkinannya +kenapa +kepada +kepadanya +kesampaian +keseluruhan +keseluruhannya +keterlaluan +ketika +khususnya +kini +kinilah +kira +kira-kira +kiranya +kita +kitalah +kok +kurang +lagi +lagian +lah +lain +lainnya +lalu +lama +lamanya +lanjut +lanjutnya +lebih +lewat +lima +luar +macam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +mau +maupun +melainkan +melakukan +melalui +melihat +melihatnya +memang +memastikan +memberi +memberikan +membuat +memerlukan +memihak +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +menaiki +menambahkan +menandaskan +menanti +menanti-nanti +menantikan +menanya +menanyai +menanyakan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +mengakhiri +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggunakan +menghendaki +mengibaratkan +mengibaratkannya +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengungkapkan +menjadi +menjawab +menjelaskan +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyampaikan +menyangkut +menyatakan +menyebutkan +menyeluruh +menyiapkan +merasa +mereka +merekalah +merupakan +meski +meskipun +meyakini +meyakinkan +minta +mirip +misal +misalkan +misalnya +mula +mulai +mulailah +mulanya +mungkin +mungkinkah +nah +naik +namun +nanti +nantinya +nyaris +nyatanya +oleh +olehnya +pada +padahal +padanya +pak +paling +panjang +pantas +para +pasti +pastilah +penting +pentingnya +per +percuma +perlu +perlukah +perlunya +pernah +persoalan +pertama +pertama-tama +pertanyaan +pertanyakan +pihak +pihaknya +pukul +pula +pun +punya +rasa +rasanya +rata +rupanya +saat +saatnya +saja +sajalah +saling +sama +sama-sama +sambil +sampai +sampai-sampai +sampaikan +sana +sangat +sangatlah +satu +saya +sayalah +se +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebaik +sebaik-baiknya +sebaiknya +sebaliknya +sebanyak +sebegini +sebegitu +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +secukupnya +sedang +sedangkan +sedemikian +sedikit +sedikitnya +seenaknya +segala +segalanya +segera +seharusnya +sehingga +seingat +sejak +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekarang +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekurang-kurangnya +sekurangnya +sela +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +seluruh +seluruhnya +semacam +semakin +semampu +semampunya +semasa +semasih +semata +semata-mata +semaunya +sementara +semisal +semisalnya +sempat +semua +semuanya +semula +sendiri +sendirian +sendirinya +seolah +seolah-olah +seorang +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sering +seringnya +serta +serupa +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiba +setibanya +setidak-tidaknya +setidaknya +setinggi +seusai +sewaktu +siap +siapa +siapakah +siapapun +sini +sinilah +soal +soalnya +suatu +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tahu +tahun +tak +tambah +tambahnya +tampak +tampaknya +tandas +tandasnya +tanpa +tanya +tanyakan +tanyanya +tapi +tegas +tegasnya +telah +tempat +tengah +tentang +tentu +tentulah +tentunya +tepat +terakhir +terasa +terbanyak +terdahulu +terdapat +terdiri +terhadap +terhadapnya +teringat +teringat-ingat +terjadi +terjadilah +terjadinya +terkira +terlalu +terlebih +terlihat +termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama +tetap +tetapi +tiap +tiba +tiba-tiba +tidak +tidakkah +tidaklah +tiga +tinggi +toh +tunjuk +turut +tutur +tuturnya +ucap +ucapnya +ujar +ujarnya +umum +umumnya +ungkap +ungkapnya +untuk +usah +usai +waduh +wah +wahai +waktu +waktunya +walau +walaupun +wong +yaitu +yakin +yakni +yang \ No newline at end of file diff --git a/tests/FunctionalTests/Stemmer/stemmer_test.py b/tests/FunctionalTests/Stemmer/stemmer_test.py index cc21eb3..8609fb7 100644 --- a/tests/FunctionalTests/Stemmer/stemmer_test.py +++ b/tests/FunctionalTests/Stemmer/stemmer_test.py @@ -13,7 +13,9 @@ def setUp(self): 'bangun', 'fitnah', 'vonis', 'baru', 'ajar', 'tangkap', 'kupas', - 'minum', 'pukul', 'cinta', 'dua', 'jauh', 'ziarah', 'nuklir', 'gila', 'hajar', 'qasar', 'udara', + 'minum', 'pukul', + 'cinta', 'dua', 'dahulu', 'jauh', 'jarah', 'ziarah', + 'nuklir', 'nasihat', 'gila', 'hajar', 'qasar', 'udara', 'populer', 'warna', 'yoga', 'adil', 'rumah', 'muka', 'labuh', 'tarung', 'tebar', 'indah', 'daya', 'untung', 'sepuluh', 'ekonomi', 'makmur', 'telah', 'serta', 'percaya', 'pengaruh', 'kritik', 'seko', 'sekolah', 'tahan', 'capa', 'capai', @@ -22,7 +24,8 @@ def setUp(self): 'sembunyi', 'langgan', 'laku', 'baik', 'terang', 'iman', 'bisik', 'taat', 'puas', 'makan', 'nyala', 'nyanyi', 'nyata', 'nyawa', 'rata', 'lembut', 'ligas', 'budaya', 'karya', 'ideal', 'final', - 'taat', 'tiru', 'sepak', 'kuasa', 'malaikat', 'nikmat', # sastrawi additional rules + # sastrawi additional rules + 'taat', 'tiru', 'sepak', 'kuasa', 'malaikat', 'nikmat', 'stabil', 'transkripsi', 'lewat', 'nganga', 'allah', ] ) @@ -73,7 +76,8 @@ def get_test_data(self): data.append(['kesakitan', 'sakit']) data.append(['sesuap', 'suap']) - #data.append(['teriakanmu', 'teriak' # wtf? kok jadi teria?]) + #data.append(['teriakanmu', 'teriak']) # wtf? kok jadi ria? + #teriakanmu -> te-ria-kan-mu # template formulas for derivation prefix rules (disambiguation) # @@ -123,8 +127,8 @@ def get_test_data(self): data.append(['memvonis', 'vonis']) # rule 12 : mempe{r|l} -> mem-pe - data.append(['memperbaru', 'baru']) - data.append(['mempelajar', 'ajar']) + data.append(['memperbarui', 'baru']) + data.append(['mempelajari', 'ajar']) # rule 13a : mem{rV|V} -> mem{rV|V} data.append(['meminum', 'minum']) @@ -191,14 +195,13 @@ def get_test_data(self): data.append(['pemukul', 'pukul']) # rule 27 : men{c|d|j|z} -> men-{c|d|j|z} - # TODO : should find more relevant examples data.append(['pencinta', 'cinta']) - data.append(['pendua', 'dua']) - data.append(['penjauh', 'jauh']) + data.append(['pendahulu', 'dahulu']) + data.append(['penjarah', 'jarah']) data.append(['penziarah', 'ziarah']) # rule 28a : pen{V} -> pe-n{V} - data.append(['penuklir', 'nuklir']) + data.append(['penasihat', 'nasihat']) # rule 28b : pen{V} -> pe-t{V} data.append(['penangkap', 'tangkap']) @@ -219,7 +222,7 @@ def get_test_data(self): # rule 32 : pelV -> pe-lV except pelajar -> ajar data.append(['pelajar', 'ajar']) - data.append(['pelabuh', 'labuh']) + data.append(['pelabuhan', 'labuh']) # rule 33 : peCerV -> per-erV where C != {r|w|y|l|m|n} # TODO : find the examples @@ -325,8 +328,8 @@ def get_test_data(self): data.append(['menahan', 'tahan']) # test stem multiple sentences - multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.'; - multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai."; + multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.' + multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai." data.append([multipleSentence1, 'cinta telah tebar dua saling cinta']) data.append([multipleSentence2, 'cinta telah tebar dua saling cinta']) @@ -349,6 +352,9 @@ def get_test_data(self): data.append(['finalisasi', 'final']) # sastrawi additional rules + data.append(['penstabilan', 'stabil']) + data.append(['pentranskripsi', 'transkripsi']) + data.append(['mentaati', 'taat']) data.append(['meniru-nirukan', 'tiru']) data.append(['menyepak-nyepak', 'sepak']) diff --git a/tests/SastrawiTest.pyproj b/tests/SastrawiTest.pyproj index 736bb33..2ff9ee5 100644 --- a/tests/SastrawiTest.pyproj +++ b/tests/SastrawiTest.pyproj @@ -68,10 +68,6 @@ - - - - \ No newline at end of file diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 57d72fc..42c5580 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -36,5 +36,22 @@ def test_constructor_preserve_words(self): self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) + # Test ArrayDictionary dengan tipe data dict + # @author Mufid Jamaluddin + def test_dict_param(self): + dictionary = ArrayDictionary({'word1':'word1', 'word2':'word2'}) + self.assertTrue(dictionary.contains('word1')) + self.assertTrue(dictionary.contains('word2')) + self.assertFalse(dictionary.contains('word3')) + self.assertEqual(2, dictionary.count()) + dictionary.add('word3') + dictionary.add(' ') + self.assertTrue(dictionary.contains('word3')) + self.assertEqual(3, dictionary.count()) + + def test_non_dict_list(self): + dictionary = ArrayDictionary('$$%&**&(^&') + self.assertEqual(0, dictionary.count()) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 146df28..9df96b7 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -1,4 +1,5 @@ import unittest +import time from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.Stemmer.Stemmer import Stemmer @@ -23,9 +24,55 @@ def test_fungsional(self): if output != expected: raise AssertionError(str.format('output is {} instead of {}', output, expected)) + # Test Waktu Stemming < 3 detik + # @author Mufid Jamaluddin + def test_execution_time(self): + start = time.time() + sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' + + factory = StemmerFactory() + stemmer = factory.create_stemmer() + stemmer.stem(sentence) + + end = time.time() + + execution_time = end - start + self.assertTrue(execution_time < 3) + def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() + # Test Stemming per Kata + # @author Mufid Jamaluddin + def test_word_stemming(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer() + self.assertEqual('besar', stemmer.stem('terbesar')) + self.assertEqual('abai', stemmer.stem('diabaikan')) + + # Test Stemming dengan isDev=True (No Cache) + # @author Mufid Jamaluddin + def test_word_stemmingdev(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer(isDev=True) + self.assertEqual('besar', stemmer.stem('terbesar')) + self.assertEqual('abai', stemmer.stem('diabaikan')) + + # Test Stemming dengan list tokens + # @author Mufid Jamaluddin + def test_tokens_stemming(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer() + tokens = ['perekonomian', '', 'indonesia', 'sedang', ' ', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] + clean_tokens = stemmer.stem_tokens(tokens) + self.assertEqual('ekonomi', clean_tokens[0]) + self.assertEqual('indonesia', clean_tokens[1]) + self.assertEqual('sedang', clean_tokens[2]) + self.assertEqual('dalam', clean_tokens[3]) + self.assertEqual('tumbuh', clean_tokens[4]) + self.assertEqual('yang', clean_tokens[5]) + self.assertEqual('bangga', clean_tokens[6]) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 5f758f8..8c82336 100644 --- a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -1,4 +1,5 @@ import unittest +import time from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover @@ -9,6 +10,33 @@ def setUp(self): def test_createStopWordRemover(self): self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover) + + def test_stopwordRemoval(self): + sremover = self.factory.create_stop_word_remover() + self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) + self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) + + def test_tokens_stopwordRemoval(self): + tokens = ['pergi', 'ke', 'sekolah', 'yang', 'bagus', 'adalah', 'impian'] + sremover = self.factory.create_stop_word_remover() + clean_tokens = sremover.remove_tokens(tokens) + text = ' '.join(clean_tokens) + self.assertEquals('pergi sekolah bagus impian', text) + self.assertEqual('pergi', clean_tokens[0]) + self.assertEqual('sekolah', clean_tokens[1]) + self.assertEqual('bagus', clean_tokens[2]) + self.assertEqual('impian', clean_tokens[3]) + + def test_execution_time(self): + start = time.time() + sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' + sremover = self.factory.create_stop_word_remover() + sremover.remove(sentence) + end = time.time() + # print(execution_time) + execution_time = end - start + + self.assertTrue(execution_time < 1) if __name__ == '__main__': unittest.main()