From ee5dcdec6062294ad56eebceb4d44065fbc278c2 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 09:48:19 +0700 Subject: [PATCH 01/45] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c763074..7b533f7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: - "3.4" - "3.5" sudo: false -script: nosetests tests --verbose -with-coverage +script: nosetests tests --verbose --with-coverage notifications: email: - dev.har07@gmail.com From 92068528d3fcf9d0ab0e2f21cf45a990449e6f0b Mon Sep 17 00:00:00 2001 From: har07 Date: Sat, 16 Jan 2016 14:13:09 +0700 Subject: [PATCH 02/45] add files for package distribution, according to : http://python-packaging-user-guide.readthedocs.org/en/latest/distributing/ --- README.md | 44 +++++++++++++++ Sastrawi.sln | 7 +++ setup.cfg | 5 ++ setup.py | 113 ++++++++++++++++++++++++++++++++++++++ tests/SastrawiTest.pyproj | 4 -- 5 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 README.md create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..c84740c --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +Sastrawi +========= + +Sastrawi is a simple Python library which allows you to reduce inflected words in Indonesian Language (Bahasa Indonesia) to their base form ([stem](http://en.wikipedia.org/wiki/Stemming)). +This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastrawi) project written in PHP. 
+ + +| Master | +| ------ | +|[![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) | + +Cara Install +------------- + +Sastrawi dapat diinstall menggunakan [pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install Sastrawi` + +Penggunaan +----------- + +Jalankan baris-baris kode berikut di *Python interactive terminal* : + +```python +# import Sastrawi package +from Sastrawi.Stemmer.StemmerFactory import StemmerFactory + +# create stemmer +factory = StemmerFactory() +stemmer = factory.create_stemmer() + +# stem +sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan' +output = stemmer.stem(sentence) + +print(output) +# ekonomi indonesia sedang dalam tumbuh yang bangga + +print(stemmer.stem('Mereka meniru-nirukannya')) +# mereka tiru +``` + +Informasi Lebih Lanjut +---------------------- + +- [Sastrawi PHP Repository page](https://github.com/sastrawi/sastrawi) \ No newline at end of file diff --git a/Sastrawi.sln b/Sastrawi.sln index 65c9db8..5327f28 100644 --- a/Sastrawi.sln +++ b/Sastrawi.sln @@ -7,6 +7,13 @@ Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "Sastrawi", "src\Sastrawi\Sa EndProject Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "SastrawiTest", "tests\SastrawiTest.pyproj", "{69199BE5-44C5-45C3-8B82-62F14DA2B9F1}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{0302964A-E17E-468E-8365-21827A654692}" + ProjectSection(SolutionItems) = preProject + README.md = README.md + setup.cfg = setup.cfg + setup.py = setup.py + EndProjectSection +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c34b498 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[bdist_wheel] +# This flag says that the code is written to work on both 
Python 2 and Python +# 3. If at all possible, it is good practice to do this. If you cannot, you +# will need to generate wheels for each Python version that you support. +universal=1 \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c10a58f --- /dev/null +++ b/setup.py @@ -0,0 +1,113 @@ +"""A setuptools based setup module. +See: +https://packaging.python.org/en/latest/distributing.html +https://github.com/pypa/sampleproject +""" + +# Always prefer setuptools over distutils +from setuptools import setup, find_packages +# To use a consistent encoding +from codecs import open +from os import path + +#here = path.abspath(path.dirname(__file__)) + +# Get the long description from the README file +#with open(path.join(here, 'README.rst'), encoding='utf-8') as f: +# long_description = f.read() + +setup( + name='Sastrawi', + + # Versions should comply with PEP440. For a discussion on single-sourcing + # the version across setup.py and the project code, see + # https://packaging.python.org/en/latest/single_source_version.html + version='1.0.0', + + description='Library for stemming Indonesian (Bahasa) text', + long_description='Library for stemming Indonesian (Bahasa) text', + + # The project's main homepage. + url='https://github.com/har07/sastrawi', + + # Author details + author='Hanif', + author_email='dev.har07@gmail.com', + + # Choose your license + license='MIT', + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # How mature is this project? Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + 'Development Status :: 4 - Beta', + + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Topic :: Natural Language Processing :: Stemming', + + # Pick your license as you wish (should match "license" above) + 'License :: OSI Approved :: MIT License', + + # Specify the Python versions you support here. 
In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + ], + + # What does your project relate to? + keywords='nlp stemming indonesia indonesian', + + # You can just specify the packages manually here if your project is + # simple. Or you can use find_packages(). + packages=find_packages(exclude=['contrib', 'docs', 'tests']), + + # Alternatively, if you want to distribute just a my_module.py, uncomment + # this: + # py_modules=["my_module"], + + # List run-time dependencies here. These will be installed by pip when + # your project is installed. For an analysis of "install_requires" vs pip's + # requirements files see: + # https://packaging.python.org/en/latest/requirements.html + #install_requires=['peppercorn'], + + # List additional groups of dependencies here (e.g. development + # dependencies). You can install these using the following syntax, + # for example: + # $ pip install -e .[dev,test] + #extras_require={ + # 'dev': ['check-manifest'], + # 'test': ['coverage'], + #}, + + # If there are data files included in your packages that need to be + # installed, specify them here. If using Python 2.6 or less, then these + # have to be included in MANIFEST.in as well. + #package_data={ + # 'sample': ['package_data.dat'], + #}, + + # Although 'package_data' is the preferred approach, in some case you may + # need to place data files outside of your packages. 
See: + # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa + # In this case, 'data_file' will be installed into '/my_data' + #data_files=[('my_data', ['data/data_file'])], + + # To provide executable scripts, use entry points in preference to the + # "scripts" keyword. Entry points provide cross-platform support and allow + # pip to create the appropriate form of executable for the target platform. + #entry_points={ + # 'console_scripts': [ + # 'sample=sample:main', + # ], + #}, +) \ No newline at end of file diff --git a/tests/SastrawiTest.pyproj b/tests/SastrawiTest.pyproj index 736bb33..2ff9ee5 100644 --- a/tests/SastrawiTest.pyproj +++ b/tests/SastrawiTest.pyproj @@ -68,10 +68,6 @@ - - - - \ No newline at end of file From 42b5c4dc61ebc639a095ba8165447ea81490af0a Mon Sep 17 00:00:00 2001 From: har07 Date: Sat, 16 Jan 2016 19:32:35 +0700 Subject: [PATCH 03/45] config files for travis-ci, coveralls, pypi/setup --- .coveragerc | 5 +++++ .travis.yml | 10 +++++++--- Sastrawi.sln | 3 +++ setup.py | 4 ++-- 4 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..b2ddda4 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,5 @@ +[report] +omit = + */python?.?/* + */site-packages/nose/* + *__init__* \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 2436dcc..4a0c03b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,11 @@ python: - "3.4" - "3.5" sudo: false -script: nosetests tests --verbose +install: + - pip install python-coveralls + - pip install coveralls +script: nosetests tests --verbose --with-coverage +after_success: + - coveralls notifications: - email: - - dev.har07@gmail.com + email: false diff --git a/Sastrawi.sln b/Sastrawi.sln index 5327f28..3f3f5fe 100644 --- a/Sastrawi.sln +++ b/Sastrawi.sln @@ -9,6 +9,9 @@ Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "SastrawiTest", "tests\Sastr EndProject 
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{0302964A-E17E-468E-8365-21827A654692}" ProjectSection(SolutionItems) = preProject + .coveragerc = .coveragerc + .gitignore = .gitignore + .travis.yml = .travis.yml README.md = README.md setup.cfg = setup.cfg setup.py = setup.py diff --git a/setup.py b/setup.py index c10a58f..825a8be 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ url='https://github.com/har07/sastrawi', # Author details - author='Hanif', + author='Hanif Amal Robbani', author_email='dev.har07@gmail.com', # Choose your license @@ -47,7 +47,7 @@ # Indicate who your project is intended for 'Intended Audience :: Developers', - 'Topic :: Natural Language Processing :: Stemming', + 'Topic :: Text Processing :: Linguistic', # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: MIT License', From 11a4d88561fbe1a1aab324baf945c5144383e21d Mon Sep 17 00:00:00 2001 From: har07 Date: Sat, 16 Jan 2016 19:56:46 +0700 Subject: [PATCH 04/45] update readme --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c84740c..99f8770 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,13 @@ Sastrawi is a simple Python library which allows you to reduce inflected words i This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastrawi) project written in PHP. 
-| Master | -| ------ | -|[![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) | +[![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) +[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) Cara Install ------------- -Sastrawi dapat diinstall menggunakan [pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install Sastrawi` +Sastrawi dapat di*install* menggunakan [pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install Sastrawi` Penggunaan ----------- From 4d35327440a91c062ca7ccf63fc45c962e6922db Mon Sep 17 00:00:00 2001 From: har07 Date: Sat, 16 Jan 2016 20:02:17 +0700 Subject: [PATCH 05/45] remove support for Python 3.2 due to build error --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9e6157f..65ec28e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python python: - "2.7" - - "3.2" - "3.3" - "3.4" - "3.5" From 99eba63b2e519df498d691a8296b228d34b2772e Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 20:13:50 +0700 Subject: [PATCH 06/45] fix typo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 99f8770..329da81 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastr Cara Install ------------- -Sastrawi dapat di*install* menggunakan [pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install Sastrawi` +Sastrawi dapat di-*install* menggunakan 
[pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install Sastrawi` Penggunaan ----------- @@ -40,4 +40,4 @@ print(stemmer.stem('Mereka meniru-nirukannya')) Informasi Lebih Lanjut ---------------------- -- [Sastrawi PHP Repository page](https://github.com/sastrawi/sastrawi) \ No newline at end of file +- [Sastrawi PHP Repository page](https://github.com/sastrawi/sastrawi) From ecf2c110d99f342838dbd65a53a76982085529a0 Mon Sep 17 00:00:00 2001 From: har07 Date: Sun, 17 Jan 2016 09:48:37 +0700 Subject: [PATCH 07/45] PyPI/pip badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 99f8770..f697d1f 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastr [![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) [![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) +[![PyPI version](https://badge.fury.io/py/Sastrawi.svg)](https://badge.fury.io/py/Sastrawi) Cara Install ------------- From 274223a9b77fabfe5fb6e308cd89fb48747f6f05 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Mon, 18 Jan 2016 09:59:59 +0700 Subject: [PATCH 08/45] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fc07099..d38e0bd 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -Sastrawi -========= +Sastrawi Python +=============== -Sastrawi is a simple Python library which allows you to reduce inflected words in Indonesian Language (Bahasa Indonesia) to their base form ([stem](http://en.wikipedia.org/wiki/Stemming)). -This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastrawi) project written in PHP. 
+Sastrawi Python is a simple python library which allows you to reduce inflected words in Indonesian Language (Bahasa Indonesia) to their base form ([stem](http://en.wikipedia.org/wiki/Stemming)). +This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastrawi) project written in PHP (credits goes to the original author and contributors of Sastrawi PHP). [![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) @@ -20,14 +20,14 @@ Penggunaan Jalankan baris-baris kode berikut di *Python interactive terminal* : ```python -# import Sastrawi package +# import StemmerFactory class from Sastrawi.Stemmer.StemmerFactory import StemmerFactory # create stemmer factory = StemmerFactory() stemmer = factory.create_stemmer() -# stem +# stemming process sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan' output = stemmer.stem(sentence) From 7e212475ff02cdcbf747b395dd1e9f1db53b6472 Mon Sep 17 00:00:00 2001 From: har07 Date: Mon, 18 Jan 2016 19:39:46 +0700 Subject: [PATCH 09/45] commit successful pip setup configurations --- .gitignore | 3 + README.md | 2 +- README.rst | 56 +++++++++++++++++++ Sastrawi.sln | 1 + setup.py | 22 ++++---- src/Sastrawi/Sastrawi.pyproj | 5 ++ src/Sastrawi/Stemmer/StemmerFactory.py | 2 +- .../Stemmer/data}/kata-dasar.original.txt | 0 .../Sastrawi/Stemmer/data}/kata-dasar.txt | 0 9 files changed, 79 insertions(+), 12 deletions(-) create mode 100644 README.rst rename {data => src/Sastrawi/Stemmer/data}/kata-dasar.original.txt (100%) rename {data => src/Sastrawi/Stemmer/data}/kata-dasar.txt (100%) diff --git a/.gitignore b/.gitignore index d906f69..3eac0ca 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,9 @@ __pycache__/ *.py[cod] +# pycharm generated +.idea/ + # visual studio generated bin/ obj/ diff --git a/README.md b/README.md index fc07099..fe38795 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ This is Python port of the original 
[Sastrawi](https://github.com/sastrawi/sastr [![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) [![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) -[![PyPI version](https://badge.fury.io/py/Sastrawi.svg)](https://badge.fury.io/py/Sastrawi) +[![PyPI version](https://badge.fury.io/py/sastrawi.svg)](https://badge.fury.io/py/sastrawi) Cara Install ------------- diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..2d2f652 --- /dev/null +++ b/README.rst @@ -0,0 +1,56 @@ +Sastrawi +======== + +| Sastrawi is a simple Python library which allows you to reduce + inflected words in Indonesian Language (Bahasa Indonesia) to their + base form (`stem`_). +| This is Python port of the original `Sastrawi`_ project written in + PHP. + +|Build Status| +|Coverage Status| + +Installation +------------ + +Sastrawi can be installed via `pip`_, by running the following commands +in terminal/command prompt : ``pip install Sastrawi`` + +Example Usage +------------- + +Run the following commands in *Python interactive terminal* : + +.. code:: python + + # import Sastrawi package + from Sastrawi.Stemmer.StemmerFactory import StemmerFactory + + # create stemmer + factory = StemmerFactory() + stemmer = factory.create_stemmer() + + # stem + sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan' + output = stemmer.stem(sentence) + + print(output) + # ekonomi indonesia sedang dalam tumbuh yang bangga + + print(stemmer.stem('Mereka meniru-nirukannya')) + # mereka tiru + +More Info +--------- + +- `Sastrawi PHP Repository page`_ + +.. _stem: http://en.wikipedia.org/wiki/Stemming +.. _Sastrawi: https://github.com/sastrawi/sastrawi +.. _pip: https://docs.python.org/3.6/installing/index.html +.. _Sastrawi PHP Repository page: https://github.com/sastrawi/sastrawi + +.. 
|Build Status| image:: https://travis-ci.org/har07/sastrawi.svg?branch=master + :target: https://travis-ci.org/har07/sastrawi +.. |Coverage Status| image:: https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github + :target: https://coveralls.io/github/har07/sastrawi?branch=master \ No newline at end of file diff --git a/Sastrawi.sln b/Sastrawi.sln index 3f3f5fe..0866570 100644 --- a/Sastrawi.sln +++ b/Sastrawi.sln @@ -13,6 +13,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution .gitignore = .gitignore .travis.yml = .travis.yml README.md = README.md + README.rst = README.rst setup.cfg = setup.cfg setup.py = setup.py EndProjectSection diff --git a/setup.py b/setup.py index 825a8be..1deb409 100644 --- a/setup.py +++ b/setup.py @@ -13,8 +13,8 @@ #here = path.abspath(path.dirname(__file__)) # Get the long description from the README file -#with open(path.join(here, 'README.rst'), encoding='utf-8') as f: -# long_description = f.read() +with open(path.join(here, 'README.rst'), encoding='utf-8') as f: + long_description = f.read() setup( name='Sastrawi', @@ -22,7 +22,7 @@ # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.0.0', + version='1.0.1', description='Library for stemming Indonesian (Bahasa) text', long_description='Library for stemming Indonesian (Bahasa) text', @@ -46,7 +46,8 @@ 'Development Status :: 4 - Beta', # Indicate who your project is intended for - 'Intended Audience :: Developers', + 'Intended Audience :: Information Technology', + 'Intended Audience :: Science/Research', 'Topic :: Text Processing :: Linguistic', # Pick your license as you wish (should match "license" above) @@ -57,18 +58,19 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', ], # What does your project relate to? - keywords='nlp stemming indonesia indonesian', + keywords='linguistic stemming indonesian bahasa', # You can just specify the packages manually here if your project is # simple. Or you can use find_packages(). - packages=find_packages(exclude=['contrib', 'docs', 'tests']), + packages=find_packages('src', exclude=['contrib', 'docs', 'tests']), + #packages=["Sastrawi"], + package_dir = {'':'src'}, # Alternatively, if you want to distribute just a my_module.py, uncomment # this: @@ -92,9 +94,9 @@ # If there are data files included in your packages that need to be # installed, specify them here. If using Python 2.6 or less, then these # have to be included in MANIFEST.in as well. - #package_data={ - # 'sample': ['package_data.dat'], - #}, + package_data={ + '': ['data/*.txt'], + }, # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. 
See: diff --git a/src/Sastrawi/Sastrawi.pyproj b/src/Sastrawi/Sastrawi.pyproj index 8dcee03..3ec3716 100644 --- a/src/Sastrawi/Sastrawi.pyproj +++ b/src/Sastrawi/Sastrawi.pyproj @@ -27,6 +27,7 @@ + @@ -246,6 +247,10 @@ Code + + + + \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index f01f80f..10e6aaa 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -32,7 +32,7 @@ def get_words(self, isDev=False): def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) - dictionaryFile = current_dir + '/../../../data/kata-dasar.txt' + dictionaryFile = current_dir + '/data/kata-dasar.txt' if not os.path.isfile(dictionaryFile): raise RuntimeError('Dictionary file is missing. It seems that your installation is corrupted.') diff --git a/data/kata-dasar.original.txt b/src/Sastrawi/Stemmer/data/kata-dasar.original.txt similarity index 100% rename from data/kata-dasar.original.txt rename to src/Sastrawi/Stemmer/data/kata-dasar.original.txt diff --git a/data/kata-dasar.txt b/src/Sastrawi/Stemmer/data/kata-dasar.txt similarity index 100% rename from data/kata-dasar.txt rename to src/Sastrawi/Stemmer/data/kata-dasar.txt From d3582795ca2411a701ef1a0dfffb5a37111e41d4 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Tue, 19 Jan 2016 20:29:54 +0700 Subject: [PATCH 10/45] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index db68fb9..23818b6 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,14 @@ print(stemmer.stem('Mereka meniru-nirukannya')) # mereka tiru ``` + +Lisensi +-------- + +Lisensi Sastrawi Python adalah MIT License (MIT). + +Project ini mengandung kamus kata dasar yang berasal dari Kateglo dengan lisensi [CC-BY-NC-SA 3.0](http://creativecommons.org/licenses/by-nc-sa/3.0/). 
+ Informasi Lebih Lanjut ---------------------- From df24307744bb80c67c90d6552f4ca87e18dab697 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Fri, 25 Mar 2016 13:10:44 +0700 Subject: [PATCH 11/45] Update travis-ci link. Add demo URL Update travis-ci link due to renamed repo --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 23818b6..79a61d9 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Sastrawi Python is a simple python library which allows you to reduce inflected This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastrawi) project written in PHP (credits goes to the original author and contributors of Sastrawi PHP). -[![Build Status](https://travis-ci.org/har07/sastrawi.svg?branch=master)](https://travis-ci.org/har07/sastrawi) +[![Build Status](https://travis-ci.org/har07/PySastrawi.svg?branch=master)](https://travis-ci.org/har07/PySastrawi) [![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) [![PyPI version](https://badge.fury.io/py/sastrawi.svg)](https://badge.fury.io/py/sastrawi) @@ -38,6 +38,12 @@ print(stemmer.stem('Mereka meniru-nirukannya')) # mereka tiru ``` +Demo +-------- + +Live demo URL : https://pysastrawi-demo.appspot.com/ + +Repository : https://github.com/har07/pystastrawi-demo Lisensi -------- From 1868b94a094063e27349fd4c16690088a7d000f9 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Fri, 25 Mar 2016 13:15:01 +0700 Subject: [PATCH 12/45] Update README.rst --- README.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 2d2f652..35eb584 100644 --- a/README.rst +++ b/README.rst @@ -40,6 +40,13 @@ Run the following commands in *Python interactive terminal* : print(stemmer.stem('Mereka meniru-nirukannya')) # mereka tiru +Demo +--------- + +Live demo : 
https://pysastrawi-demo.appspot.com/ + +Repository : https://github.com/har07/pystastrawi-demo + More Info --------- @@ -50,7 +57,7 @@ More Info .. _pip: https://docs.python.org/3.6/installing/index.html .. _Sastrawi PHP Repository page: https://github.com/sastrawi/sastrawi -.. |Build Status| image:: https://travis-ci.org/har07/sastrawi.svg?branch=master - :target: https://travis-ci.org/har07/sastrawi +.. |Build Status| image:: https://travis-ci.org/har07/PySastrawi.svg?branch=master + :target: https://travis-ci.org/har07/PySastrawi .. |Coverage Status| image:: https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github - :target: https://coveralls.io/github/har07/sastrawi?branch=master \ No newline at end of file + :target: https://coveralls.io/github/har07/sastrawi?branch=master From 0d7fcdb2ce013339666896763eb9eb20a400cfa3 Mon Sep 17 00:00:00 2001 From: Guntur Sarwohadi Date: Tue, 21 Jun 2016 12:13:08 +0700 Subject: [PATCH 13/45] Memperbaiki StopWordRemover saat menghapus list words (skipped cursor karena remove langsung dari list words), dengan menggunakan list baru (dan memakai list comprehension). Menambahkan daftar stop word pada StopWordRemoverFactory. 
--- .../StopWordRemover/StopWordRemover.py | 6 +- .../StopWordRemover/StopWordRemoverFactory.py | 105 +++++++++++++++--- 2 files changed, 93 insertions(+), 18 deletions(-) diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index b3b2f25..d3de2ff 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -10,11 +10,9 @@ def get_dictionary(self): def remove(self, text): """Remove stop words.""" words = text.split(' ') - for word in words: - if self.dictionary.contains(word): - words.remove(word) + stopped_words = [word for word in words if not self.dictionary.contains(word)] - return ' '.join(words) + return ' '.join(stopped_words) diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 5b35049..560db07 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -12,20 +12,97 @@ def create_stop_word_remover(self): return stopWordRemover def get_stop_words(self): - return ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', - 'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'tidak', 'ini', 'karena', - 'kepada', 'oleh', 'saat', 'harus', 'sementara', 'setelah', 'belum', 'kami', 'sekitar', - 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', - 'itu', 'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', - 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'anda', - 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', - 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', - 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'sementara', 'tetapi', 'apakah', - 'kecuali', 'sebab', 'selain', 'seolah', 'seraya', 
'seterusnya', 'tanpa', 'agak', 'boleh', - 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'tapi', 'ingin', - 'juga', 'nggak', 'mari', 'nanti', 'melainkan', 'oh', 'ok', 'seharusnya', 'sebetulnya', - 'setiap', 'setidaknya', 'sesuatu', 'pasti', 'saja', 'toh', 'ya', 'walau', 'tolong', - 'tentu', 'amat', 'apalagi', 'bagaimanapun'] + return ['a','ada','adalah','adanya','adapun','agak','agaknya','agar','akan','akankah','akhir', + 'akhiri','akhirnya','aku','akulah','amat','amatlah','anda','andalah','antar','antara', + 'antaranya','apa','apaan','apabila','apakah','apalagi','apatah','arti','artinya','asal', + 'asalkan','atas','atau','ataukah','ataupun','awal','awalnya','b','bagai','bagaikan', + 'bagaimana','bagaimanakah','bagaimanapun','bagainamakah','bagi','bagian','bahkan','bahwa', + 'bahwasannya','bahwasanya','baik','baiklah','bakal','bakalan','balik','banyak','bapak', + 'baru','bawah','beberapa','begini','beginian','beginikah','beginilah','begitu','begitukah', + 'begitulah','begitupun','bekerja','belakang','belakangan','belum','belumlah','benar', + 'benarkah','benarlah','berada','berakhir','berakhirlah','berakhirnya','berapa','berapakah', + 'berapalah','berapapun','berarti','berawal','berbagai','berdatangan','beri','berikan', + 'berikut','berikutnya','berjumlah','berkali-kali','berkata','berkehendak','berkeinginan', + 'berkenaan','berlainan','berlalu','berlangsung','berlebihan','bermacam','bermacam-macam', + 'bermaksud','bermula','bersama','bersama-sama','bersiap','bersiap-siap','bertanya', + 'bertanya-tanya','berturut','berturut-turut','bertutur','berujar','berupa','besar', + 'betul','betulkah','biasa','biasanya','bila','bilakah','bisa','bisakah','boleh','bolehkah', + 'bolehlah','buat','bukan','bukankah','bukanlah','bukannya','bulan','bung','c','cara', + 'caranya','cukup','cukupkah','cukuplah','cuma','d','dahulu','dalam','dan','dapat','dari', + 'daripada','datang','dekat','demi','demikian','demikianlah','dengan','depan','di','dia', + 
'diakhiri','diakhirinya','dialah','diantara','diantaranya','diberi','diberikan','diberikannya', + 'dibuat','dibuatnya','didapat','didatangkan','digunakan','diibaratkan','diibaratkannya', + 'diingat','diingatkan','diinginkan','dijawab','dijelaskan','dijelaskannya','dikarenakan', + 'dikatakan','dikatakannya','dikerjakan','diketahui','diketahuinya','dikira','dilakukan', + 'dilalui','dilihat','dimaksud','dimaksudkan','dimaksudkannya','dimaksudnya','diminta', + 'dimintai','dimisalkan','dimulai','dimulailah','dimulainya','dimungkinkan','dini','dipastikan', + 'diperbuat','diperbuatnya','dipergunakan','diperkirakan','diperlihatkan','diperlukan', + 'diperlukannya','dipersoalkan','dipertanyakan','dipunyai','diri','dirinya','disampaikan', + 'disebut','disebutkan','disebutkannya','disini','disinilah','ditambahkan','ditandaskan', + 'ditanya','ditanyai','ditanyakan','ditegaskan','ditujukan','ditunjuk','ditunjuki','ditunjukkan', + 'ditunjukkannya','ditunjuknya','dituturkan','dituturkannya','diucapkan','diucapkannya', + 'diungkapkan','dong','dua','dulu','e','empat','enak','enggak','enggaknya','entah','entahlah', + 'f','g','guna','gunakan','h','hadap','hai','hal','halo','hallo','hampir','hanya','hanyalah', + 'hari','harus','haruslah','harusnya','helo','hello','hendak','hendaklah','hendaknya','hingga', + 'i','ia','ialah','ibarat','ibaratkan','ibaratnya','ibu','ikut','ingat','ingat-ingat','ingin', + 'inginkah','inginkan','ini','inikah','inilah','itu','itukah','itulah','j','jadi','jadilah', + 'jadinya','jangan','jangankan','janganlah','jauh','jawab','jawaban','jawabnya','jelas', + 'jelaskan','jelaslah','jelasnya','jika','jikalau','juga','jumlah','jumlahnya','justru', + 'k','kadar','kala','kalau','kalaulah','kalaupun','kali','kalian','kami','kamilah','kamu', + 'kamulah','kan','kapan','kapankah','kapanpun','karena','karenanya','kasus','kata','katakan', + 'katakanlah','katanya','ke','keadaan','kebetulan','kecil','kedua','keduanya','keinginan', + 
'kelamaan','kelihatan','kelihatannya','kelima','keluar','kembali','kemudian','kemungkinan', + 'kemungkinannya','kena','kenapa','kepada','kepadanya','kerja','kesampaian','keseluruhan', + 'keseluruhannya','keterlaluan','ketika','khusus','khususnya','kini','kinilah','kira', + 'kira-kira','kiranya','kita','kitalah','kok','kurang','l','lagi','lagian','lah','lain', + 'lainnya','laku','lalu','lama','lamanya','langsung','lanjut','lanjutnya','lebih','lewat', + 'lihat','lima','luar','m','macam','maka','makanya','makin','maksud','malah','malahan', + 'mampu','mampukah','mana','manakala','manalagi','masa','masalah','masalahnya','masih', + 'masihkah','masing','masing-masing','masuk','mata','mau','maupun','melainkan','melakukan', + 'melalui','melihat','melihatnya','memang','memastikan','memberi','memberikan','membuat', + 'memerlukan','memihak','meminta','memintakan','memisalkan','memperbuat','mempergunakan', + 'memperkirakan','memperlihatkan','mempersiapkan','mempersoalkan','mempertanyakan','mempunyai', + 'memulai','memungkinkan','menaiki','menambahkan','menandaskan','menanti','menanti-nanti', + 'menantikan','menanya','menanyai','menanyakan','mendapat','mendapatkan','mendatang','mendatangi', + 'mendatangkan','menegaskan','mengakhiri','mengapa','mengatakan','mengatakannya','mengenai', + 'mengerjakan','mengetahui','menggunakan','menghendaki','mengibaratkan','mengibaratkannya', + 'mengingat','mengingatkan','menginginkan','mengira','mengucapkan','mengucapkannya','mengungkapkan', + 'menjadi','menjawab','menjelaskan','menuju','menunjuk','menunjuki','menunjukkan','menunjuknya', + 'menurut','menuturkan','menyampaikan','menyangkut','menyatakan','menyebutkan','menyeluruh', + 'menyiapkan','merasa','mereka','merekalah','merupakan','meski','meskipun','meyakini','meyakinkan', + 'minta','mirip','misal','misalkan','misalnya','mohon','mula','mulai','mulailah','mulanya','mungkin', + 'mungkinkah','n','nah','naik','namun','nanti','nantinya','nya','nyaris','nyata','nyatanya', + 
'o','oleh','olehnya','orang','p','pada','padahal','padanya','pak','paling','panjang','pantas', + 'para','pasti','pastilah','penting','pentingnya','per','percuma','perlu','perlukah','perlunya', + 'pernah','persoalan','pertama','pertama-tama','pertanyaan','pertanyakan','pihak','pihaknya', + 'pukul','pula','pun','punya','q','r','rasa','rasanya','rupa','rupanya','s','saat','saatnya','saja', + 'sajalah','salam','saling','sama','sama-sama','sambil','sampai','sampai-sampai','sampaikan','sana', + 'sangat','sangatlah','sangkut','satu','saya','sayalah','se','sebab','sebabnya','sebagai', + 'sebagaimana','sebagainya','sebagian','sebaik','sebaik-baiknya','sebaiknya','sebaliknya', + 'sebanyak','sebegini','sebegitu','sebelum','sebelumnya','sebenarnya','seberapa','sebesar', + 'sebetulnya','sebisanya','sebuah','sebut','sebutlah','sebutnya','secara','secukupnya','sedang', + 'sedangkan','sedemikian','sedikit','sedikitnya','seenaknya','segala','segalanya','segera', + 'seharusnya','sehingga','seingat','sejak','sejauh','sejenak','sejumlah','sekadar','sekadarnya', + 'sekali','sekali-kali','sekalian','sekaligus','sekalipun','sekarang','sekaranglah','sekecil', + 'seketika','sekiranya','sekitar','sekitarnya','sekurang-kurangnya','sekurangnya','sela','selain', + 'selaku','selalu','selama','selama-lamanya','selamanya','selanjutnya','seluruh','seluruhnya', + 'semacam','semakin','semampu','semampunya','semasa','semasih','semata','semata-mata','semaunya', + 'sementara','semisal','semisalnya','sempat','semua','semuanya','semula','sendiri','sendirian', + 'sendirinya','seolah','seolah-olah','seorang','sepanjang','sepantasnya','sepantasnyalah', + 'seperlunya','seperti','sepertinya','sepihak','sering','seringnya','serta','serupa','sesaat', + 'sesama','sesampai','sesegera','sesekali','seseorang','sesuatu','sesuatunya','sesudah', + 'sesudahnya','setelah','setempat','setengah','seterusnya','setiap','setiba','setibanya', + 
'setidak-tidaknya','setidaknya','setinggi','seusai','sewaktu','siap','siapa','siapakah', + 'siapapun','sini','sinilah','soal','soalnya','suatu','sudah','sudahkah','sudahlah','supaya', + 't','tadi','tadinya','tahu','tak','tambah','tambahnya','tampak','tampaknya','tandas','tandasnya', + 'tanpa','tanya','tanyakan','tanyanya','tapi','tegas','tegasnya','telah','tempat','tentang','tentu', + 'tentulah','tentunya','tepat','terakhir','terasa','terbanyak','terdahulu','terdapat','terdiri', + 'terhadap','terhadapnya','teringat','teringat-ingat','terjadi','terjadilah','terjadinya','terkira', + 'terlalu','terlebih','terlihat','termasuk','ternyata','tersampaikan','tersebut','tersebutlah', + 'tertentu','tertuju','terus','terutama','tetap','tetapi','tiap','tiba','tiba-tiba','tidak', + 'tidakkah','tidaklah','tiga','toh','tuju','tunjuk','turut','tutur','tuturnya','u','ucap','ucapnya', + 'ujar','ujarnya','umumnya','ungkap','ungkapnya','untuk','usah','usai','v','w','waduh','wah','wahai', + 'waktunya','walau','walaupun','wong','x','y','ya','yaitu','yakin','yakni','yang','z'] From 010f228e149ace21613310be0ffb560a30724a4d Mon Sep 17 00:00:00 2001 From: Prasasto Adi Date: Sun, 23 Oct 2016 14:20:46 +0700 Subject: [PATCH 14/45] Update to Sastrawi v1.2.0 --- .../DisambiguatorPrefixRule27.py | 12 ++++++--- tests/FunctionalTests/Stemmer/stemmer_test.py | 26 ++++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py index 2715c45..e67fbbc 100644 --- a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py +++ b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule27.py @@ -2,13 +2,19 @@ class DisambiguatorPrefixRule27(object): """Disambiguate Prefix Rule 27 - Rule 27 : pen{c|d|j|z} -> pen-{c|d|j|z} + Rule 27 modified by Prasasto Adi : pen{c|d|j|s|t|z} -> pen-{c|d|j|s|t|z} + in order to stem penstabilan, pentranskripsi 
+ + Original CS Rule 27 was : pen{c|d|j|z} -> pen-{c|d|j|z} """ def disambiguate(self, word): """Disambiguate Prefix Rule 27 - Rule 27 : pen{c|d|j|z} -> pen-{c|d|j|z} + Rule 27 modified by Prasasto Adi : pen{c|d|j|s|t|z} -> pen-{c|d|j|s|t|z} + in order to stem penstabilan, pentranskripsi + + Original CS Rule 27 was : pen{c|d|j|z} -> pen-{c|d|j|z} """ - matches = re.match(r'^pen([cdjz])(.*)$', word) + matches = re.match(r'^pen([cdjstz])(.*)$', word) if matches: return matches.group(1) + matches.group(2) diff --git a/tests/FunctionalTests/Stemmer/stemmer_test.py b/tests/FunctionalTests/Stemmer/stemmer_test.py index cc21eb3..c1f56ed 100644 --- a/tests/FunctionalTests/Stemmer/stemmer_test.py +++ b/tests/FunctionalTests/Stemmer/stemmer_test.py @@ -13,7 +13,9 @@ def setUp(self): 'bangun', 'fitnah', 'vonis', 'baru', 'ajar', 'tangkap', 'kupas', - 'minum', 'pukul', 'cinta', 'dua', 'jauh', 'ziarah', 'nuklir', 'gila', 'hajar', 'qasar', 'udara', + 'minum', 'pukul', + 'cinta', 'dua', 'dahulu', 'jauh', 'jarah', 'ziarah', + 'nuklir', 'nasihat', 'gila', 'hajar', 'qasar', 'udara', 'populer', 'warna', 'yoga', 'adil', 'rumah', 'muka', 'labuh', 'tarung', 'tebar', 'indah', 'daya', 'untung', 'sepuluh', 'ekonomi', 'makmur', 'telah', 'serta', 'percaya', 'pengaruh', 'kritik', 'seko', 'sekolah', 'tahan', 'capa', 'capai', @@ -22,7 +24,8 @@ def setUp(self): 'sembunyi', 'langgan', 'laku', 'baik', 'terang', 'iman', 'bisik', 'taat', 'puas', 'makan', 'nyala', 'nyanyi', 'nyata', 'nyawa', 'rata', 'lembut', 'ligas', 'budaya', 'karya', 'ideal', 'final', - 'taat', 'tiru', 'sepak', 'kuasa', 'malaikat', 'nikmat', # sastrawi additional rules + # sastrawi additional rules + 'taat', 'tiru', 'sepak', 'kuasa', 'malaikat', 'nikmat', 'stabil', 'transkripsi', 'lewat', 'nganga', 'allah', ] ) @@ -73,7 +76,8 @@ def get_test_data(self): data.append(['kesakitan', 'sakit']) data.append(['sesuap', 'suap']) - #data.append(['teriakanmu', 'teriak' # wtf? kok jadi teria?]) + #data.append(['teriakanmu', 'teriak']) # wtf? 
kok jadi ria? + #teriakanmu -> te-ria-kan-mu # template formulas for derivation prefix rules (disambiguation) # @@ -123,8 +127,8 @@ def get_test_data(self): data.append(['memvonis', 'vonis']) # rule 12 : mempe{r|l} -> mem-pe - data.append(['memperbaru', 'baru']) - data.append(['mempelajar', 'ajar']) + data.append(['memperbarui', 'baru']) + data.append(['mempelajari', 'ajar']) # rule 13a : mem{rV|V} -> mem{rV|V} data.append(['meminum', 'minum']) @@ -191,14 +195,13 @@ def get_test_data(self): data.append(['pemukul', 'pukul']) # rule 27 : men{c|d|j|z} -> men-{c|d|j|z} - # TODO : should find more relevant examples data.append(['pencinta', 'cinta']) - data.append(['pendua', 'dua']) - data.append(['penjauh', 'jauh']) + data.append(['pendahulu', 'dahulu']) + data.append(['penjarah', 'jarah']) data.append(['penziarah', 'ziarah']) # rule 28a : pen{V} -> pe-n{V} - data.append(['penuklir', 'nuklir']) + data.append(['penasihat', 'nasihat']) # rule 28b : pen{V} -> pe-t{V} data.append(['penangkap', 'tangkap']) @@ -219,7 +222,7 @@ def get_test_data(self): # rule 32 : pelV -> pe-lV except pelajar -> ajar data.append(['pelajar', 'ajar']) - data.append(['pelabuh', 'labuh']) + data.append(['pelabuhan', 'labuh']) # rule 33 : peCerV -> per-erV where C != {r|w|y|l|m|n} # TODO : find the examples @@ -349,6 +352,9 @@ def get_test_data(self): data.append(['finalisasi', 'final']) # sastrawi additional rules + data.append(['penstabilan', 'stabil']) + data.append(['pentranskripsi', 'transkripsi']) + data.append(['mentaati', 'taat']) data.append(['meniru-nirukan', 'tiru']) data.append(['menyepak-nyepak', 'sepak']) From fe2b42ab0b9999e3a366ab0d51dc7d5af4f18694 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Wed, 26 Oct 2016 13:17:27 +0700 Subject: [PATCH 15/45] update coverage badge link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79a61d9..9e4c34f 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ This is Python port of 
the original [Sastrawi](https://github.com/sastrawi/sastr [![Build Status](https://travis-ci.org/har07/PySastrawi.svg?branch=master)](https://travis-ci.org/har07/PySastrawi) -[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) +[![Coverage Status](https://coveralls.io/repos/github/har07/PySastrawi/badge.svg?branch=master)](https://coveralls.io/github/har07/PySastrawi?branch=master) [![PyPI version](https://badge.fury.io/py/sastrawi.svg)](https://badge.fury.io/py/sastrawi) Cara Install From af480e5b7c648fde871c7bb10513de3d3b742232 Mon Sep 17 00:00:00 2001 From: Khairul Imam Date: Fri, 5 May 2017 11:18:15 +0800 Subject: [PATCH 16/45] change lower sequence (#5) `normalize_text()` supports both `string` and `unicode` --- src/Sastrawi/Stemmer/Filter/TextNormalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/Filter/TextNormalizer.py b/src/Sastrawi/Stemmer/Filter/TextNormalizer.py index 2d72e27..8cad4a7 100644 --- a/src/Sastrawi/Stemmer/Filter/TextNormalizer.py +++ b/src/Sastrawi/Stemmer/Filter/TextNormalizer.py @@ -1,7 +1,7 @@ import re def normalize_text(text): - result = str.lower(text) + result = text.lower() #lower the text even unicode given result = re.sub(r'[^a-z0-9 -]', ' ', result, flags = re.IGNORECASE|re.MULTILINE) result = re.sub(r'( +)', ' ', result, flags = re.IGNORECASE|re.MULTILINE) From 5f30ede74c2a2e90f76cddebc590f9a83edf8369 Mon Sep 17 00:00:00 2001 From: widnyana Date: Sat, 28 Oct 2017 15:59:07 +0700 Subject: [PATCH 17/45] fix NameError when installing (#6) * remove git conflict info * fix code formatting * fix NameError: name 'here' is not defined --- .gitignore | 4 --- setup.py | 26 +++++++++---------- .../Dictionary/DictionaryInterface.py | 7 ++--- .../DisambiguatorPrefixRule24.py | 3 ++- src/Sastrawi/Stemmer/Context/Removal.py | 2 +- .../Context/Visitor/VisitorProvider.py | 7 +++-- 6 files 
changed, 21 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 3eac0ca..6819ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -<<<<<<< HEAD # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -65,8 +64,5 @@ docs/_build/ # PyBuilder target/ -||||||| merged common ancestors -======= # Google App Engine generated folder appengine-generated/ ->>>>>>> adaaddecc50208c18b08806f63f80f3342bd5e30 diff --git a/setup.py b/setup.py index 1deb409..72a96dd 100644 --- a/setup.py +++ b/setup.py @@ -4,15 +4,15 @@ https://github.com/pypa/sampleproject """ -# Always prefer setuptools over distutils -from setuptools import setup, find_packages # To use a consistent encoding from codecs import open from os import path -#here = path.abspath(path.dirname(__file__)) +# Always prefer setuptools over distutils +from setuptools import setup, find_packages # Get the long description from the README file +here = path.abspath(path.dirname(__file__)) with open(path.join(here, 'README.rst'), encoding='utf-8') as f: long_description = f.read() @@ -47,7 +47,7 @@ # Indicate who your project is intended for 'Intended Audience :: Information Technology', - 'Intended Audience :: Science/Research', + 'Intended Audience :: Science/Research', 'Topic :: Text Processing :: Linguistic', # Pick your license as you wish (should match "license" above) @@ -69,8 +69,8 @@ # You can just specify the packages manually here if your project is # simple. Or you can use find_packages(). packages=find_packages('src', exclude=['contrib', 'docs', 'tests']), - #packages=["Sastrawi"], - package_dir = {'':'src'}, + # packages=["Sastrawi"], + package_dir={'': 'src'}, # Alternatively, if you want to distribute just a my_module.py, uncomment # this: @@ -80,16 +80,16 @@ # your project is installed. 
For an analysis of "install_requires" vs pip's # requirements files see: # https://packaging.python.org/en/latest/requirements.html - #install_requires=['peppercorn'], + # install_requires=['peppercorn'], # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # for example: # $ pip install -e .[dev,test] - #extras_require={ + # extras_require={ # 'dev': ['check-manifest'], # 'test': ['coverage'], - #}, + # }, # If there are data files included in your packages that need to be # installed, specify them here. If using Python 2.6 or less, then these @@ -102,14 +102,14 @@ # need to place data files outside of your packages. See: # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa # In this case, 'data_file' will be installed into '/my_data' - #data_files=[('my_data', ['data/data_file'])], + # data_files=[('my_data', ['data/data_file'])], # To provide executable scripts, use entry points in preference to the # "scripts" keyword. Entry points provide cross-platform support and allow # pip to create the appropriate form of executable for the target platform. 
- #entry_points={ + # entry_points={ # 'console_scripts': [ # 'sample=sample:main', # ], - #}, -) \ No newline at end of file + # }, +) diff --git a/src/Sastrawi/Dictionary/DictionaryInterface.py b/src/Sastrawi/Dictionary/DictionaryInterface.py index 3b2d037..f899ae4 100644 --- a/src/Sastrawi/Dictionary/DictionaryInterface.py +++ b/src/Sastrawi/Dictionary/DictionaryInterface.py @@ -1,8 +1,5 @@ class DictionaryInterface(object): - """description of class""" + """Interface definition of dictionary""" def contains(self, word): - pass - - - + raise NotImplementedError('you must implement this method manually') \ No newline at end of file diff --git a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py index 68b6f12..6dd3497 100644 --- a/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py +++ b/src/Sastrawi/Morphology/Disambiguator/DisambiguatorPrefixRule24.py @@ -1,10 +1,11 @@ import re + class DisambiguatorPrefixRule24(object): """Disambiguate Prefix Rule 24 Rule 24 : perCAerV -> per-CAerV where C != 'r' """ - + def disambiguate(self, word): """Disambiguate Prefix Rule 24 Rule 24 : perCAerV -> per-CAerV where C != 'r' diff --git a/src/Sastrawi/Stemmer/Context/Removal.py b/src/Sastrawi/Stemmer/Context/Removal.py index 0cdfed8..e11c9c0 100644 --- a/src/Sastrawi/Stemmer/Context/Removal.py +++ b/src/Sastrawi/Stemmer/Context/Removal.py @@ -1,6 +1,6 @@ from Sastrawi.Stemmer.Context.RemovalInterface import RemovalInterface -class Removal(object): +class Removal(RemovalInterface): """description of class""" def __init__(self, visitor, subject, result, removedPart, affixType): diff --git a/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py b/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py index 0ce6b33..be4aae6 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py @@ -86,8 +86,8 @@ def 
init_visitors(self): self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule15a(), DisambiguatorPrefixRule15b()])) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule16()])) - disambiguators17 = [DisambiguatorPrefixRule17a(), DisambiguatorPrefixRule17b(), \ - DisambiguatorPrefixRule17c(), DisambiguatorPrefixRule17d()] + disambiguators17 = [DisambiguatorPrefixRule17a(), DisambiguatorPrefixRule17b(), DisambiguatorPrefixRule17c(), + DisambiguatorPrefixRule17d()] self.prefix_pisitors.append(PrefixDisambiguator(disambiguators17)) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule18a(), DisambiguatorPrefixRule18b()])) @@ -102,8 +102,7 @@ def init_visitors(self): self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule28a(), DisambiguatorPrefixRule28b()])) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule29()])) - disambiguators30 = [DisambiguatorPrefixRule30a(), DisambiguatorPrefixRule30b(), \ - DisambiguatorPrefixRule30c()] + disambiguators30 = [DisambiguatorPrefixRule30a(), DisambiguatorPrefixRule30b(), DisambiguatorPrefixRule30c()] self.prefix_pisitors.append(PrefixDisambiguator(disambiguators30)) self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule31a(), DisambiguatorPrefixRule31b()])) From 32125a5b2d9139875035171e969a6875ca82c265 Mon Sep 17 00:00:00 2001 From: har07 Date: Tue, 24 Apr 2018 17:32:18 +0700 Subject: [PATCH 18/45] release v1.1.0 --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 72a96dd..67daaf8 100644 --- a/setup.py +++ b/setup.py @@ -17,18 +17,18 @@ long_description = f.read() setup( - name='Sastrawi', + name='PySastrawi', # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.0.1', + version='1.1.0', description='Library for stemming Indonesian (Bahasa) text', long_description='Library for stemming Indonesian (Bahasa) text', # The project's main homepage. - url='https://github.com/har07/sastrawi', + url='https://github.com/har07/PySastrawi', # Author details author='Hanif Amal Robbani', From 49372b36837a3ff33046ac076f3278e1fc8ce621 Mon Sep 17 00:00:00 2001 From: Prasasto Adi Date: Tue, 24 Apr 2018 17:44:02 +0700 Subject: [PATCH 19/45] Add LICENSE (#8) --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1e79b12 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 Hanif Amal Robbani + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file From 0ab8ce2a994679af63880f3bdd1bb23570ffc010 Mon Sep 17 00:00:00 2001 From: har07 Date: Tue, 24 Apr 2018 17:44:45 +0700 Subject: [PATCH 20/45] update new pypi badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e4c34f..bf24fad 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastr [![Build Status](https://travis-ci.org/har07/PySastrawi.svg?branch=master)](https://travis-ci.org/har07/PySastrawi) [![Coverage Status](https://coveralls.io/repos/github/har07/PySastrawi/badge.svg?branch=master)](https://coveralls.io/github/har07/PySastrawi?branch=master) -[![PyPI version](https://badge.fury.io/py/sastrawi.svg)](https://badge.fury.io/py/sastrawi) +[![PyPI version](https://badge.fury.io/py/PySastrawi.svg)](https://badge.fury.io/py/PySastrawi) Cara Install ------------- From c784cd7dfbb1159c154ad66d6f1af0a4b633c4ff Mon Sep 17 00:00:00 2001 From: har07 Date: Sun, 23 Sep 2018 09:59:57 +0700 Subject: [PATCH 21/45] remove empty line from list kata-dasar --- src/Sastrawi/Stemmer/data/kata-dasar.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/data/kata-dasar.txt b/src/Sastrawi/Stemmer/data/kata-dasar.txt index f46c3bc..9ebe9fb 100644 --- a/src/Sastrawi/Stemmer/data/kata-dasar.txt +++ b/src/Sastrawi/Stemmer/data/kata-dasar.txt @@ -29929,4 +29929,4 @@ zulmat zulu zurafah zuriah -zus +zus \ No newline at end of file From b31d6f67c24ee78d76d5095d405433f9cc774603 Mon Sep 17 00:00:00 2001 From: sanspa Date: Sun, 23 Sep 2018 10:25:57 +0700 Subject: [PATCH 22/45] Mengubah dictionary dari list ke dictionary Mengubah dictionary dari list ke dictionary --- src/Sastrawi/Dictionary/ArrayDictionary.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 5819a29..5bfdd90 
100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -2,7 +2,7 @@ class ArrayDictionary(object): """description of class""" def __init__(self, words=None): - self.words = [] + self.words = {} if words: self.add_words(words) @@ -14,14 +14,13 @@ def count(self): def add_words(self, words): """Add multiple words to the dictionary""" - for word in words: - self.add(word) + self.words = dict(zip(words,words)) def add(self, word): """Add a word to the dictionary""" if not word or word.strip() == '': return - self.words.append(word) + self.words[word]=word From 65cd03aaf14df14dd6e2d89abd5767912449e6fa Mon Sep 17 00:00:00 2001 From: har07 Date: Sun, 23 Sep 2018 10:45:29 +0700 Subject: [PATCH 23/45] release 1.2.0 --- release.sh | 9 +++++++++ setup.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100755 release.sh diff --git a/release.sh b/release.sh new file mode 100755 index 0000000..f8fc1d8 --- /dev/null +++ b/release.sh @@ -0,0 +1,9 @@ +# generate disribution package +python -m pip install --user --upgrade setuptools wheel +python setup.py sdist bdist_wheel + +# upload distribution package +python3 -m pip install --user --upgrade twine +twine upload --repository-url https://test.pypi.org/legacy/ dist/* + +twine upload dist/* diff --git a/setup.py b/setup.py index 67daaf8..d657037 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.1.0', + version='1.2.0', description='Library for stemming Indonesian (Bahasa) text', long_description='Library for stemming Indonesian (Bahasa) text', From 01afc81c579bde14dcb41c33686b26af8afab121 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Mon, 24 Sep 2018 08:55:48 +0700 Subject: [PATCH 24/45] update pip install instruction following change in the package name a while ago --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bf24fad..f1184ff 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ This is Python port of the original [Sastrawi](https://github.com/sastrawi/sastr Cara Install ------------- -Sastrawi dapat di-*install* menggunakan [pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install Sastrawi` +Sastrawi dapat di-*install* menggunakan [pip](https://docs.python.org/3.6/installing/index.html), dengan menjalankan perintah berikut di terminal/command prompt : `pip install PySastrawi` Penggunaan ----------- From 3625027e00dfc9b1cfa314004543064d22f9e626 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 01:30:25 +0700 Subject: [PATCH 25/45] Add Stopwords Tala 2003, Add lru_cache --- src/Sastrawi/Dictionary/ArrayDictionary.py | 8 +- src/Sastrawi/Stemmer/StemmerFactory.py | 30 +- .../StopWordRemover/StopWordRemoverFactory.py | 125 +-- .../data/stopword_tala_2003.txt | 758 ++++++++++++++++++ 4 files changed, 811 insertions(+), 110 deletions(-) create mode 100644 src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 5bfdd90..5c864f0 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ 
b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -2,8 +2,12 @@ class ArrayDictionary(object): """description of class""" def __init__(self, words=None): - self.words = {} - if words: + + if words is is dict: + self.words = words + + if words is list: + self.words = {} self.add_words(words) def contains(self, word): diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 10e6aaa..1e7a583 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -1,4 +1,5 @@ import os +import functools from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer from Sastrawi.Stemmer.CachedStemmer import CachedStemmer @@ -6,13 +7,16 @@ class StemmerFactory(object): """ Stemmer factory helps creating pre-configured stemmer """ - APC_KEY = 'sastrawi_cache_dictionary' + #APC_KEY = 'sastrawi_cache_dictionary' def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ + if isDev: + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + else: + dictionary = self.get_prod_words_dictionary() - words = self.get_words(isDev) - dictionary = ArrayDictionary(words) stemmer = Stemmer(dictionary) resultCache = ArrayCache() @@ -20,7 +24,13 @@ def create_stemmer(self, isDev=False): return cachedStemmer - def get_words(self, isDev=False): + @functools.lru_cache(maxsize=32) + def get_prod_words_dictionary(self): + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + return dictionary + + #def get_words(self, isDev=False): #if isDev or callable(getattr(self, 'apc_fetch')): # words = self.getWordsFromFile() #else: @@ -28,16 +38,18 @@ def get_words(self, isDev=False): # if not words: # words = self.getWordsFromFile() # apc_store(self.APC_KEY, words) - return self.get_words_from_file() + # return self.get_words_from_file() def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) 
dictionaryFile = current_dir + '/data/kata-dasar.txt' + if not os.path.isfile(dictionaryFile): raise RuntimeError('Dictionary file is missing. It seems that your installation is corrupted.') - dictionaryContent = '' + content = {} with open(dictionaryFile, 'r') as f: - dictionaryContent = f.read() - - return dictionaryContent.split('\n') \ No newline at end of file + word = f.read() + content.words[word] = word + + return content \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 560db07..2c206e6 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -1,109 +1,36 @@ +import os +import functools from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover class StopWordRemoverFactory(object): """description of class""" - def create_stop_word_remover(self): - stopWords = self.get_stop_words() - dictionary = ArrayDictionary(stopWords) - stopWordRemover = StopWordRemover(dictionary) + def create_stop_word_remover(self, isDev=False): + if isDev: + stopWords = self.get_stop_words() + dictionary = ArrayDictionary(stopWords) + else: + dictionary = self.get_prod_stop_word_dictionary() + stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - def get_stop_words(self): - return ['a','ada','adalah','adanya','adapun','agak','agaknya','agar','akan','akankah','akhir', - 'akhiri','akhirnya','aku','akulah','amat','amatlah','anda','andalah','antar','antara', - 'antaranya','apa','apaan','apabila','apakah','apalagi','apatah','arti','artinya','asal', - 'asalkan','atas','atau','ataukah','ataupun','awal','awalnya','b','bagai','bagaikan', - 'bagaimana','bagaimanakah','bagaimanapun','bagainamakah','bagi','bagian','bahkan','bahwa', - 'bahwasannya','bahwasanya','baik','baiklah','bakal','bakalan','balik','banyak','bapak', - 
'baru','bawah','beberapa','begini','beginian','beginikah','beginilah','begitu','begitukah', - 'begitulah','begitupun','bekerja','belakang','belakangan','belum','belumlah','benar', - 'benarkah','benarlah','berada','berakhir','berakhirlah','berakhirnya','berapa','berapakah', - 'berapalah','berapapun','berarti','berawal','berbagai','berdatangan','beri','berikan', - 'berikut','berikutnya','berjumlah','berkali-kali','berkata','berkehendak','berkeinginan', - 'berkenaan','berlainan','berlalu','berlangsung','berlebihan','bermacam','bermacam-macam', - 'bermaksud','bermula','bersama','bersama-sama','bersiap','bersiap-siap','bertanya', - 'bertanya-tanya','berturut','berturut-turut','bertutur','berujar','berupa','besar', - 'betul','betulkah','biasa','biasanya','bila','bilakah','bisa','bisakah','boleh','bolehkah', - 'bolehlah','buat','bukan','bukankah','bukanlah','bukannya','bulan','bung','c','cara', - 'caranya','cukup','cukupkah','cukuplah','cuma','d','dahulu','dalam','dan','dapat','dari', - 'daripada','datang','dekat','demi','demikian','demikianlah','dengan','depan','di','dia', - 'diakhiri','diakhirinya','dialah','diantara','diantaranya','diberi','diberikan','diberikannya', - 'dibuat','dibuatnya','didapat','didatangkan','digunakan','diibaratkan','diibaratkannya', - 'diingat','diingatkan','diinginkan','dijawab','dijelaskan','dijelaskannya','dikarenakan', - 'dikatakan','dikatakannya','dikerjakan','diketahui','diketahuinya','dikira','dilakukan', - 'dilalui','dilihat','dimaksud','dimaksudkan','dimaksudkannya','dimaksudnya','diminta', - 'dimintai','dimisalkan','dimulai','dimulailah','dimulainya','dimungkinkan','dini','dipastikan', - 'diperbuat','diperbuatnya','dipergunakan','diperkirakan','diperlihatkan','diperlukan', - 'diperlukannya','dipersoalkan','dipertanyakan','dipunyai','diri','dirinya','disampaikan', - 'disebut','disebutkan','disebutkannya','disini','disinilah','ditambahkan','ditandaskan', - 
'ditanya','ditanyai','ditanyakan','ditegaskan','ditujukan','ditunjuk','ditunjuki','ditunjukkan', - 'ditunjukkannya','ditunjuknya','dituturkan','dituturkannya','diucapkan','diucapkannya', - 'diungkapkan','dong','dua','dulu','e','empat','enak','enggak','enggaknya','entah','entahlah', - 'f','g','guna','gunakan','h','hadap','hai','hal','halo','hallo','hampir','hanya','hanyalah', - 'hari','harus','haruslah','harusnya','helo','hello','hendak','hendaklah','hendaknya','hingga', - 'i','ia','ialah','ibarat','ibaratkan','ibaratnya','ibu','ikut','ingat','ingat-ingat','ingin', - 'inginkah','inginkan','ini','inikah','inilah','itu','itukah','itulah','j','jadi','jadilah', - 'jadinya','jangan','jangankan','janganlah','jauh','jawab','jawaban','jawabnya','jelas', - 'jelaskan','jelaslah','jelasnya','jika','jikalau','juga','jumlah','jumlahnya','justru', - 'k','kadar','kala','kalau','kalaulah','kalaupun','kali','kalian','kami','kamilah','kamu', - 'kamulah','kan','kapan','kapankah','kapanpun','karena','karenanya','kasus','kata','katakan', - 'katakanlah','katanya','ke','keadaan','kebetulan','kecil','kedua','keduanya','keinginan', - 'kelamaan','kelihatan','kelihatannya','kelima','keluar','kembali','kemudian','kemungkinan', - 'kemungkinannya','kena','kenapa','kepada','kepadanya','kerja','kesampaian','keseluruhan', - 'keseluruhannya','keterlaluan','ketika','khusus','khususnya','kini','kinilah','kira', - 'kira-kira','kiranya','kita','kitalah','kok','kurang','l','lagi','lagian','lah','lain', - 'lainnya','laku','lalu','lama','lamanya','langsung','lanjut','lanjutnya','lebih','lewat', - 'lihat','lima','luar','m','macam','maka','makanya','makin','maksud','malah','malahan', - 'mampu','mampukah','mana','manakala','manalagi','masa','masalah','masalahnya','masih', - 'masihkah','masing','masing-masing','masuk','mata','mau','maupun','melainkan','melakukan', - 'melalui','melihat','melihatnya','memang','memastikan','memberi','memberikan','membuat', - 
'memerlukan','memihak','meminta','memintakan','memisalkan','memperbuat','mempergunakan', - 'memperkirakan','memperlihatkan','mempersiapkan','mempersoalkan','mempertanyakan','mempunyai', - 'memulai','memungkinkan','menaiki','menambahkan','menandaskan','menanti','menanti-nanti', - 'menantikan','menanya','menanyai','menanyakan','mendapat','mendapatkan','mendatang','mendatangi', - 'mendatangkan','menegaskan','mengakhiri','mengapa','mengatakan','mengatakannya','mengenai', - 'mengerjakan','mengetahui','menggunakan','menghendaki','mengibaratkan','mengibaratkannya', - 'mengingat','mengingatkan','menginginkan','mengira','mengucapkan','mengucapkannya','mengungkapkan', - 'menjadi','menjawab','menjelaskan','menuju','menunjuk','menunjuki','menunjukkan','menunjuknya', - 'menurut','menuturkan','menyampaikan','menyangkut','menyatakan','menyebutkan','menyeluruh', - 'menyiapkan','merasa','mereka','merekalah','merupakan','meski','meskipun','meyakini','meyakinkan', - 'minta','mirip','misal','misalkan','misalnya','mohon','mula','mulai','mulailah','mulanya','mungkin', - 'mungkinkah','n','nah','naik','namun','nanti','nantinya','nya','nyaris','nyata','nyatanya', - 'o','oleh','olehnya','orang','p','pada','padahal','padanya','pak','paling','panjang','pantas', - 'para','pasti','pastilah','penting','pentingnya','per','percuma','perlu','perlukah','perlunya', - 'pernah','persoalan','pertama','pertama-tama','pertanyaan','pertanyakan','pihak','pihaknya', - 'pukul','pula','pun','punya','q','r','rasa','rasanya','rupa','rupanya','s','saat','saatnya','saja', - 'sajalah','salam','saling','sama','sama-sama','sambil','sampai','sampai-sampai','sampaikan','sana', - 'sangat','sangatlah','sangkut','satu','saya','sayalah','se','sebab','sebabnya','sebagai', - 'sebagaimana','sebagainya','sebagian','sebaik','sebaik-baiknya','sebaiknya','sebaliknya', - 'sebanyak','sebegini','sebegitu','sebelum','sebelumnya','sebenarnya','seberapa','sebesar', - 
'sebetulnya','sebisanya','sebuah','sebut','sebutlah','sebutnya','secara','secukupnya','sedang', - 'sedangkan','sedemikian','sedikit','sedikitnya','seenaknya','segala','segalanya','segera', - 'seharusnya','sehingga','seingat','sejak','sejauh','sejenak','sejumlah','sekadar','sekadarnya', - 'sekali','sekali-kali','sekalian','sekaligus','sekalipun','sekarang','sekaranglah','sekecil', - 'seketika','sekiranya','sekitar','sekitarnya','sekurang-kurangnya','sekurangnya','sela','selain', - 'selaku','selalu','selama','selama-lamanya','selamanya','selanjutnya','seluruh','seluruhnya', - 'semacam','semakin','semampu','semampunya','semasa','semasih','semata','semata-mata','semaunya', - 'sementara','semisal','semisalnya','sempat','semua','semuanya','semula','sendiri','sendirian', - 'sendirinya','seolah','seolah-olah','seorang','sepanjang','sepantasnya','sepantasnyalah', - 'seperlunya','seperti','sepertinya','sepihak','sering','seringnya','serta','serupa','sesaat', - 'sesama','sesampai','sesegera','sesekali','seseorang','sesuatu','sesuatunya','sesudah', - 'sesudahnya','setelah','setempat','setengah','seterusnya','setiap','setiba','setibanya', - 'setidak-tidaknya','setidaknya','setinggi','seusai','sewaktu','siap','siapa','siapakah', - 'siapapun','sini','sinilah','soal','soalnya','suatu','sudah','sudahkah','sudahlah','supaya', - 't','tadi','tadinya','tahu','tak','tambah','tambahnya','tampak','tampaknya','tandas','tandasnya', - 'tanpa','tanya','tanyakan','tanyanya','tapi','tegas','tegasnya','telah','tempat','tentang','tentu', - 'tentulah','tentunya','tepat','terakhir','terasa','terbanyak','terdahulu','terdapat','terdiri', - 'terhadap','terhadapnya','teringat','teringat-ingat','terjadi','terjadilah','terjadinya','terkira', - 'terlalu','terlebih','terlihat','termasuk','ternyata','tersampaikan','tersebut','tersebutlah', - 'tertentu','tertuju','terus','terutama','tetap','tetapi','tiap','tiba','tiba-tiba','tidak', - 
'tidakkah','tidaklah','tiga','toh','tuju','tunjuk','turut','tutur','tuturnya','u','ucap','ucapnya', - 'ujar','ujarnya','umumnya','ungkap','ungkapnya','untuk','usah','usai','v','w','waduh','wah','wahai', - 'waktunya','walau','walaupun','wong','x','y','ya','yaitu','yakin','yakni','yang','z'] - - - + @functools.lru_cache(maxsize=32) + def get_prod_stop_word_dictionary(self): + stopWords = self.get_stop_words() + return ArrayDictionary(stopWords) + def get_stop_words(self): + current_dir = os.path.dirname(os.path.realpath(__file__)) + dictionaryFile = current_dir + '/data/stopword_tala_2003.txt' + + if not os.path.isfile(dictionaryFile): + raise RuntimeError('Stopword file is missing. It seems that your installation is corrupted.') + + content = {} + with open(dictionaryFile, 'r') as f: + word = f.read() + content.words[word] = word + + return content \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt b/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt new file mode 100644 index 0000000..bf88a45 --- /dev/null +++ b/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt @@ -0,0 +1,758 @@ +ada +adalah +adanya +adapun +agak +agaknya +agar +akan +akankah +akhir +akhiri +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +antara +antaranya +apa +apaan +apabila +apakah +apalagi +apatah +artinya +asal +asalkan +atas +atau +ataukah +ataupun +awal +awalnya +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagi +bagian +bahkan +bahwa +bahwasanya +baik +bakal +bakalan +balik +banyak +bapak +baru +bawah +beberapa +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +bekerja +belakang +belakangan +belum +belumlah +benar +benarkah +benarlah +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti +berawal +berbagai +berdatangan +beri +berikan +berikut +berikutnya +berjumlah +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan 
+berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermaksud +bermula +bersama +bersama-sama +bersiap +bersiap-siap +bertanya +bertanya-tanya +berturut +berturut-turut +bertutur +berujar +berupa +besar +betul +betulkah +biasa +biasanya +bila +bilakah +bisa +bisakah +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +bulan +bung +cara +caranya +cukup +cukupkah +cukuplah +cuma +dahulu +dalam +dan +dapat +dari +daripada +datang +dekat +demi +demikian +demikianlah +dengan +depan +di +dia +diakhiri +diakhirinya +dialah +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +didapat +didatangkan +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +dini +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +diri +dirinya +disampaikan +disebut +disebutkan +disebutkannya +disini +disinilah +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya +ditunjuknya +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +dong +dua +dulu +empat +enggak +enggaknya +entah +entahlah +guna +gunakan +hal +hampir +hanya +hanyalah +hari +harus +haruslah +harusnya +hendak +hendaklah +hendaknya +hingga +ia +ialah +ibarat +ibaratkan +ibaratnya +ibu +ikut +ingat +ingat-ingat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jadi +jadilah +jadinya +jangan +jangankan +janganlah +jauh +jawab +jawaban +jawabnya +jelas +jelaskan +jelaslah +jelasnya +jika +jikalau +juga +jumlah +jumlahnya +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah 
+kan +kapan +kapankah +kapanpun +karena +karenanya +kasus +kata +katakan +katakanlah +katanya +ke +keadaan +kebetulan +kecil +kedua +keduanya +keinginan +kelamaan +kelihatan +kelihatannya +kelima +keluar +kembali +kemudian +kemungkinan +kemungkinannya +kenapa +kepada +kepadanya +kesampaian +keseluruhan +keseluruhannya +keterlaluan +ketika +khususnya +kini +kinilah +kira +kira-kira +kiranya +kita +kitalah +kok +kurang +lagi +lagian +lah +lain +lainnya +lalu +lama +lamanya +lanjut +lanjutnya +lebih +lewat +lima +luar +macam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +mau +maupun +melainkan +melakukan +melalui +melihat +melihatnya +memang +memastikan +memberi +memberikan +membuat +memerlukan +memihak +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +menaiki +menambahkan +menandaskan +menanti +menanti-nanti +menantikan +menanya +menanyai +menanyakan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +mengakhiri +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggunakan +menghendaki +mengibaratkan +mengibaratkannya +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengungkapkan +menjadi +menjawab +menjelaskan +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyampaikan +menyangkut +menyatakan +menyebutkan +menyeluruh +menyiapkan +merasa +mereka +merekalah +merupakan +meski +meskipun +meyakini +meyakinkan +minta +mirip +misal +misalkan +misalnya +mula +mulai +mulailah +mulanya +mungkin +mungkinkah +nah +naik +namun +nanti +nantinya +nyaris +nyatanya +oleh +olehnya +pada +padahal +padanya +pak +paling +panjang +pantas +para +pasti +pastilah +penting +pentingnya +per +percuma +perlu +perlukah +perlunya +pernah +persoalan +pertama +pertama-tama 
+pertanyaan +pertanyakan +pihak +pihaknya +pukul +pula +pun +punya +rasa +rasanya +rata +rupanya +saat +saatnya +saja +sajalah +saling +sama +sama-sama +sambil +sampai +sampai-sampai +sampaikan +sana +sangat +sangatlah +satu +saya +sayalah +se +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebaik +sebaik-baiknya +sebaiknya +sebaliknya +sebanyak +sebegini +sebegitu +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +secukupnya +sedang +sedangkan +sedemikian +sedikit +sedikitnya +seenaknya +segala +segalanya +segera +seharusnya +sehingga +seingat +sejak +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekarang +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekurang-kurangnya +sekurangnya +sela +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +seluruh +seluruhnya +semacam +semakin +semampu +semampunya +semasa +semasih +semata +semata-mata +semaunya +sementara +semisal +semisalnya +sempat +semua +semuanya +semula +sendiri +sendirian +sendirinya +seolah +seolah-olah +seorang +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sering +seringnya +serta +serupa +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiba +setibanya +setidak-tidaknya +setidaknya +setinggi +seusai +sewaktu +siap +siapa +siapakah +siapapun +sini +sinilah +soal +soalnya +suatu +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tahu +tahun +tak +tambah +tambahnya +tampak +tampaknya +tandas +tandasnya +tanpa +tanya +tanyakan +tanyanya +tapi +tegas +tegasnya +telah +tempat +tengah +tentang +tentu +tentulah +tentunya +tepat +terakhir +terasa +terbanyak +terdahulu +terdapat +terdiri +terhadap +terhadapnya +teringat +teringat-ingat +terjadi +terjadilah +terjadinya +terkira +terlalu +terlebih +terlihat 
+termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama +tetap +tetapi +tiap +tiba +tiba-tiba +tidak +tidakkah +tidaklah +tiga +tinggi +toh +tunjuk +turut +tutur +tuturnya +ucap +ucapnya +ujar +ujarnya +umum +umumnya +ungkap +ungkapnya +untuk +usah +usai +waduh +wah +wahai +waktu +waktunya +walau +walaupun +wong +yaitu +yakin +yakni +yang \ No newline at end of file From 9890fcf531088d070600a82f25a42690e29cf516 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 01:48:49 +0700 Subject: [PATCH 26/45] Test Stopword Tala --- src/Sastrawi/Stemmer/CachedStemmer.py | 2 +- .../StopWordRemover/stop_word_remover_factory_test.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 97258bc..0356777 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -17,7 +17,7 @@ def stem(self, text): if self.cache.has(word): stems.append(self.cache.get(word)) else: - stem = self.delegatedStemmer.stem(word) + stem = self.delegatedStemmer.stem_word(word) self.cache.set(word, stem) stems.append(stem) diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 5f758f8..0b4163e 100644 --- a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -9,6 +9,9 @@ def setUp(self): def test_createStopWordRemover(self): self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover) + sremover = self.factory.create_stop_word_remover() + self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) + self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) if __name__ == '__main__': unittest.main() From 7a55cbf575552014ebbfd8a139f90fab91e94eb9 Mon Sep 17 00:00:00 2001 From: Mufid 
Jamaluddin Date: Fri, 15 Mar 2019 08:50:10 +0700 Subject: [PATCH 27/45] Boost Performance --- src/Sastrawi/Stemmer/StemmerFactory.py | 2 +- .../UnitTests/Stemmer/stemmer_factory_test.py | 19 +++++++++++++++++++ .../stop_word_remover_factory_test.py | 14 ++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 1e7a583..ace9a63 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -24,7 +24,7 @@ def create_stemmer(self, isDev=False): return cachedStemmer - @functools.lru_cache(maxsize=32) + @functools.lru_cache(maxsize=640) def get_prod_words_dictionary(self): words = self.get_words_from_file() dictionary = ArrayDictionary(words) diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 146df28..f4f12cf 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -1,4 +1,5 @@ import unittest +import time from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.Stemmer.Stemmer import Stemmer @@ -23,6 +24,24 @@ def test_fungsional(self): if output != expected: raise AssertionError(str.format('output is {} instead of {}', output, expected)) + def test_execution_time(self): + start = time.time() + sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' 
+ + factory = StemmerFactory() + stemmer = factory.create_stemmer() + + stemmer.stem(sentence) + + end = time.time() + + execution_time = end - start + + # print(execution_time) + + # test execution time < 3 seconds + self.assertTrue(execution_time < 3) + def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 0b4163e..0ef6695 100644 --- a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -1,4 +1,5 @@ import unittest +import time from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover @@ -9,9 +10,22 @@ def setUp(self): def test_createStopWordRemover(self): self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover) + + def test_stopwordRemoval(self): sremover = self.factory.create_stop_word_remover() self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) + def test_execution_time(self): + start = time.time() + sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' 
+ sremover = self.factory.create_stop_word_remover() + sremover.remove(sentence) + end = time.time() + # print(execution_time) + execution_time = end - start + + self.assertTrue(execution_time < 1) + if __name__ == '__main__': unittest.main() From 5630ad6ce40b831fdcbdfcf64d27037f595a0765 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 09:06:31 +0700 Subject: [PATCH 28/45] add stem word --- src/Sastrawi/Stemmer/CachedStemmer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 0356777..92edfe0 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -22,6 +22,14 @@ def stem(self, text): stems.append(stem) return ' '.join(stems) - + + def stem_word(self, word): + if self.cache.has(word): + return self.cache.get(word) + else: + stem = self.delegatedStemmer.stem_word(word) + self.cache.set(word, stem) + return stem + def get_cache(self): return self.cache From 1d9554f289519e9cc7d6283bbcd1eb77a949268e Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 09:17:43 +0700 Subject: [PATCH 29/45] add stem & stopword removal from tokens/word list --- src/Sastrawi/Stemmer/CachedStemmer.py | 10 +++++++++- src/Sastrawi/StopWordRemover/StopWordRemover.py | 8 +++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 92edfe0..ad4e1ef 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -7,7 +7,7 @@ def __init__(self, cache, delegatedStemmer): self.cache = cache self.delegatedStemmer = delegatedStemmer - def stem(self, text): + def stem(self, text: str): normalizedText = TextNormalizer.normalize_text(text) words = normalizedText.split(' ') @@ -31,5 +31,13 @@ def stem_word(self, word): self.cache.set(word, stem) return stem + # Stemming word in Tokens + # 
@author Mufid Jamaluddin + def stem_tokens(self, tokens: list): + stemmed_tokens = [] + for token in tokens: + stemmed_tokens.append(self.stem_word(token)) + return stemmed_tokens + def get_cache(self): return self.cache diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index d3de2ff..cb462cf 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -14,6 +14,8 @@ def remove(self, text): return ' '.join(stopped_words) - - - + # Remove Stopword in Tokens + # @author Mufid Jamaluddin + def remove_tokens(self, tokens:list): + clean_tokens = [token for token in tokens if not self.dictionary.contains(token)] + return clean_tokens \ No newline at end of file From 81b06a4730b1fb34205ed5f8c6432c67cf92bb87 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 09:23:46 +0700 Subject: [PATCH 30/45] add python 3.7 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 65ec28e..4d259df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.7" sudo: false install: - pip install python-coveralls From 150a839ba3ac98550e0c850119edc02b3fa6d7bc Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 22:55:58 +0700 Subject: [PATCH 31/45] Minor --- .vscode/settings.json | 3 +++ src/Sastrawi/Dictionary/ArrayDictionary.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..500bc70 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.pylintEnabled": true +} \ No newline at end of file diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 5c864f0..8f94982 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ 
b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -3,7 +3,7 @@ class ArrayDictionary(object): def __init__(self, words=None): - if words is is dict: + if words is dict: self.words = words if words is list: From 99bfac54e96ec6b5e917a1094f7bc71da79dc18f Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 23:40:04 +0700 Subject: [PATCH 32/45] Fix Error --- src/Sastrawi/Dictionary/ArrayDictionary.py | 10 ++++---- src/Sastrawi/Stemmer/CachedStemmer.py | 2 +- src/Sastrawi/Stemmer/StemmerFactory.py | 24 +++++++++---------- .../StopWordRemover/StopWordRemover.py | 2 +- .../StopWordRemover/StopWordRemoverFactory.py | 16 ++++++------- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 8f94982..3438455 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -2,12 +2,10 @@ class ArrayDictionary(object): """description of class""" def __init__(self, words=None): - - if words is dict: - self.words = words - - if words is list: - self.words = {} + if words is dict: + self.words = words + elif words is list: + self.words = {} self.add_words(words) def contains(self, word): diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index ad4e1ef..854a8c3 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -33,7 +33,7 @@ def stem_word(self, word): # Stemming word in Tokens # @author Mufid Jamaluddin - def stem_tokens(self, tokens: list): + def stem_tokens(self, tokens): stemmed_tokens = [] for token in tokens: stemmed_tokens.append(self.stem_word(token)) diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index ace9a63..3578cac 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -12,10 +12,10 @@ class StemmerFactory(object): def 
create_stemmer(self, isDev=False): """ Returns Stemmer instance """ if isDev: - words = self.get_words_from_file() - dictionary = ArrayDictionary(words) + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) else: - dictionary = self.get_prod_words_dictionary() + dictionary = self.get_prod_words_dictionary() stemmer = Stemmer(dictionary) @@ -24,11 +24,11 @@ def create_stemmer(self, isDev=False): return cachedStemmer - @functools.lru_cache(maxsize=640) - def get_prod_words_dictionary(self): - words = self.get_words_from_file() - dictionary = ArrayDictionary(words) - return dictionary + @functools.lru_cache(maxsize=640) + def get_prod_words_dictionary(self): + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + return dictionary #def get_words(self, isDev=False): #if isDev or callable(getattr(self, 'apc_fetch')): @@ -47,9 +47,7 @@ def get_words_from_file(self): if not os.path.isfile(dictionaryFile): raise RuntimeError('Dictionary file is missing. 
It seems that your installation is corrupted.') - content = {} + text = '' with open(dictionaryFile, 'r') as f: - word = f.read() - content.words[word] = word - - return content \ No newline at end of file + text = f.read() + return text.split('\n') \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index cb462cf..a5bbd3e 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -16,6 +16,6 @@ def remove(self, text): # Remove Stopword in Tokens # @author Mufid Jamaluddin - def remove_tokens(self, tokens:list): + def remove_tokens(self, tokens): clean_tokens = [token for token in tokens if not self.dictionary.contains(token)] return clean_tokens \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 2c206e6..010175d 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -16,10 +16,10 @@ def create_stop_word_remover(self, isDev=False): stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - @functools.lru_cache(maxsize=32) - def get_prod_stop_word_dictionary(self): - stopWords = self.get_stop_words() - return ArrayDictionary(stopWords) + @functools.lru_cache(maxsize=32) + def get_prod_stop_word_dictionary(self): + stopWords = self.get_stop_words() + return ArrayDictionary(stopWords) def get_stop_words(self): current_dir = os.path.dirname(os.path.realpath(__file__)) @@ -28,9 +28,7 @@ def get_stop_words(self): if not os.path.isfile(dictionaryFile): raise RuntimeError('Stopword file is missing. 
It seems that your installation is corrupted.') - content = {} + text = '' with open(dictionaryFile, 'r') as f: - word = f.read() - content.words[word] = word - - return content \ No newline at end of file + text = f.read() + return text.split('\n') \ No newline at end of file From edf2c818a4bec08aaba2543784174d8e55c134ee Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 23:51:52 +0700 Subject: [PATCH 33/45] fix error python 2.7 --- src/Sastrawi/Dictionary/ArrayDictionary.py | 9 ++------- src/Sastrawi/Stemmer/CachedStemmer.py | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 3438455..1583678 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,11 +1,11 @@ class ArrayDictionary(object): """description of class""" + words = {} def __init__(self, words=None): if words is dict: self.words = words elif words is list: - self.words = {} self.add_words(words) def contains(self, word): @@ -22,9 +22,4 @@ def add(self, word): """Add a word to the dictionary""" if not word or word.strip() == '': return - self.words[word]=word - - - - - + self.words[word]=word \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 854a8c3..8979114 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -7,7 +7,7 @@ def __init__(self, cache, delegatedStemmer): self.cache = cache self.delegatedStemmer = delegatedStemmer - def stem(self, text: str): + def stem(self, text): normalizedText = TextNormalizer.normalize_text(text) words = normalizedText.split(' ') From a47d9b202d3ef93bd73dbf0828479d102c7c0e0b Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 00:27:36 +0700 Subject: [PATCH 34/45] LruCache python 2.7 --- .travis.yml | 1 + src/Sastrawi/Stemmer/StemmerFactory.py | 4 ++-- 
src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4d259df..e00cd16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ sudo: false install: - pip install python-coveralls - pip install coveralls + - pip install cachetools script: nosetests tests --verbose --with-coverage after_success: - coveralls diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 3578cac..1b861be 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -1,5 +1,5 @@ import os -import functools +from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer from Sastrawi.Stemmer.CachedStemmer import CachedStemmer @@ -24,7 +24,7 @@ def create_stemmer(self, isDev=False): return cachedStemmer - @functools.lru_cache(maxsize=640) + @cached(cache=LRUCache(maxsize=32)) def get_prod_words_dictionary(self): words = self.get_words_from_file() dictionary = ArrayDictionary(words) diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 010175d..668ed94 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -1,5 +1,5 @@ import os -import functools +from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover @@ -16,7 +16,7 @@ def create_stop_word_remover(self, isDev=False): stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - @functools.lru_cache(maxsize=32) + @cached(cache=LRUCache(maxsize=8)) def get_prod_stop_word_dictionary(self): stopWords = self.get_stop_words() return ArrayDictionary(stopWords) From 58d35a7f6570cdb4656186aee114fab53ef3ad30 Mon 
Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 01:27:26 +0700 Subject: [PATCH 35/45] minor --- tests/FunctionalTests/Stemmer/stemmer_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/FunctionalTests/Stemmer/stemmer_test.py b/tests/FunctionalTests/Stemmer/stemmer_test.py index c1f56ed..8609fb7 100644 --- a/tests/FunctionalTests/Stemmer/stemmer_test.py +++ b/tests/FunctionalTests/Stemmer/stemmer_test.py @@ -328,8 +328,8 @@ def get_test_data(self): data.append(['menahan', 'tahan']) # test stem multiple sentences - multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.'; - multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai."; + multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.' + multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai." data.append([multipleSentence1, 'cinta telah tebar dua saling cinta']) data.append([multipleSentence2, 'cinta telah tebar dua saling cinta']) From 345edd1fd1f10c57c755349ac646b165868b3bcb Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 02:41:10 +0700 Subject: [PATCH 36/45] Fix critical bugs --- src/Sastrawi/Dictionary/ArrayDictionary.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 1583678..a476e52 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,12 +1,15 @@ class ArrayDictionary(object): """description of class""" - words = {} def __init__(self, words=None): - if words is dict: + if words is None: + self.words = {} + elif type(words) is dict: self.words = words - elif words is list: + elif type(words) is list: self.add_words(words) + else: + self.words = {} def contains(self, word): return word in self.words @@ -22,4 +25,4 @@ def add(self, word): """Add a word to the dictionary""" if not 
word or word.strip() == '': return - self.words[word]=word \ No newline at end of file + self.words[word] = word \ No newline at end of file From 1a5f7d66fddd3090b73cc30eb43e13ec527cc860 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 03:02:56 +0700 Subject: [PATCH 37/45] Travis for Python 3.7 --- .travis.yml | 9 +++++++-- src/Sastrawi/Stemmer/CachedStemmer.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e00cd16..87a5c32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,13 @@ python: - "3.3" - "3.4" - "3.5" - - "3.7" sudo: false +# Enable 3.7 without globally enabling sudo and dist: xenial for other build jobs +matrix: + include: + - python: 3.7 + dist: xenial + sudo: true install: - pip install python-coveralls - pip install coveralls @@ -14,4 +19,4 @@ script: nosetests tests --verbose --with-coverage after_success: - coveralls notifications: - email: false + email: false \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 8979114..e052077 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -36,6 +36,8 @@ def stem_word(self, word): def stem_tokens(self, tokens): stemmed_tokens = [] for token in tokens: + if not token or token.strip() == '': + continue stemmed_tokens.append(self.stem_word(token)) return stemmed_tokens From ae3bc914979fb5eeada827dc8bf9cf8913779551 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 04:13:44 +0700 Subject: [PATCH 38/45] add test case --- .../Dictionary/array_dictionary_test.py | 7 +++++++ .../UnitTests/Stemmer/stemmer_factory_test.py | 19 +++++++++++++++++++ .../stop_word_remover_factory_test.py | 11 +++++++++++ 3 files changed, 37 insertions(+) diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 57d72fc..8eb63f8 100644 --- 
a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -36,5 +36,12 @@ def test_constructor_preserve_words(self): self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) + def test_dict_param(self): + dictionary = ArrayDictionary({'word1':'word1', 'word2':'word2'}) + self.assertTrue(dictionary.contains('word1')) + self.assertTrue(dictionary.contains('word2')) + self.assertFalse(dictionary.contains('word3')) + self.assertEqual(2, dictionary.count()) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index f4f12cf..c3eb4ff 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -46,5 +46,24 @@ def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() + def test_word_stemming(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer() + self.assertEqual('besar', stemmer.stem('terbesar')) + self.assertEqual('abai', stemmer.stem('diabaikan')) + + def test_tokens_stemming(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer() + tokens = ['perekonomian', 'indonesia', 'sedang', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] + clean_tokens = stemmer.stem_tokens(tokens) + self.assertEqual('ekonomi', clean_tokens[0]) + self.assertEqual('indonesia', clean_tokens[1]) + self.assertEqual('sedang', clean_tokens[2]) + self.assertEqual('dalam', clean_tokens[3]) + self.assertEqual('tumbuh', clean_tokens[4]) + self.assertEqual('yang', clean_tokens[5]) + self.assertEqual('bangga', clean_tokens[6]) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 0ef6695..8c82336 100644 --- 
a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -15,6 +15,17 @@ def test_stopwordRemoval(self): sremover = self.factory.create_stop_word_remover() self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) + + def test_tokens_stopwordRemoval(self): + tokens = ['pergi', 'ke', 'sekolah', 'yang', 'bagus', 'adalah', 'impian'] + sremover = self.factory.create_stop_word_remover() + clean_tokens = sremover.remove_tokens(tokens) + text = ' '.join(clean_tokens) + self.assertEquals('pergi sekolah bagus impian', text) + self.assertEqual('pergi', clean_tokens[0]) + self.assertEqual('sekolah', clean_tokens[1]) + self.assertEqual('bagus', clean_tokens[2]) + self.assertEqual('impian', clean_tokens[3]) def test_execution_time(self): start = time.time() From 9fc1b3eb9dee4f135064c934388fda5ecf1c0fc1 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 04:45:15 +0700 Subject: [PATCH 39/45] Add Test Case --- tests/UnitTests/Dictionary/array_dictionary_test.py | 4 ++++ tests/UnitTests/Stemmer/stemmer_factory_test.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 8eb63f8..2d736ed 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -43,5 +43,9 @@ def test_dict_param(self): self.assertFalse(dictionary.contains('word3')) self.assertEqual(2, dictionary.count()) + def test_non_dict_list(self): + dictionary = ArrayDictionary('$$%&**&(^&') + self.assertTrue(0, dictionary.count()) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index c3eb4ff..3ae5394 100644 --- 
a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -55,7 +55,7 @@ def test_word_stemming(self): def test_tokens_stemming(self): factory = StemmerFactory() stemmer = factory.create_stemmer() - tokens = ['perekonomian', 'indonesia', 'sedang', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] + tokens = ['perekonomian', '', 'indonesia', 'sedang', ' ', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] clean_tokens = stemmer.stem_tokens(tokens) self.assertEqual('ekonomi', clean_tokens[0]) self.assertEqual('indonesia', clean_tokens[1]) From 15fe5d620b8c3731e8b307d05549eecd2ddc8dc9 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 05:23:34 +0700 Subject: [PATCH 40/45] Test Case --- .../Dictionary/array_dictionary_test.py | 7 ++++++- .../UnitTests/Stemmer/stemmer_factory_test.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 2d736ed..049ba94 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -36,16 +36,21 @@ def test_constructor_preserve_words(self): self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) + # Test ArrayDictionary dengan tipe data dict + # @author Mufid Jamaluddin def test_dict_param(self): dictionary = ArrayDictionary({'word1':'word1', 'word2':'word2'}) self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) self.assertFalse(dictionary.contains('word3')) self.assertEqual(2, dictionary.count()) + dictionary.add_words('word3') + self.assertTrue(dictionary.contains('word3')) + self.assertEqual(3, dictionary.count()) def test_non_dict_list(self): dictionary = ArrayDictionary('$$%&**&(^&') - self.assertTrue(0, dictionary.count()) + self.assertEqual(0, dictionary.count()) if __name__ == 
'__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 3ae5394..045b17d 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -24,34 +24,43 @@ def test_fungsional(self): if output != expected: raise AssertionError(str.format('output is {} instead of {}', output, expected)) + # Test Waktu Stemming < 3 detik + # @author Mufid Jamaluddin def test_execution_time(self): start = time.time() sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' factory = StemmerFactory() stemmer = factory.create_stemmer() - stemmer.stem(sentence) end = time.time() execution_time = end - start - - # print(execution_time) - - # test execution time < 3 seconds self.assertTrue(execution_time < 3) def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() + # Test Stemming per Kata + # @author Mufid Jamaluddin def test_word_stemming(self): factory = StemmerFactory() stemmer = factory.create_stemmer() self.assertEqual('besar', stemmer.stem('terbesar')) self.assertEqual('abai', stemmer.stem('diabaikan')) + # Test Stemming dengan isDev=True (No Cache) + # @author Mufid Jamaluddin + def test_word_stemmingdev(self): + factory = StemmerFactory(isDev=True) + stemmer = factory.create_stemmer() + self.assertEqual('besar', stemmer.stem('terbesar')) + self.assertEqual('abai', stemmer.stem('diabaikan')) + + # Test Stemming dengan list tokens + # @author Mufid Jamaluddin def test_tokens_stemming(self): factory = StemmerFactory() stemmer = factory.create_stemmer() From 3e4151a63d7ac781c2d31fb5e7ac47f4b65bba45 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 05:50:04 +0700 Subject: [PATCH 41/45] Define Abstract Method & Update Test Case --- 
src/Sastrawi/Dictionary/ArrayDictionary.py | 4 +++- .../Dictionary/DictionaryInterface.py | 8 +++++-- src/Sastrawi/Stemmer/Cache/CacheInterface.py | 14 ++++++++----- .../Stemmer/Context/ContextInterface.py | 21 ++++++++++++------- .../Stemmer/Context/RemovalInterface.py | 10 ++++++++- .../Dictionary/array_dictionary_test.py | 3 ++- 6 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index a476e52..814cf6b 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,4 +1,6 @@ -class ArrayDictionary(object): +from Sastrawi.Dictionary.DictionaryInterface import DictionaryInterface + +class ArrayDictionary(DictionaryInterface): """description of class""" def __init__(self, words=None): diff --git a/src/Sastrawi/Dictionary/DictionaryInterface.py b/src/Sastrawi/Dictionary/DictionaryInterface.py index f899ae4..79bdf05 100644 --- a/src/Sastrawi/Dictionary/DictionaryInterface.py +++ b/src/Sastrawi/Dictionary/DictionaryInterface.py @@ -1,5 +1,9 @@ -class DictionaryInterface(object): +from abc import ABCMeta, abstractmethod + +class DictionaryInterface: """Interface definition of dictionary""" + __metaclass__ = ABCMeta + @abstractmethod def contains(self, word): - raise NotImplementedError('you must implement this method manually') \ No newline at end of file + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py index cbed596..8dde964 100644 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ b/src/Sastrawi/Stemmer/Cache/CacheInterface.py @@ -1,13 +1,17 @@ -class CacheInterface(object): - """description of class""" +from abc import ABCMeta, abstractmethod +class CacheInterface: + """description of abs class""" + __metaclass__ = ABCMeta + + @abstractmethod def has(self, key): pass + @abstractmethod def set(self, key, value): pass + 
@abstractmethod def get(self, key): - pass - - + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Context/ContextInterface.py b/src/Sastrawi/Stemmer/Context/ContextInterface.py index 180c6d0..747dfd2 100644 --- a/src/Sastrawi/Stemmer/Context/ContextInterface.py +++ b/src/Sastrawi/Stemmer/Context/ContextInterface.py @@ -1,30 +1,37 @@ -class ContextInterface(object): - """description of class""" +from abc import ABCMeta, abstractmethod +class ContextInterface: + """description of abs class""" + __metaclass__ = ABCMeta + + @abstractmethod def getOriginalWord(self): pass + @abstractmethod def setCurrentWord(self, word): pass + @abstractmethod def getCurrentWord(self): pass + @abstractmethod def getDictionary(self): pass + @abstractmethod def stopProcess(self): pass + @abstractmethod def processIsStopped(self): pass + @abstractmethod def addRemoval(self, removal): pass + @abstractmethod def getRemovals(self): - pass - - - - + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Context/RemovalInterface.py b/src/Sastrawi/Stemmer/Context/RemovalInterface.py index 93b6171..7284597 100644 --- a/src/Sastrawi/Stemmer/Context/RemovalInterface.py +++ b/src/Sastrawi/Stemmer/Context/RemovalInterface.py @@ -1,18 +1,26 @@ -class RemovalInterface(object): +from abc import ABCMeta, abstractmethod + +class RemovalInterface: """description of class""" + __metaclass__ = ABCMeta + @abstractmethod def get_visitor(self): pass + @abstractmethod def get_subject(self): pass + @abstractmethod def get_result(self): pass + @abstractmethod def get_removed_part(self): pass + @abstractmethod def get_affix_type(self): pass diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 049ba94..42c5580 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -44,7 +44,8 @@ def test_dict_param(self): 
self.assertTrue(dictionary.contains('word2')) self.assertFalse(dictionary.contains('word3')) self.assertEqual(2, dictionary.count()) - dictionary.add_words('word3') + dictionary.add('word3') + dictionary.add(' ') self.assertTrue(dictionary.contains('word3')) self.assertEqual(3, dictionary.count()) From 6d9fd872083038499b95897e04da254d03210ae1 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 06:01:24 +0700 Subject: [PATCH 42/45] Minor --- src/Sastrawi/Dictionary/DictionaryInterface.py | 3 +++ src/Sastrawi/Stemmer/Cache/CacheInterface.py | 3 +++ src/Sastrawi/Stemmer/Context/ContextInterface.py | 3 +++ src/Sastrawi/Stemmer/Context/RemovalInterface.py | 3 +++ tests/UnitTests/Stemmer/stemmer_factory_test.py | 4 ++-- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Sastrawi/Dictionary/DictionaryInterface.py b/src/Sastrawi/Dictionary/DictionaryInterface.py index 79bdf05..46433b6 100644 --- a/src/Sastrawi/Dictionary/DictionaryInterface.py +++ b/src/Sastrawi/Dictionary/DictionaryInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class DictionaryInterface: diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py index 8dde964..e869c84 100644 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ b/src/Sastrawi/Stemmer/Cache/CacheInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class CacheInterface: diff --git a/src/Sastrawi/Stemmer/Context/ContextInterface.py b/src/Sastrawi/Stemmer/Context/ContextInterface.py index 747dfd2..5a3b7be 100644 --- a/src/Sastrawi/Stemmer/Context/ContextInterface.py +++ b/src/Sastrawi/Stemmer/Context/ContextInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class ContextInterface: diff --git 
a/src/Sastrawi/Stemmer/Context/RemovalInterface.py b/src/Sastrawi/Stemmer/Context/RemovalInterface.py index 7284597..a94a18f 100644 --- a/src/Sastrawi/Stemmer/Context/RemovalInterface.py +++ b/src/Sastrawi/Stemmer/Context/RemovalInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class RemovalInterface: diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 045b17d..9df96b7 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -54,8 +54,8 @@ def test_word_stemming(self): # Test Stemming dengan isDev=True (No Cache) # @author Mufid Jamaluddin def test_word_stemmingdev(self): - factory = StemmerFactory(isDev=True) - stemmer = factory.create_stemmer() + factory = StemmerFactory() + stemmer = factory.create_stemmer(isDev=True) self.assertEqual('besar', stemmer.stem('terbesar')) self.assertEqual('abai', stemmer.stem('diabaikan')) From 8bfc448b1bb2d3b119e60e4bcf9028f80c58e45b Mon Sep 17 00:00:00 2001 From: mufidjamaluddin Date: Fri, 19 Apr 2019 13:19:32 +0700 Subject: [PATCH 43/45] LruCache --- src/Sastrawi/Stemmer/Cache/ArrayCache.py | 19 --------- src/Sastrawi/Stemmer/Cache/CacheInterface.py | 20 --------- src/Sastrawi/Stemmer/Cache/__init__.py | 0 src/Sastrawi/Stemmer/CachedStemmer.py | 45 -------------------- src/Sastrawi/Stemmer/Stemmer.py | 12 ++++++ src/Sastrawi/Stemmer/StemmerFactory.py | 18 +------- 6 files changed, 13 insertions(+), 101 deletions(-) delete mode 100644 src/Sastrawi/Stemmer/Cache/ArrayCache.py delete mode 100644 src/Sastrawi/Stemmer/Cache/CacheInterface.py delete mode 100644 src/Sastrawi/Stemmer/Cache/__init__.py delete mode 100644 src/Sastrawi/Stemmer/CachedStemmer.py diff --git a/src/Sastrawi/Stemmer/Cache/ArrayCache.py b/src/Sastrawi/Stemmer/Cache/ArrayCache.py deleted file mode 100644 index 35ff2b3..0000000 --- 
a/src/Sastrawi/Stemmer/Cache/ArrayCache.py +++ /dev/null @@ -1,19 +0,0 @@ -from Sastrawi.Stemmer.Cache.CacheInterface import CacheInterface - -class ArrayCache(CacheInterface): - """description of class""" - - def __init__(self): - self.data = {} - - def set(self, key, value): - self.data[key] = value - - def get(self, key): - if key in self.data: - return self.data[key] - - def has(self, key): - return key in self.data - - diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py deleted file mode 100644 index e869c84..0000000 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ /dev/null @@ -1,20 +0,0 @@ -# @update_by Mufid Jamaluddin -# @update_date 16/03/2019 - -from abc import ABCMeta, abstractmethod - -class CacheInterface: - """description of abs class""" - __metaclass__ = ABCMeta - - @abstractmethod - def has(self, key): - pass - - @abstractmethod - def set(self, key, value): - pass - - @abstractmethod - def get(self, key): - pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Cache/__init__.py b/src/Sastrawi/Stemmer/Cache/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py deleted file mode 100644 index e052077..0000000 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ /dev/null @@ -1,45 +0,0 @@ -#from Sastrawi.Stemmer.StemmerInterface import StemmerInterface -from Sastrawi.Stemmer.Filter import TextNormalizer - -class CachedStemmer(object): - """description of class""" - def __init__(self, cache, delegatedStemmer): - self.cache = cache - self.delegatedStemmer = delegatedStemmer - - def stem(self, text): - normalizedText = TextNormalizer.normalize_text(text) - - words = normalizedText.split(' ') - stems = [] - - for word in words: - if self.cache.has(word): - stems.append(self.cache.get(word)) - else: - stem = self.delegatedStemmer.stem_word(word) - self.cache.set(word, stem) - stems.append(stem) - 
- return ' '.join(stems) - - def stem_word(self, word): - if self.cache.has(word): - return self.cache.get(word) - else: - stem = self.delegatedStemmer.stem_word(word) - self.cache.set(word, stem) - return stem - - # Stemming word in Tokens - # @author Mufid Jamaluddin - def stem_tokens(self, tokens): - stemmed_tokens = [] - for token in tokens: - if not token or token.strip() == '': - continue - stemmed_tokens.append(self.stem_word(token)) - return stemmed_tokens - - def get_cache(self): - return self.cache diff --git a/src/Sastrawi/Stemmer/Stemmer.py b/src/Sastrawi/Stemmer/Stemmer.py index cb196c7..b465bac 100644 --- a/src/Sastrawi/Stemmer/Stemmer.py +++ b/src/Sastrawi/Stemmer/Stemmer.py @@ -2,6 +2,7 @@ from Sastrawi.Stemmer.Context.Visitor.VisitorProvider import VisitorProvider from Sastrawi.Stemmer.Filter import TextNormalizer from Sastrawi.Stemmer.Context.Context import Context +from cachetools import cached, LRUCache class Stemmer(object): """Indonesian Stemmer. @@ -28,6 +29,7 @@ def stem(self, text): return ' '.join(stems) + @cached(cache=LRUCache(maxsize=128)) def stem_word(self, word): """Stem a word to its common stem form.""" if self.is_plural(word): @@ -35,6 +37,16 @@ def stem_word(self, word): else: return self.stem_singular_word(word) + # Stemming word in Tokens + # @author Mufid Jamaluddin + def stem_tokens(self, tokens): + stemmed_tokens = [] + for token in tokens: + if not token or token.strip() == '': + continue + stemmed_tokens.append(self.stem_word(token)) + return stemmed_tokens + def is_plural(self, word): #-ku|-mu|-nya #nikmat-Ku, etc diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 1b861be..dd16bb5 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -2,12 +2,9 @@ from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer -from Sastrawi.Stemmer.CachedStemmer 
import CachedStemmer -from Sastrawi.Stemmer.Cache.ArrayCache import ArrayCache class StemmerFactory(object): """ Stemmer factory helps creating pre-configured stemmer """ - #APC_KEY = 'sastrawi_cache_dictionary' def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ @@ -19,10 +16,7 @@ def create_stemmer(self, isDev=False): stemmer = Stemmer(dictionary) - resultCache = ArrayCache() - cachedStemmer = CachedStemmer(resultCache, stemmer) - - return cachedStemmer + return stemmer @cached(cache=LRUCache(maxsize=32)) def get_prod_words_dictionary(self): @@ -30,16 +24,6 @@ def get_prod_words_dictionary(self): dictionary = ArrayDictionary(words) return dictionary - #def get_words(self, isDev=False): - #if isDev or callable(getattr(self, 'apc_fetch')): - # words = self.getWordsFromFile() - #else: - # words = apc_fetch(self.APC_KEY) - # if not words: - # words = self.getWordsFromFile() - # apc_store(self.APC_KEY, words) - # return self.get_words_from_file() - def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) dictionaryFile = current_dir + '/data/kata-dasar.txt' From 34708981c0ed78c4ddf07e73d212e8764a14b93c Mon Sep 17 00:00:00 2001 From: mufidjamaluddin Date: Fri, 19 Apr 2019 13:33:14 +0700 Subject: [PATCH 44/45] minor --- src/Sastrawi/Stemmer/Context/Context.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Sastrawi/Stemmer/Context/Context.py b/src/Sastrawi/Stemmer/Context/Context.py index 18223ee..6667714 100644 --- a/src/Sastrawi/Stemmer/Context/Context.py +++ b/src/Sastrawi/Stemmer/Context/Context.py @@ -160,5 +160,4 @@ def restore_prefix(self): for removal in self.removals: if removal.get_affix_type() == 'DP': - self.removals.remove(removal) - + self.removals.remove(removal) \ No newline at end of file From 169edcf5d9b19b109672dbb2528877650a95f410 Mon Sep 17 00:00:00 2001 From: mufidjamaluddin Date: Fri, 19 Apr 2019 13:45:52 +0700 Subject: [PATCH 45/45] remove lrucache stemword --- 
src/Sastrawi/Stemmer/Stemmer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/Stemmer.py b/src/Sastrawi/Stemmer/Stemmer.py index b465bac..1f3135e 100644 --- a/src/Sastrawi/Stemmer/Stemmer.py +++ b/src/Sastrawi/Stemmer/Stemmer.py @@ -29,7 +29,6 @@ def stem(self, text): return ' '.join(stems) - @cached(cache=LRUCache(maxsize=128)) def stem_word(self, word): """Stem a word to its common stem form.""" if self.is_plural(word):