From 426bb88e9863a60f8e10fbfa852a12eb87e71fcb Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 09:47:15 +0700 Subject: [PATCH 01/33] Update .travis.yml --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index c763074..84e3032 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,8 @@ python: - "3.4" - "3.5" sudo: false +install: + - pip install coverage script: nosetests tests --verbose -with-coverage notifications: email: From d97f73d831d3640a654f4a36c0058337cd56fb7f Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 15:12:52 +0700 Subject: [PATCH 02/33] Update .travis.yml --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 84e3032..86a3b63 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,10 @@ python: - "3.5" sudo: false install: - - pip install coverage + - pip install coveralls script: nosetests tests --verbose -with-coverage +after-success: + coveralls notifications: email: - dev.har07@gmail.com From 343493bd6fa2b0d691b6e1be378a257634ec81ca Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 15:14:17 +0700 Subject: [PATCH 03/33] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 86a3b63..98efd19 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ python: sudo: false install: - pip install coveralls -script: nosetests tests --verbose -with-coverage +script: nosetests tests --verbose --with-coverage after-success: coveralls notifications: From 4e374d24600eacb60f7116fe7f9bea6aa74cd810 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 15:18:51 +0700 Subject: [PATCH 04/33] Create README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..c4f4d07 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# sastrawi +Indonesian stemmer. Python port of PHP Sastrawi project. + +[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) From 983b0923747cc20f64a46bde85d395060b0ff564 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 15:38:02 +0700 Subject: [PATCH 05/33] Update .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 98efd19..7e93c70 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ python: - "3.5" sudo: false install: + - pip install python-coveralls - pip install coveralls script: nosetests tests --verbose --with-coverage after-success: From 2fc0a73d862793dccc1ddecd71356018931d75f2 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 16:08:30 +0700 Subject: [PATCH 06/33] Create .coveragerc --- .coveragerc | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..ca62439 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,5 @@ +[report] +omit = + */python?.?/* + */site-packages/nose/* + *__init__* From b9052672d46e233399fc2a10a3a014eba687284c Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 16:10:24 +0700 Subject: [PATCH 07/33] Update .travis.yml --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7e93c70..50f4698 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python python: - "2.7" - - "3.2" - "3.3" - "3.4" - "3.5" From c7a05e336bd60c1485915eb9058bcf6c71544fab Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 16:19:18 +0700 Subject: [PATCH 08/33] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 50f4698..a6de8b4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ install: - pip install coveralls script: nosetests tests --verbose --with-coverage after-success: - coveralls + - coveralls notifications: email: - dev.har07@gmail.com From 450b5131154b62ad48f5df38b8cdb4088d900028 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 16:24:24 +0700 Subject: [PATCH 09/33] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a6de8b4..ce36b33 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install python-coveralls - pip install coveralls script: nosetests tests --verbose --with-coverage -after-success: +after_success: - coveralls notifications: email: From 0eb9f2154336852a23549accc96092b1a1362a65 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 18:03:05 +0700 Subject: [PATCH 10/33] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index c4f4d07..06f8f09 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ # sastrawi Indonesian stemmer. Python port of PHP Sastrawi project. -[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=master&service=github)](https://coveralls.io/github/har07/sastrawi?branch=master) From e37f4c18aedeb47302e55b8e054797ba474a3ce2 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 18:05:53 +0700 Subject: [PATCH 11/33] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 06f8f09..deeaa80 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ # sastrawi Indonesian stemmer. Python port of PHP Sastrawi project. +[![Coverage Status](https://coveralls.io/repos/har07/sastrawi/badge.svg?branch=development&service=github)](https://coveralls.io/github/har07/sastrawi?branch=development) From 900eb35e569e115cda5495503197626d26407d42 Mon Sep 17 00:00:00 2001 From: Hanif Amal Robbani Date: Sat, 16 Jan 2016 19:21:59 +0700 Subject: [PATCH 12/33] turn off travis-ci email notif --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index ce36b33..65ec28e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,5 +12,4 @@ script: nosetests tests --verbose --with-coverage after_success: - coveralls notifications: - email: - - dev.har07@gmail.com + email: false From 3625027e00dfc9b1cfa314004543064d22f9e626 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 01:30:25 +0700 Subject: [PATCH 13/33] Add Stopwords Tala 2003, Add lru_cache --- src/Sastrawi/Dictionary/ArrayDictionary.py | 8 +- src/Sastrawi/Stemmer/StemmerFactory.py | 30 +- .../StopWordRemover/StopWordRemoverFactory.py | 125 +-- .../data/stopword_tala_2003.txt | 758 ++++++++++++++++++ 4 files changed, 811 insertions(+), 110 deletions(-) create mode 100644 src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 5bfdd90..5c864f0 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -2,8 +2,12 @@ class ArrayDictionary(object): """description of class""" def __init__(self, words=None): - self.words = {} - if words: + + if words is is dict: + self.words = words + + if words is list: + self.words = {} self.add_words(words) def contains(self, word): diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 10e6aaa..1e7a583 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -1,4 +1,5 @@ import os +import functools from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer from Sastrawi.Stemmer.CachedStemmer import CachedStemmer @@ -6,13 +7,16 @@ class StemmerFactory(object): """ Stemmer factory helps creating pre-configured stemmer """ - APC_KEY = 'sastrawi_cache_dictionary' + #APC_KEY = 'sastrawi_cache_dictionary' def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ + if isDev: + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + else: + dictionary = self.get_prod_words_dictionary() - words = self.get_words(isDev) - dictionary = ArrayDictionary(words) stemmer = Stemmer(dictionary) resultCache = ArrayCache() @@ -20,7 +24,13 @@ def create_stemmer(self, isDev=False): return cachedStemmer - def get_words(self, isDev=False): + @functools.lru_cache(maxsize=32) + def get_prod_words_dictionary(self): + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + return dictionary + + #def get_words(self, isDev=False): #if isDev or callable(getattr(self, 'apc_fetch')): # words = self.getWordsFromFile() #else: @@ -28,16 +38,18 @@ def get_words(self, isDev=False): # if not words: # words = self.getWordsFromFile() # apc_store(self.APC_KEY, words) - return self.get_words_from_file() + # return self.get_words_from_file() def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) dictionaryFile = current_dir + '/data/kata-dasar.txt' + if not os.path.isfile(dictionaryFile): raise RuntimeError('Dictionary file is missing. It seems that your installation is corrupted.') - dictionaryContent = '' + content = {} with open(dictionaryFile, 'r') as f: - dictionaryContent = f.read() - - return dictionaryContent.split('\n') \ No newline at end of file + word = f.read() + content.words[word] = word + + return content \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 560db07..2c206e6 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -1,109 +1,36 @@ +import os +import functools from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover class StopWordRemoverFactory(object): """description of class""" - def create_stop_word_remover(self): - stopWords = self.get_stop_words() - dictionary = ArrayDictionary(stopWords) - stopWordRemover = StopWordRemover(dictionary) + def create_stop_word_remover(self, isDev=False): + if isDev: + stopWords = self.get_stop_words() + dictionary = ArrayDictionary(stopWords) + else: + dictionary = self.get_prod_stop_word_dictionary() + stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - def get_stop_words(self): - return ['a','ada','adalah','adanya','adapun','agak','agaknya','agar','akan','akankah','akhir', - 'akhiri','akhirnya','aku','akulah','amat','amatlah','anda','andalah','antar','antara', - 'antaranya','apa','apaan','apabila','apakah','apalagi','apatah','arti','artinya','asal', - 'asalkan','atas','atau','ataukah','ataupun','awal','awalnya','b','bagai','bagaikan', - 'bagaimana','bagaimanakah','bagaimanapun','bagainamakah','bagi','bagian','bahkan','bahwa', - 'bahwasannya','bahwasanya','baik','baiklah','bakal','bakalan','balik','banyak','bapak', - 'baru','bawah','beberapa','begini','beginian','beginikah','beginilah','begitu','begitukah', - 'begitulah','begitupun','bekerja','belakang','belakangan','belum','belumlah','benar', - 'benarkah','benarlah','berada','berakhir','berakhirlah','berakhirnya','berapa','berapakah', - 'berapalah','berapapun','berarti','berawal','berbagai','berdatangan','beri','berikan', - 'berikut','berikutnya','berjumlah','berkali-kali','berkata','berkehendak','berkeinginan', - 'berkenaan','berlainan','berlalu','berlangsung','berlebihan','bermacam','bermacam-macam', - 'bermaksud','bermula','bersama','bersama-sama','bersiap','bersiap-siap','bertanya', - 'bertanya-tanya','berturut','berturut-turut','bertutur','berujar','berupa','besar', - 'betul','betulkah','biasa','biasanya','bila','bilakah','bisa','bisakah','boleh','bolehkah', - 'bolehlah','buat','bukan','bukankah','bukanlah','bukannya','bulan','bung','c','cara', - 'caranya','cukup','cukupkah','cukuplah','cuma','d','dahulu','dalam','dan','dapat','dari', - 'daripada','datang','dekat','demi','demikian','demikianlah','dengan','depan','di','dia', - 'diakhiri','diakhirinya','dialah','diantara','diantaranya','diberi','diberikan','diberikannya', - 'dibuat','dibuatnya','didapat','didatangkan','digunakan','diibaratkan','diibaratkannya', - 'diingat','diingatkan','diinginkan','dijawab','dijelaskan','dijelaskannya','dikarenakan', - 'dikatakan','dikatakannya','dikerjakan','diketahui','diketahuinya','dikira','dilakukan', - 'dilalui','dilihat','dimaksud','dimaksudkan','dimaksudkannya','dimaksudnya','diminta', - 'dimintai','dimisalkan','dimulai','dimulailah','dimulainya','dimungkinkan','dini','dipastikan', - 'diperbuat','diperbuatnya','dipergunakan','diperkirakan','diperlihatkan','diperlukan', - 'diperlukannya','dipersoalkan','dipertanyakan','dipunyai','diri','dirinya','disampaikan', - 'disebut','disebutkan','disebutkannya','disini','disinilah','ditambahkan','ditandaskan', - 'ditanya','ditanyai','ditanyakan','ditegaskan','ditujukan','ditunjuk','ditunjuki','ditunjukkan', - 'ditunjukkannya','ditunjuknya','dituturkan','dituturkannya','diucapkan','diucapkannya', - 'diungkapkan','dong','dua','dulu','e','empat','enak','enggak','enggaknya','entah','entahlah', - 'f','g','guna','gunakan','h','hadap','hai','hal','halo','hallo','hampir','hanya','hanyalah', - 'hari','harus','haruslah','harusnya','helo','hello','hendak','hendaklah','hendaknya','hingga', - 'i','ia','ialah','ibarat','ibaratkan','ibaratnya','ibu','ikut','ingat','ingat-ingat','ingin', - 'inginkah','inginkan','ini','inikah','inilah','itu','itukah','itulah','j','jadi','jadilah', - 'jadinya','jangan','jangankan','janganlah','jauh','jawab','jawaban','jawabnya','jelas', - 'jelaskan','jelaslah','jelasnya','jika','jikalau','juga','jumlah','jumlahnya','justru', - 'k','kadar','kala','kalau','kalaulah','kalaupun','kali','kalian','kami','kamilah','kamu', - 'kamulah','kan','kapan','kapankah','kapanpun','karena','karenanya','kasus','kata','katakan', - 'katakanlah','katanya','ke','keadaan','kebetulan','kecil','kedua','keduanya','keinginan', - 'kelamaan','kelihatan','kelihatannya','kelima','keluar','kembali','kemudian','kemungkinan', - 'kemungkinannya','kena','kenapa','kepada','kepadanya','kerja','kesampaian','keseluruhan', - 'keseluruhannya','keterlaluan','ketika','khusus','khususnya','kini','kinilah','kira', - 'kira-kira','kiranya','kita','kitalah','kok','kurang','l','lagi','lagian','lah','lain', - 'lainnya','laku','lalu','lama','lamanya','langsung','lanjut','lanjutnya','lebih','lewat', - 'lihat','lima','luar','m','macam','maka','makanya','makin','maksud','malah','malahan', - 'mampu','mampukah','mana','manakala','manalagi','masa','masalah','masalahnya','masih', - 'masihkah','masing','masing-masing','masuk','mata','mau','maupun','melainkan','melakukan', - 'melalui','melihat','melihatnya','memang','memastikan','memberi','memberikan','membuat', - 'memerlukan','memihak','meminta','memintakan','memisalkan','memperbuat','mempergunakan', - 'memperkirakan','memperlihatkan','mempersiapkan','mempersoalkan','mempertanyakan','mempunyai', - 'memulai','memungkinkan','menaiki','menambahkan','menandaskan','menanti','menanti-nanti', - 'menantikan','menanya','menanyai','menanyakan','mendapat','mendapatkan','mendatang','mendatangi', - 'mendatangkan','menegaskan','mengakhiri','mengapa','mengatakan','mengatakannya','mengenai', - 'mengerjakan','mengetahui','menggunakan','menghendaki','mengibaratkan','mengibaratkannya', - 'mengingat','mengingatkan','menginginkan','mengira','mengucapkan','mengucapkannya','mengungkapkan', - 'menjadi','menjawab','menjelaskan','menuju','menunjuk','menunjuki','menunjukkan','menunjuknya', - 'menurut','menuturkan','menyampaikan','menyangkut','menyatakan','menyebutkan','menyeluruh', - 'menyiapkan','merasa','mereka','merekalah','merupakan','meski','meskipun','meyakini','meyakinkan', - 'minta','mirip','misal','misalkan','misalnya','mohon','mula','mulai','mulailah','mulanya','mungkin', - 'mungkinkah','n','nah','naik','namun','nanti','nantinya','nya','nyaris','nyata','nyatanya', - 'o','oleh','olehnya','orang','p','pada','padahal','padanya','pak','paling','panjang','pantas', - 'para','pasti','pastilah','penting','pentingnya','per','percuma','perlu','perlukah','perlunya', - 'pernah','persoalan','pertama','pertama-tama','pertanyaan','pertanyakan','pihak','pihaknya', - 'pukul','pula','pun','punya','q','r','rasa','rasanya','rupa','rupanya','s','saat','saatnya','saja', - 'sajalah','salam','saling','sama','sama-sama','sambil','sampai','sampai-sampai','sampaikan','sana', - 'sangat','sangatlah','sangkut','satu','saya','sayalah','se','sebab','sebabnya','sebagai', - 'sebagaimana','sebagainya','sebagian','sebaik','sebaik-baiknya','sebaiknya','sebaliknya', - 'sebanyak','sebegini','sebegitu','sebelum','sebelumnya','sebenarnya','seberapa','sebesar', - 'sebetulnya','sebisanya','sebuah','sebut','sebutlah','sebutnya','secara','secukupnya','sedang', - 'sedangkan','sedemikian','sedikit','sedikitnya','seenaknya','segala','segalanya','segera', - 'seharusnya','sehingga','seingat','sejak','sejauh','sejenak','sejumlah','sekadar','sekadarnya', - 'sekali','sekali-kali','sekalian','sekaligus','sekalipun','sekarang','sekaranglah','sekecil', - 'seketika','sekiranya','sekitar','sekitarnya','sekurang-kurangnya','sekurangnya','sela','selain', - 'selaku','selalu','selama','selama-lamanya','selamanya','selanjutnya','seluruh','seluruhnya', - 'semacam','semakin','semampu','semampunya','semasa','semasih','semata','semata-mata','semaunya', - 'sementara','semisal','semisalnya','sempat','semua','semuanya','semula','sendiri','sendirian', - 'sendirinya','seolah','seolah-olah','seorang','sepanjang','sepantasnya','sepantasnyalah', - 'seperlunya','seperti','sepertinya','sepihak','sering','seringnya','serta','serupa','sesaat', - 'sesama','sesampai','sesegera','sesekali','seseorang','sesuatu','sesuatunya','sesudah', - 'sesudahnya','setelah','setempat','setengah','seterusnya','setiap','setiba','setibanya', - 'setidak-tidaknya','setidaknya','setinggi','seusai','sewaktu','siap','siapa','siapakah', - 'siapapun','sini','sinilah','soal','soalnya','suatu','sudah','sudahkah','sudahlah','supaya', - 't','tadi','tadinya','tahu','tak','tambah','tambahnya','tampak','tampaknya','tandas','tandasnya', - 'tanpa','tanya','tanyakan','tanyanya','tapi','tegas','tegasnya','telah','tempat','tentang','tentu', - 'tentulah','tentunya','tepat','terakhir','terasa','terbanyak','terdahulu','terdapat','terdiri', - 'terhadap','terhadapnya','teringat','teringat-ingat','terjadi','terjadilah','terjadinya','terkira', - 'terlalu','terlebih','terlihat','termasuk','ternyata','tersampaikan','tersebut','tersebutlah', - 'tertentu','tertuju','terus','terutama','tetap','tetapi','tiap','tiba','tiba-tiba','tidak', - 'tidakkah','tidaklah','tiga','toh','tuju','tunjuk','turut','tutur','tuturnya','u','ucap','ucapnya', - 'ujar','ujarnya','umumnya','ungkap','ungkapnya','untuk','usah','usai','v','w','waduh','wah','wahai', - 'waktunya','walau','walaupun','wong','x','y','ya','yaitu','yakin','yakni','yang','z'] - - - + @functools.lru_cache(maxsize=32) + def get_prod_stop_word_dictionary(self): + stopWords = self.get_stop_words() + return ArrayDictionary(stopWords) + def get_stop_words(self): + current_dir = os.path.dirname(os.path.realpath(__file__)) + dictionaryFile = current_dir + '/data/stopword_tala_2003.txt' + + if not os.path.isfile(dictionaryFile): + raise RuntimeError('Stopword file is missing. It seems that your installation is corrupted.') + + content = {} + with open(dictionaryFile, 'r') as f: + word = f.read() + content.words[word] = word + + return content \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt b/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt new file mode 100644 index 0000000..bf88a45 --- /dev/null +++ b/src/Sastrawi/StopWordRemover/data/stopword_tala_2003.txt @@ -0,0 +1,758 @@ +ada +adalah +adanya +adapun +agak +agaknya +agar +akan +akankah +akhir +akhiri +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +antara +antaranya +apa +apaan +apabila +apakah +apalagi +apatah +artinya +asal +asalkan +atas +atau +ataukah +ataupun +awal +awalnya +bagai +bagaikan +bagaimana +bagaimanakah +bagaimanapun +bagi +bagian +bahkan +bahwa +bahwasanya +baik +bakal +bakalan +balik +banyak +bapak +baru +bawah +beberapa +begini +beginian +beginikah +beginilah +begitu +begitukah +begitulah +begitupun +bekerja +belakang +belakangan +belum +belumlah +benar +benarkah +benarlah +berada +berakhir +berakhirlah +berakhirnya +berapa +berapakah +berapalah +berapapun +berarti +berawal +berbagai +berdatangan +beri +berikan +berikut +berikutnya +berjumlah +berkali-kali +berkata +berkehendak +berkeinginan +berkenaan +berlainan +berlalu +berlangsung +berlebihan +bermacam +bermacam-macam +bermaksud +bermula +bersama +bersama-sama +bersiap +bersiap-siap +bertanya +bertanya-tanya +berturut +berturut-turut +bertutur +berujar +berupa +besar +betul +betulkah +biasa +biasanya +bila +bilakah +bisa +bisakah +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +bulan +bung +cara +caranya +cukup +cukupkah +cukuplah +cuma +dahulu +dalam +dan +dapat +dari +daripada +datang +dekat +demi +demikian +demikianlah +dengan +depan +di +dia +diakhiri +diakhirinya +dialah +diantara +diantaranya +diberi +diberikan +diberikannya +dibuat +dibuatnya +didapat +didatangkan +digunakan +diibaratkan +diibaratkannya +diingat +diingatkan +diinginkan +dijawab +dijelaskan +dijelaskannya +dikarenakan +dikatakan +dikatakannya +dikerjakan +diketahui +diketahuinya +dikira +dilakukan +dilalui +dilihat +dimaksud +dimaksudkan +dimaksudkannya +dimaksudnya +diminta +dimintai +dimisalkan +dimulai +dimulailah +dimulainya +dimungkinkan +dini +dipastikan +diperbuat +diperbuatnya +dipergunakan +diperkirakan +diperlihatkan +diperlukan +diperlukannya +dipersoalkan +dipertanyakan +dipunyai +diri +dirinya +disampaikan +disebut +disebutkan +disebutkannya +disini +disinilah +ditambahkan +ditandaskan +ditanya +ditanyai +ditanyakan +ditegaskan +ditujukan +ditunjuk +ditunjuki +ditunjukkan +ditunjukkannya +ditunjuknya +dituturkan +dituturkannya +diucapkan +diucapkannya +diungkapkan +dong +dua +dulu +empat +enggak +enggaknya +entah +entahlah +guna +gunakan +hal +hampir +hanya +hanyalah +hari +harus +haruslah +harusnya +hendak +hendaklah +hendaknya +hingga +ia +ialah +ibarat +ibaratkan +ibaratnya +ibu +ikut +ingat +ingat-ingat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jadi +jadilah +jadinya +jangan +jangankan +janganlah +jauh +jawab +jawaban +jawabnya +jelas +jelaskan +jelaslah +jelasnya +jika +jikalau +juga +jumlah +jumlahnya +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +karena +karenanya +kasus +kata +katakan +katakanlah +katanya +ke +keadaan +kebetulan +kecil +kedua +keduanya +keinginan +kelamaan +kelihatan +kelihatannya +kelima +keluar +kembali +kemudian +kemungkinan +kemungkinannya +kenapa +kepada +kepadanya +kesampaian +keseluruhan +keseluruhannya +keterlaluan +ketika +khususnya +kini +kinilah +kira +kira-kira +kiranya +kita +kitalah +kok +kurang +lagi +lagian +lah +lain +lainnya +lalu +lama +lamanya +lanjut +lanjutnya +lebih +lewat +lima +luar +macam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masa +masalah +masalahnya +masih +masihkah +masing +masing-masing +mau +maupun +melainkan +melakukan +melalui +melihat +melihatnya +memang +memastikan +memberi +memberikan +membuat +memerlukan +memihak +meminta +memintakan +memisalkan +memperbuat +mempergunakan +memperkirakan +memperlihatkan +mempersiapkan +mempersoalkan +mempertanyakan +mempunyai +memulai +memungkinkan +menaiki +menambahkan +menandaskan +menanti +menanti-nanti +menantikan +menanya +menanyai +menanyakan +mendapat +mendapatkan +mendatang +mendatangi +mendatangkan +menegaskan +mengakhiri +mengapa +mengatakan +mengatakannya +mengenai +mengerjakan +mengetahui +menggunakan +menghendaki +mengibaratkan +mengibaratkannya +mengingat +mengingatkan +menginginkan +mengira +mengucapkan +mengucapkannya +mengungkapkan +menjadi +menjawab +menjelaskan +menuju +menunjuk +menunjuki +menunjukkan +menunjuknya +menurut +menuturkan +menyampaikan +menyangkut +menyatakan +menyebutkan +menyeluruh +menyiapkan +merasa +mereka +merekalah +merupakan +meski +meskipun +meyakini +meyakinkan +minta +mirip +misal +misalkan +misalnya +mula +mulai +mulailah +mulanya +mungkin +mungkinkah +nah +naik +namun +nanti +nantinya +nyaris +nyatanya +oleh +olehnya +pada +padahal +padanya +pak +paling +panjang +pantas +para +pasti +pastilah +penting +pentingnya +per +percuma +perlu +perlukah +perlunya +pernah +persoalan +pertama +pertama-tama +pertanyaan +pertanyakan +pihak +pihaknya +pukul +pula +pun +punya +rasa +rasanya +rata +rupanya +saat +saatnya +saja +sajalah +saling +sama +sama-sama +sambil +sampai +sampai-sampai +sampaikan +sana +sangat +sangatlah +satu +saya +sayalah +se +sebab +sebabnya +sebagai +sebagaimana +sebagainya +sebagian +sebaik +sebaik-baiknya +sebaiknya +sebaliknya +sebanyak +sebegini +sebegitu +sebelum +sebelumnya +sebenarnya +seberapa +sebesar +sebetulnya +sebisanya +sebuah +sebut +sebutlah +sebutnya +secara +secukupnya +sedang +sedangkan +sedemikian +sedikit +sedikitnya +seenaknya +segala +segalanya +segera +seharusnya +sehingga +seingat +sejak +sejauh +sejenak +sejumlah +sekadar +sekadarnya +sekali +sekali-kali +sekalian +sekaligus +sekalipun +sekarang +sekarang +sekecil +seketika +sekiranya +sekitar +sekitarnya +sekurang-kurangnya +sekurangnya +sela +selain +selaku +selalu +selama +selama-lamanya +selamanya +selanjutnya +seluruh +seluruhnya +semacam +semakin +semampu +semampunya +semasa +semasih +semata +semata-mata +semaunya +sementara +semisal +semisalnya +sempat +semua +semuanya +semula +sendiri +sendirian +sendirinya +seolah +seolah-olah +seorang +sepanjang +sepantasnya +sepantasnyalah +seperlunya +seperti +sepertinya +sepihak +sering +seringnya +serta +serupa +sesaat +sesama +sesampai +sesegera +sesekali +seseorang +sesuatu +sesuatunya +sesudah +sesudahnya +setelah +setempat +setengah +seterusnya +setiap +setiba +setibanya +setidak-tidaknya +setidaknya +setinggi +seusai +sewaktu +siap +siapa +siapakah +siapapun +sini +sinilah +soal +soalnya +suatu +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tahu +tahun +tak +tambah +tambahnya +tampak +tampaknya +tandas +tandasnya +tanpa +tanya +tanyakan +tanyanya +tapi +tegas +tegasnya +telah +tempat +tengah +tentang +tentu +tentulah +tentunya +tepat +terakhir +terasa +terbanyak +terdahulu +terdapat +terdiri +terhadap +terhadapnya +teringat +teringat-ingat +terjadi +terjadilah +terjadinya +terkira +terlalu +terlebih +terlihat +termasuk +ternyata +tersampaikan +tersebut +tersebutlah +tertentu +tertuju +terus +terutama +tetap +tetapi +tiap +tiba +tiba-tiba +tidak +tidakkah +tidaklah +tiga +tinggi +toh +tunjuk +turut +tutur +tuturnya +ucap +ucapnya +ujar +ujarnya +umum +umumnya +ungkap +ungkapnya +untuk +usah +usai +waduh +wah +wahai +waktu +waktunya +walau +walaupun +wong +yaitu +yakin +yakni +yang \ No newline at end of file From 9890fcf531088d070600a82f25a42690e29cf516 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 01:48:49 +0700 Subject: [PATCH 14/33] Test Stopword Tala --- src/Sastrawi/Stemmer/CachedStemmer.py | 2 +- .../StopWordRemover/stop_word_remover_factory_test.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 97258bc..0356777 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -17,7 +17,7 @@ def stem(self, text): if self.cache.has(word): stems.append(self.cache.get(word)) else: - stem = self.delegatedStemmer.stem(word) + stem = self.delegatedStemmer.stem_word(word) self.cache.set(word, stem) stems.append(stem) diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 5f758f8..0b4163e 100644 --- a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -9,6 +9,9 @@ def setUp(self): def test_createStopWordRemover(self): self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover) + sremover = self.factory.create_stop_word_remover() + self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) + self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) if __name__ == '__main__': unittest.main() From 7a55cbf575552014ebbfd8a139f90fab91e94eb9 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 08:50:10 +0700 Subject: [PATCH 15/33] Boost Performance --- src/Sastrawi/Stemmer/StemmerFactory.py | 2 +- .../UnitTests/Stemmer/stemmer_factory_test.py | 19 +++++++++++++++++++ .../stop_word_remover_factory_test.py | 14 ++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 1e7a583..ace9a63 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -24,7 +24,7 @@ def create_stemmer(self, isDev=False): return cachedStemmer - @functools.lru_cache(maxsize=32) + @functools.lru_cache(maxsize=640) def get_prod_words_dictionary(self): words = self.get_words_from_file() dictionary = ArrayDictionary(words) diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 146df28..f4f12cf 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -1,4 +1,5 @@ import unittest +import time from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.Stemmer.Stemmer import Stemmer @@ -23,6 +24,24 @@ def test_fungsional(self): if output != expected: raise AssertionError(str.format('output is {} instead of {}', output, expected)) + def test_execution_time(self): + start = time.time() + sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' + + factory = StemmerFactory() + stemmer = factory.create_stemmer() + + stemmer.stem(sentence) + + end = time.time() + + execution_time = end - start + + # print(execution_time) + + # test execution time < 3 seconds + self.assertTrue(execution_time < 3) + def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 0b4163e..0ef6695 100644 --- a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -1,4 +1,5 @@ import unittest +import time from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover @@ -9,9 +10,22 @@ def setUp(self): def test_createStopWordRemover(self): self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover) + + def test_stopwordRemoval(self): sremover = self.factory.create_stop_word_remover() self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) + def test_execution_time(self): + start = time.time() + sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' + sremover = self.factory.create_stop_word_remover() + sremover.remove(sentence) + end = time.time() + # print(execution_time) + execution_time = end - start + + self.assertTrue(execution_time < 1) + if __name__ == '__main__': unittest.main() From 5630ad6ce40b831fdcbdfcf64d27037f595a0765 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 09:06:31 +0700 Subject: [PATCH 16/33] add stem word --- src/Sastrawi/Stemmer/CachedStemmer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 0356777..92edfe0 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -22,6 +22,14 @@ def stem(self, text): stems.append(stem) return ' '.join(stems) - + + def stem_word(self, word): + if self.cache.has(word): + return self.cache.get(word) + else: + stem = self.delegatedStemmer.stem_word(word) + self.cache.set(word, stem) + return stem + def get_cache(self): return self.cache From 1d9554f289519e9cc7d6283bbcd1eb77a949268e Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 09:17:43 +0700 Subject: [PATCH 17/33] add stem & stopword removal from tokens/word list --- src/Sastrawi/Stemmer/CachedStemmer.py | 10 +++++++++- src/Sastrawi/StopWordRemover/StopWordRemover.py | 8 +++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 92edfe0..ad4e1ef 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -7,7 +7,7 @@ def __init__(self, cache, delegatedStemmer): self.cache = cache self.delegatedStemmer = delegatedStemmer - def stem(self, text): + def stem(self, text: str): normalizedText = TextNormalizer.normalize_text(text) words = normalizedText.split(' ') @@ -31,5 +31,13 @@ def stem_word(self, word): self.cache.set(word, stem) return stem + # Stemming word in Tokens + # @author Mufid Jamaluddin + def stem_tokens(self, tokens: list): + stemmed_tokens = [] + for token in tokens: + stemmed_tokens.append(self.stem_word(token)) + return stemmed_tokens + def get_cache(self): return self.cache diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index d3de2ff..cb462cf 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -14,6 +14,8 @@ def remove(self, text): return ' '.join(stopped_words) - - - + # Remove Stopword in Tokens + # @author Mufid Jamaluddin + def remove_tokens(self, tokens:list): + clean_tokens = [token for token in tokens if not self.dictionary.contains(token)] + return clean_tokens \ No newline at end of file From 81b06a4730b1fb34205ed5f8c6432c67cf92bb87 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 09:23:46 +0700 Subject: [PATCH 18/33] add python 3.7 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 65ec28e..4d259df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.7" sudo: false install: - pip install python-coveralls From 150a839ba3ac98550e0c850119edc02b3fa6d7bc Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 22:55:58 +0700 Subject: [PATCH 19/33] Minor --- .vscode/settings.json | 3 +++ src/Sastrawi/Dictionary/ArrayDictionary.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..500bc70 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.pylintEnabled": true +} \ No newline at end of file diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 5c864f0..8f94982 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -3,7 +3,7 @@ class ArrayDictionary(object): def __init__(self, words=None): - if words is is dict: + if words is dict: self.words = words if words is list: From 99bfac54e96ec6b5e917a1094f7bc71da79dc18f Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 23:40:04 +0700 Subject: [PATCH 20/33] Fix Error --- src/Sastrawi/Dictionary/ArrayDictionary.py | 10 ++++---- src/Sastrawi/Stemmer/CachedStemmer.py | 2 +- src/Sastrawi/Stemmer/StemmerFactory.py | 24 +++++++++---------- .../StopWordRemover/StopWordRemover.py | 2 +- .../StopWordRemover/StopWordRemoverFactory.py | 16 ++++++------- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 8f94982..3438455 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -2,12 +2,10 @@ class ArrayDictionary(object): """description of class""" def __init__(self, words=None): - - if words is dict: - self.words = words - - if words is list: - self.words = {} + if words is dict: + self.words = words + elif words is list: + self.words = {} self.add_words(words) def contains(self, word): diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index ad4e1ef..854a8c3 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -33,7 +33,7 @@ def stem_word(self, word): # Stemming word in Tokens # @author Mufid Jamaluddin - def stem_tokens(self, tokens: list): + def stem_tokens(self, tokens): stemmed_tokens = [] for token in tokens: stemmed_tokens.append(self.stem_word(token)) diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index ace9a63..3578cac 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -12,10 +12,10 @@ class StemmerFactory(object): def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ if isDev: - words = self.get_words_from_file() - dictionary = ArrayDictionary(words) + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) else: - dictionary = self.get_prod_words_dictionary() + dictionary = self.get_prod_words_dictionary() stemmer = Stemmer(dictionary) @@ -24,11 +24,11 @@ def create_stemmer(self, isDev=False): return cachedStemmer - @functools.lru_cache(maxsize=640) - def get_prod_words_dictionary(self): - words = self.get_words_from_file() - dictionary = ArrayDictionary(words) - return dictionary + @functools.lru_cache(maxsize=640) + def get_prod_words_dictionary(self): + words = self.get_words_from_file() + dictionary = ArrayDictionary(words) + return dictionary #def get_words(self, isDev=False): #if isDev or callable(getattr(self, 'apc_fetch')): @@ -47,9 +47,7 @@ def get_words_from_file(self): if not os.path.isfile(dictionaryFile): raise RuntimeError('Dictionary file is missing. It seems that your installation is corrupted.') - content = {} + text = '' with open(dictionaryFile, 'r') as f: - word = f.read() - content.words[word] = word - - return content \ No newline at end of file + text = f.read() + return text.split('\n') \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index cb462cf..a5bbd3e 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -16,6 +16,6 @@ def remove(self, text): # Remove Stopword in Tokens # @author Mufid Jamaluddin - def remove_tokens(self, tokens:list): + def remove_tokens(self, tokens): clean_tokens = [token for token in tokens if not self.dictionary.contains(token)] return clean_tokens \ No newline at end of file diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 2c206e6..010175d 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -16,10 +16,10 @@ def create_stop_word_remover(self, isDev=False): stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - @functools.lru_cache(maxsize=32) - def get_prod_stop_word_dictionary(self): - stopWords = self.get_stop_words() - return ArrayDictionary(stopWords) + @functools.lru_cache(maxsize=32) + def get_prod_stop_word_dictionary(self): + stopWords = self.get_stop_words() + return ArrayDictionary(stopWords) def get_stop_words(self): current_dir = os.path.dirname(os.path.realpath(__file__)) @@ -28,9 +28,7 @@ def get_stop_words(self): if not os.path.isfile(dictionaryFile): raise RuntimeError('Stopword file is missing. It seems that your installation is corrupted.') - content = {} + text = '' with open(dictionaryFile, 'r') as f: - word = f.read() - content.words[word] = word - - return content \ No newline at end of file + text = f.read() + return text.split('\n') \ No newline at end of file From edf2c818a4bec08aaba2543784174d8e55c134ee Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Fri, 15 Mar 2019 23:51:52 +0700 Subject: [PATCH 21/33] fix error python 2.7 --- src/Sastrawi/Dictionary/ArrayDictionary.py | 9 ++------- src/Sastrawi/Stemmer/CachedStemmer.py | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 3438455..1583678 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,11 +1,11 @@ class ArrayDictionary(object): """description of class""" + words = {} def __init__(self, words=None): if words is dict: self.words = words elif words is list: - self.words = {} self.add_words(words) def contains(self, word): @@ -22,9 +22,4 @@ def add(self, word): """Add a word to the dictionary""" if not word or word.strip() == '': return - self.words[word]=word - - - - - + self.words[word]=word \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 854a8c3..8979114 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -7,7 +7,7 @@ def __init__(self, cache, delegatedStemmer): self.cache = cache self.delegatedStemmer = delegatedStemmer - def stem(self, text: str): + def stem(self, text): normalizedText = TextNormalizer.normalize_text(text) words = normalizedText.split(' ') From a47d9b202d3ef93bd73dbf0828479d102c7c0e0b Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 00:27:36 +0700 Subject: [PATCH 22/33] LruCache python 2.7 --- .travis.yml | 1 + src/Sastrawi/Stemmer/StemmerFactory.py | 4 ++-- src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4d259df..e00cd16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ sudo: false install: - pip install python-coveralls - pip install coveralls + - pip install cachetools script: nosetests tests --verbose --with-coverage after_success: - coveralls diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 3578cac..1b861be 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -1,5 +1,5 @@ import os -import functools +from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer from Sastrawi.Stemmer.CachedStemmer import CachedStemmer @@ -24,7 +24,7 @@ def create_stemmer(self, isDev=False): return cachedStemmer - @functools.lru_cache(maxsize=640) + @cached(cache=LRUCache(maxsize=32)) def get_prod_words_dictionary(self): words = self.get_words_from_file() dictionary = ArrayDictionary(words) diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 010175d..668ed94 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -1,5 +1,5 @@ import os -import functools +from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover @@ -16,7 +16,7 @@ def create_stop_word_remover(self, isDev=False): stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - @functools.lru_cache(maxsize=32) + @cached(cache=LRUCache(maxsize=8)) def get_prod_stop_word_dictionary(self): stopWords = self.get_stop_words() return ArrayDictionary(stopWords) From 58d35a7f6570cdb4656186aee114fab53ef3ad30 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 01:27:26 +0700 Subject: [PATCH 23/33] minor --- tests/FunctionalTests/Stemmer/stemmer_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/FunctionalTests/Stemmer/stemmer_test.py b/tests/FunctionalTests/Stemmer/stemmer_test.py index c1f56ed..8609fb7 100644 --- a/tests/FunctionalTests/Stemmer/stemmer_test.py +++ b/tests/FunctionalTests/Stemmer/stemmer_test.py @@ -328,8 +328,8 @@ def get_test_data(self): data.append(['menahan', 'tahan']) # test stem multiple sentences - multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.'; - multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai."; + multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.' + multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai." data.append([multipleSentence1, 'cinta telah tebar dua saling cinta']) data.append([multipleSentence2, 'cinta telah tebar dua saling cinta']) From 345edd1fd1f10c57c755349ac646b165868b3bcb Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 02:41:10 +0700 Subject: [PATCH 24/33] Fix critical bugs --- src/Sastrawi/Dictionary/ArrayDictionary.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 1583678..a476e52 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,12 +1,15 @@ class ArrayDictionary(object): """description of class""" - words = {} def __init__(self, words=None): - if words is dict: + if words is None: + self.words = {} + elif type(words) is dict: self.words = words - elif words is list: + elif type(words) is list: self.add_words(words) + else: + self.words = {} def contains(self, word): return word in self.words @@ -22,4 +25,4 @@ def add(self, word): """Add a word to the dictionary""" if not word or word.strip() == '': return - self.words[word]=word \ No newline at end of file + self.words[word] = word \ No newline at end of file From 1a5f7d66fddd3090b73cc30eb43e13ec527cc860 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 03:02:56 +0700 Subject: [PATCH 25/33] Travis for Python 3.7 --- .travis.yml | 9 +++++++-- src/Sastrawi/Stemmer/CachedStemmer.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e00cd16..87a5c32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,13 @@ python: - "3.3" - "3.4" - "3.5" - - "3.7" sudo: false +# Enable 3.7 without globally enabling sudo and dist: xenial for other build jobs +matrix: + include: + - python: 3.7 + dist: xenial + sudo: true install: - pip install python-coveralls - pip install coveralls @@ -14,4 +19,4 @@ script: nosetests tests --verbose --with-coverage after_success: - coveralls notifications: - email: false + email: false \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index 8979114..e052077 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -36,6 +36,8 @@ def stem_word(self, word): def stem_tokens(self, tokens): stemmed_tokens = [] for token in tokens: + if not token or token.strip() == '': + continue stemmed_tokens.append(self.stem_word(token)) return stemmed_tokens From ae3bc914979fb5eeada827dc8bf9cf8913779551 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 04:13:44 +0700 Subject: [PATCH 26/33] add test case --- .../Dictionary/array_dictionary_test.py | 7 +++++++ .../UnitTests/Stemmer/stemmer_factory_test.py | 19 +++++++++++++++++++ .../stop_word_remover_factory_test.py | 11 +++++++++++ 3 files changed, 37 insertions(+) diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 57d72fc..8eb63f8 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -36,5 +36,12 @@ def test_constructor_preserve_words(self): self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) + def test_dict_param(self): + dictionary = ArrayDictionary({'word1':'word1', 'word2':'word2'}) + self.assertTrue(dictionary.contains('word1')) + self.assertTrue(dictionary.contains('word2')) + self.assertFalse(dictionary.contains('word3')) + self.assertEqual(2, dictionary.count()) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index f4f12cf..c3eb4ff 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -46,5 +46,24 @@ def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() + def test_word_stemming(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer() + self.assertEqual('besar', stemmer.stem('terbesar')) + self.assertEqual('abai', stemmer.stem('diabaikan')) + + def test_tokens_stemming(self): + factory = StemmerFactory() + stemmer = factory.create_stemmer() + tokens = ['perekonomian', 'indonesia', 'sedang', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] + clean_tokens = stemmer.stem_tokens(tokens) + self.assertEqual('ekonomi', clean_tokens[0]) + self.assertEqual('indonesia', clean_tokens[1]) + self.assertEqual('sedang', clean_tokens[2]) + self.assertEqual('dalam', clean_tokens[3]) + self.assertEqual('tumbuh', clean_tokens[4]) + self.assertEqual('yang', clean_tokens[5]) + self.assertEqual('bangga', clean_tokens[6]) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py index 0ef6695..8c82336 100644 --- a/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py +++ b/tests/UnitTests/StopWordRemover/stop_word_remover_factory_test.py @@ -15,6 +15,17 @@ def test_stopwordRemoval(self): sremover = self.factory.create_stop_word_remover() self.assertEqual('pergi sekolah', sremover.remove('pergi ke sekolah yang')) self.assertEqual('makan rumah', sremover.remove('makan di rumah yang')) + + def test_tokens_stopwordRemoval(self): + tokens = ['pergi', 'ke', 'sekolah', 'yang', 'bagus', 'adalah', 'impian'] + sremover = self.factory.create_stop_word_remover() + clean_tokens = sremover.remove_tokens(tokens) + text = ' '.join(clean_tokens) + self.assertEquals('pergi sekolah bagus impian', text) + self.assertEqual('pergi', clean_tokens[0]) + self.assertEqual('sekolah', clean_tokens[1]) + self.assertEqual('bagus', clean_tokens[2]) + self.assertEqual('impian', clean_tokens[3]) def test_execution_time(self): start = time.time() From 9fc1b3eb9dee4f135064c934388fda5ecf1c0fc1 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 04:45:15 +0700 Subject: [PATCH 27/33] Add Test Case --- tests/UnitTests/Dictionary/array_dictionary_test.py | 4 ++++ tests/UnitTests/Stemmer/stemmer_factory_test.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 8eb63f8..2d736ed 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -43,5 +43,9 @@ def test_dict_param(self): self.assertFalse(dictionary.contains('word3')) self.assertEqual(2, dictionary.count()) + def test_non_dict_list(self): + dictionary = ArrayDictionary('$$%&**&(^&') + self.assertTrue(0, dictionary.count()) + if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index c3eb4ff..3ae5394 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -55,7 +55,7 @@ def test_word_stemming(self): def test_tokens_stemming(self): factory = StemmerFactory() stemmer = factory.create_stemmer() - tokens = ['perekonomian', 'indonesia', 'sedang', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] + tokens = ['perekonomian', '', 'indonesia', 'sedang', ' ', 'dalam', 'pertumbuhan' ,'yang', 'membanggakan'] clean_tokens = stemmer.stem_tokens(tokens) self.assertEqual('ekonomi', clean_tokens[0]) self.assertEqual('indonesia', clean_tokens[1]) From 15fe5d620b8c3731e8b307d05549eecd2ddc8dc9 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 05:23:34 +0700 Subject: [PATCH 28/33] Test Case --- .../Dictionary/array_dictionary_test.py | 7 ++++++- .../UnitTests/Stemmer/stemmer_factory_test.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 2d736ed..049ba94 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -36,16 +36,21 @@ def test_constructor_preserve_words(self): self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) + # Test ArrayDictionary dengan tipe data dict + # @author Mufid Jamaluddin def test_dict_param(self): dictionary = ArrayDictionary({'word1':'word1', 'word2':'word2'}) self.assertTrue(dictionary.contains('word1')) self.assertTrue(dictionary.contains('word2')) self.assertFalse(dictionary.contains('word3')) self.assertEqual(2, dictionary.count()) + dictionary.add_words('word3') + self.assertTrue(dictionary.contains('word3')) + self.assertEqual(3, dictionary.count()) def test_non_dict_list(self): dictionary = ArrayDictionary('$$%&**&(^&') - self.assertTrue(0, dictionary.count()) + self.assertEqual(0, dictionary.count()) if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 3ae5394..045b17d 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -24,34 +24,43 @@ def test_fungsional(self): if output != expected: raise AssertionError(str.format('output is {} instead of {}', output, expected)) + # Test Waktu Stemming < 3 detik + # @author Mufid Jamaluddin def test_execution_time(self): start = time.time() sentence = 'Rakyat memenuHi halaMan geDung DPR unTuk menyuarakan isi hatinya. Saat Itu, situasi sangat genting sekali. Terjadi kerusuhan yang mengiringi pergerakan mahasiswa yang memperjuangkan reformasi.' factory = StemmerFactory() stemmer = factory.create_stemmer() - stemmer.stem(sentence) end = time.time() execution_time = end - start - - # print(execution_time) - - # test execution time < 3 seconds self.assertTrue(execution_time < 3) def test_getWordsFromFile(self): factory = StemmerFactory() factory.get_words_from_file() + # Test Stemming per Kata + # @author Mufid Jamaluddin def test_word_stemming(self): factory = StemmerFactory() stemmer = factory.create_stemmer() self.assertEqual('besar', stemmer.stem('terbesar')) self.assertEqual('abai', stemmer.stem('diabaikan')) + # Test Stemming dengan isDev=True (No Cache) + # @author Mufid Jamaluddin + def test_word_stemmingdev(self): + factory = StemmerFactory(isDev=True) + stemmer = factory.create_stemmer() + self.assertEqual('besar', stemmer.stem('terbesar')) + self.assertEqual('abai', stemmer.stem('diabaikan')) + + # Test Stemming dengan list tokens + # @author Mufid Jamaluddin def test_tokens_stemming(self): factory = StemmerFactory() stemmer = factory.create_stemmer() From 3e4151a63d7ac781c2d31fb5e7ac47f4b65bba45 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 05:50:04 +0700 Subject: [PATCH 29/33] Define Abstract Method & Update Test Case --- src/Sastrawi/Dictionary/ArrayDictionary.py | 4 +++- .../Dictionary/DictionaryInterface.py | 8 +++++-- src/Sastrawi/Stemmer/Cache/CacheInterface.py | 14 ++++++++----- .../Stemmer/Context/ContextInterface.py | 21 ++++++++++++------- .../Stemmer/Context/RemovalInterface.py | 10 ++++++++- .../Dictionary/array_dictionary_test.py | 3 ++- 6 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index a476e52..814cf6b 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -1,4 +1,6 @@ -class ArrayDictionary(object): +from Sastrawi.Dictionary.DictionaryInterface import DictionaryInterface + +class ArrayDictionary(DictionaryInterface): """description of class""" def __init__(self, words=None): diff --git a/src/Sastrawi/Dictionary/DictionaryInterface.py b/src/Sastrawi/Dictionary/DictionaryInterface.py index f899ae4..79bdf05 100644 --- a/src/Sastrawi/Dictionary/DictionaryInterface.py +++ b/src/Sastrawi/Dictionary/DictionaryInterface.py @@ -1,5 +1,9 @@ -class DictionaryInterface(object): +from abc import ABCMeta, abstractmethod + +class DictionaryInterface: """Interface definition of dictionary""" + __metaclass__ = ABCMeta + @abstractmethod def contains(self, word): - raise NotImplementedError('you must implement this method manually') \ No newline at end of file + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py index cbed596..8dde964 100644 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ b/src/Sastrawi/Stemmer/Cache/CacheInterface.py @@ -1,13 +1,17 @@ -class CacheInterface(object): - """description of class""" +from abc import ABCMeta, abstractmethod +class CacheInterface: + """description of abs class""" + __metaclass__ = ABCMeta + + @abstractmethod def has(self, key): pass + @abstractmethod def set(self, key, value): pass + @abstractmethod def get(self, key): - pass - - + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Context/ContextInterface.py b/src/Sastrawi/Stemmer/Context/ContextInterface.py index 180c6d0..747dfd2 100644 --- a/src/Sastrawi/Stemmer/Context/ContextInterface.py +++ b/src/Sastrawi/Stemmer/Context/ContextInterface.py @@ -1,30 +1,37 @@ -class ContextInterface(object): - """description of class""" +from abc import ABCMeta, abstractmethod +class ContextInterface: + """description of abs class""" + __metaclass__ = ABCMeta + + @abstractmethod def getOriginalWord(self): pass + @abstractmethod def setCurrentWord(self, word): pass + @abstractmethod def getCurrentWord(self): pass + @abstractmethod def getDictionary(self): pass + @abstractmethod def stopProcess(self): pass + @abstractmethod def processIsStopped(self): pass + @abstractmethod def addRemoval(self, removal): pass + @abstractmethod def getRemovals(self): - pass - - - - + pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Context/RemovalInterface.py b/src/Sastrawi/Stemmer/Context/RemovalInterface.py index 93b6171..7284597 100644 --- a/src/Sastrawi/Stemmer/Context/RemovalInterface.py +++ b/src/Sastrawi/Stemmer/Context/RemovalInterface.py @@ -1,18 +1,26 @@ -class RemovalInterface(object): +from abc import ABCMeta, abstractmethod + +class RemovalInterface: """description of class""" + __metaclass__ = ABCMeta + @abstractmethod def get_visitor(self): pass + @abstractmethod def get_subject(self): pass + @abstractmethod def get_result(self): pass + @abstractmethod def get_removed_part(self): pass + @abstractmethod def get_affix_type(self): pass diff --git a/tests/UnitTests/Dictionary/array_dictionary_test.py b/tests/UnitTests/Dictionary/array_dictionary_test.py index 049ba94..42c5580 100644 --- a/tests/UnitTests/Dictionary/array_dictionary_test.py +++ b/tests/UnitTests/Dictionary/array_dictionary_test.py @@ -44,7 +44,8 @@ def test_dict_param(self): self.assertTrue(dictionary.contains('word2')) self.assertFalse(dictionary.contains('word3')) self.assertEqual(2, dictionary.count()) - dictionary.add_words('word3') + dictionary.add('word3') + dictionary.add(' ') self.assertTrue(dictionary.contains('word3')) self.assertEqual(3, dictionary.count()) From 6d9fd872083038499b95897e04da254d03210ae1 Mon Sep 17 00:00:00 2001 From: Mufid Jamaluddin Date: Sat, 16 Mar 2019 06:01:24 +0700 Subject: [PATCH 30/33] Minor --- src/Sastrawi/Dictionary/DictionaryInterface.py | 3 +++ src/Sastrawi/Stemmer/Cache/CacheInterface.py | 3 +++ src/Sastrawi/Stemmer/Context/ContextInterface.py | 3 +++ src/Sastrawi/Stemmer/Context/RemovalInterface.py | 3 +++ tests/UnitTests/Stemmer/stemmer_factory_test.py | 4 ++-- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Sastrawi/Dictionary/DictionaryInterface.py b/src/Sastrawi/Dictionary/DictionaryInterface.py index 79bdf05..46433b6 100644 --- a/src/Sastrawi/Dictionary/DictionaryInterface.py +++ b/src/Sastrawi/Dictionary/DictionaryInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class DictionaryInterface: diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py index 8dde964..e869c84 100644 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ b/src/Sastrawi/Stemmer/Cache/CacheInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class CacheInterface: diff --git a/src/Sastrawi/Stemmer/Context/ContextInterface.py b/src/Sastrawi/Stemmer/Context/ContextInterface.py index 747dfd2..5a3b7be 100644 --- a/src/Sastrawi/Stemmer/Context/ContextInterface.py +++ b/src/Sastrawi/Stemmer/Context/ContextInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class ContextInterface: diff --git a/src/Sastrawi/Stemmer/Context/RemovalInterface.py b/src/Sastrawi/Stemmer/Context/RemovalInterface.py index 7284597..a94a18f 100644 --- a/src/Sastrawi/Stemmer/Context/RemovalInterface.py +++ b/src/Sastrawi/Stemmer/Context/RemovalInterface.py @@ -1,3 +1,6 @@ +# @update_by Mufid Jamaluddin +# @update_date 16/03/2019 + from abc import ABCMeta, abstractmethod class RemovalInterface: diff --git a/tests/UnitTests/Stemmer/stemmer_factory_test.py b/tests/UnitTests/Stemmer/stemmer_factory_test.py index 045b17d..9df96b7 100644 --- a/tests/UnitTests/Stemmer/stemmer_factory_test.py +++ b/tests/UnitTests/Stemmer/stemmer_factory_test.py @@ -54,8 +54,8 @@ def test_word_stemming(self): # Test Stemming dengan isDev=True (No Cache) # @author Mufid Jamaluddin def test_word_stemmingdev(self): - factory = StemmerFactory(isDev=True) - stemmer = factory.create_stemmer() + factory = StemmerFactory() + stemmer = factory.create_stemmer(isDev=True) self.assertEqual('besar', stemmer.stem('terbesar')) self.assertEqual('abai', stemmer.stem('diabaikan')) From 8bfc448b1bb2d3b119e60e4bcf9028f80c58e45b Mon Sep 17 00:00:00 2001 From: mufidjamaluddin Date: Fri, 19 Apr 2019 13:19:32 +0700 Subject: [PATCH 31/33] LruCache --- src/Sastrawi/Stemmer/Cache/ArrayCache.py | 19 --------- src/Sastrawi/Stemmer/Cache/CacheInterface.py | 20 --------- src/Sastrawi/Stemmer/Cache/__init__.py | 0 src/Sastrawi/Stemmer/CachedStemmer.py | 45 -------------------- src/Sastrawi/Stemmer/Stemmer.py | 12 ++++++ src/Sastrawi/Stemmer/StemmerFactory.py | 18 +------- 6 files changed, 13 insertions(+), 101 deletions(-) delete mode 100644 src/Sastrawi/Stemmer/Cache/ArrayCache.py delete mode 100644 src/Sastrawi/Stemmer/Cache/CacheInterface.py delete mode 100644 src/Sastrawi/Stemmer/Cache/__init__.py delete mode 100644 src/Sastrawi/Stemmer/CachedStemmer.py diff --git a/src/Sastrawi/Stemmer/Cache/ArrayCache.py b/src/Sastrawi/Stemmer/Cache/ArrayCache.py deleted file mode 100644 index 35ff2b3..0000000 --- a/src/Sastrawi/Stemmer/Cache/ArrayCache.py +++ /dev/null @@ -1,19 +0,0 @@ -from Sastrawi.Stemmer.Cache.CacheInterface import CacheInterface - -class ArrayCache(CacheInterface): - """description of class""" - - def __init__(self): - self.data = {} - - def set(self, key, value): - self.data[key] = value - - def get(self, key): - if key in self.data: - return self.data[key] - - def has(self, key): - return key in self.data - - diff --git a/src/Sastrawi/Stemmer/Cache/CacheInterface.py b/src/Sastrawi/Stemmer/Cache/CacheInterface.py deleted file mode 100644 index e869c84..0000000 --- a/src/Sastrawi/Stemmer/Cache/CacheInterface.py +++ /dev/null @@ -1,20 +0,0 @@ -# @update_by Mufid Jamaluddin -# @update_date 16/03/2019 - -from abc import ABCMeta, abstractmethod - -class CacheInterface: - """description of abs class""" - __metaclass__ = ABCMeta - - @abstractmethod - def has(self, key): - pass - - @abstractmethod - def set(self, key, value): - pass - - @abstractmethod - def get(self, key): - pass \ No newline at end of file diff --git a/src/Sastrawi/Stemmer/Cache/__init__.py b/src/Sastrawi/Stemmer/Cache/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py deleted file mode 100644 index e052077..0000000 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ /dev/null @@ -1,45 +0,0 @@ -#from Sastrawi.Stemmer.StemmerInterface import StemmerInterface -from Sastrawi.Stemmer.Filter import TextNormalizer - -class CachedStemmer(object): - """description of class""" - def __init__(self, cache, delegatedStemmer): - self.cache = cache - self.delegatedStemmer = delegatedStemmer - - def stem(self, text): - normalizedText = TextNormalizer.normalize_text(text) - - words = normalizedText.split(' ') - stems = [] - - for word in words: - if self.cache.has(word): - stems.append(self.cache.get(word)) - else: - stem = self.delegatedStemmer.stem_word(word) - self.cache.set(word, stem) - stems.append(stem) - - return ' '.join(stems) - - def stem_word(self, word): - if self.cache.has(word): - return self.cache.get(word) - else: - stem = self.delegatedStemmer.stem_word(word) - self.cache.set(word, stem) - return stem - - # Stemming word in Tokens - # @author Mufid Jamaluddin - def stem_tokens(self, tokens): - stemmed_tokens = [] - for token in tokens: - if not token or token.strip() == '': - continue - stemmed_tokens.append(self.stem_word(token)) - return stemmed_tokens - - def get_cache(self): - return self.cache diff --git a/src/Sastrawi/Stemmer/Stemmer.py b/src/Sastrawi/Stemmer/Stemmer.py index cb196c7..b465bac 100644 --- a/src/Sastrawi/Stemmer/Stemmer.py +++ b/src/Sastrawi/Stemmer/Stemmer.py @@ -2,6 +2,7 @@ from Sastrawi.Stemmer.Context.Visitor.VisitorProvider import VisitorProvider from Sastrawi.Stemmer.Filter import TextNormalizer from Sastrawi.Stemmer.Context.Context import Context +from cachetools import cached, LRUCache class Stemmer(object): """Indonesian Stemmer. @@ -28,6 +29,7 @@ def stem(self, text): return ' '.join(stems) + @cached(cache=LRUCache(maxsize=128)) def stem_word(self, word): """Stem a word to its common stem form.""" if self.is_plural(word): @@ -35,6 +37,16 @@ def stem_word(self, word): else: return self.stem_singular_word(word) + # Stemming word in Tokens + # @author Mufid Jamaluddin + def stem_tokens(self, tokens): + stemmed_tokens = [] + for token in tokens: + if not token or token.strip() == '': + continue + stemmed_tokens.append(self.stem_word(token)) + return stemmed_tokens + def is_plural(self, word): #-ku|-mu|-nya #nikmat-Ku, etc diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 1b861be..dd16bb5 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -2,12 +2,9 @@ from cachetools import cached, LRUCache from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary from Sastrawi.Stemmer.Stemmer import Stemmer -from Sastrawi.Stemmer.CachedStemmer import CachedStemmer -from Sastrawi.Stemmer.Cache.ArrayCache import ArrayCache class StemmerFactory(object): """ Stemmer factory helps creating pre-configured stemmer """ - #APC_KEY = 'sastrawi_cache_dictionary' def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ @@ -19,10 +16,7 @@ def create_stemmer(self, isDev=False): stemmer = Stemmer(dictionary) - resultCache = ArrayCache() - cachedStemmer = CachedStemmer(resultCache, stemmer) - - return cachedStemmer + return stemmer @cached(cache=LRUCache(maxsize=32)) def get_prod_words_dictionary(self): @@ -30,16 +24,6 @@ def get_prod_words_dictionary(self): dictionary = ArrayDictionary(words) return dictionary - #def get_words(self, isDev=False): - #if isDev or callable(getattr(self, 'apc_fetch')): - # words = self.getWordsFromFile() - #else: - # words = apc_fetch(self.APC_KEY) - # if not words: - # words = self.getWordsFromFile() - # apc_store(self.APC_KEY, words) - # return self.get_words_from_file() - def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) dictionaryFile = current_dir + '/data/kata-dasar.txt' From 34708981c0ed78c4ddf07e73d212e8764a14b93c Mon Sep 17 00:00:00 2001 From: mufidjamaluddin Date: Fri, 19 Apr 2019 13:33:14 +0700 Subject: [PATCH 32/33] minor --- src/Sastrawi/Stemmer/Context/Context.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Sastrawi/Stemmer/Context/Context.py b/src/Sastrawi/Stemmer/Context/Context.py index 18223ee..6667714 100644 --- a/src/Sastrawi/Stemmer/Context/Context.py +++ b/src/Sastrawi/Stemmer/Context/Context.py @@ -160,5 +160,4 @@ def restore_prefix(self): for removal in self.removals: if removal.get_affix_type() == 'DP': - self.removals.remove(removal) - + self.removals.remove(removal) \ No newline at end of file From 169edcf5d9b19b109672dbb2528877650a95f410 Mon Sep 17 00:00:00 2001 From: mufidjamaluddin Date: Fri, 19 Apr 2019 13:45:52 +0700 Subject: [PATCH 33/33] remove lrucache stemword --- src/Sastrawi/Stemmer/Stemmer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Sastrawi/Stemmer/Stemmer.py b/src/Sastrawi/Stemmer/Stemmer.py index b465bac..1f3135e 100644 --- a/src/Sastrawi/Stemmer/Stemmer.py +++ b/src/Sastrawi/Stemmer/Stemmer.py @@ -29,7 +29,6 @@ def stem(self, text): return ' '.join(stems) - @cached(cache=LRUCache(maxsize=128)) def stem_word(self, word): """Stem a word to its common stem form.""" if self.is_plural(word):