Skip to content

Commit

Permalink
Merge pull request #57 from Kensuke-Mitsuzawa/issue/#56
Browse files Browse the repository at this point in the history
added jumandic and unidic to mecab wrapper module
  • Loading branch information
Kensuke-Mitsuzawa authored Mar 25, 2019
2 parents 45af698 + cb7819a commit 3bdfb6b
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 63 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ Mykytea-python/
.DS_Store
*tox
.cache/
python/
python2/
104 changes: 66 additions & 38 deletions JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#! -*- coding: utf-8 -*-
# core mddule
# core module
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common.text_preprocess import normalize_text
from JapaneseTokenizer import init_logger
Expand All @@ -13,7 +13,7 @@
import six
from six import text_type
# typing
from typing import List, Dict, Tuple, Union, TypeVar, Callable
from typing import List, Tuple, Union, TypeVar, Callable
ContentsTypes = TypeVar('T')

__author__ = 'kensuke-mi'
Expand All @@ -30,27 +30,54 @@


class MecabWrapper(WrapperBase):
def __init__(self, dictType, pathUserDictCsv='', path_mecab_config=None, string_encoding='utf-8'):
# type: (text_type, text_type, text_type, text_type)->None
assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", None]
def __init__(self,
dictType,
pathUserDictCsv=None,
path_mecab_config=None,
path_dictionary=None,
string_encoding='utf-8'):
# type: (text_type, text_type, text_type, text_type, text_type)->None
"""
:param dictType: a dictionary type called by mecab
:param pathUserDictCsv: path to your original dictionary file
:param path_mecab_config: path to 'mecab-config' command. It's automatically detected if not given
:param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected
:param string_encoding: encoding option to parse command line result. This is mainly used for python2.x
"""
self.string_encoding = string_encoding
if dictType == 'all' or dictType == 'user': assert os.path.exists(pathUserDictCsv)
self._dictType = dictType
self._pathUserDictCsv = pathUserDictCsv
self._path_dictionary = path_dictionary
if path_mecab_config is None:
self._path_mecab_config = self.__get_path_to_mecab_config()
else:
self._path_mecab_config = path_mecab_config

self._dictType = dictType
self._pathUserDictCsv = pathUserDictCsv
self._mecab_dictionary_path = self.__check_mecab_dict_path()
if self._path_dictionary is not None:
assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.'
self._mecab_dictionary_path = None
else:
self._mecab_dictionary_path = self.__check_mecab_dict_path()

logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path))

self.mecabObj = self.__CallMecab()

assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \
'Dictionary Type Error. Your dict = {} is NOT available.'
if dictType == 'all':
logger.error('dictionary type "all" is deprecated from version1.6')
raise Exception('dictionary type "all" is deprecated from version1.6')
if dictType == 'user':
logger.error('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.')
raise Exception('dictionary type "all" is deprecated from version1.6. You just give path to dictionary csv.')

if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '':
assert os.path.exists(pathUserDictCsv), \
'Your user dictionary does NOT exist. Path={}'.format(pathUserDictCsv)

def __get_path_to_mecab_config(self):
"""* What you can do
- You get path into mecab-config
"""You get path into mecab-config
"""
if six.PY2:
path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config'])
Expand All @@ -62,7 +89,6 @@ def __get_path_to_mecab_config(self):
logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir))
return path_mecab_config_dir


def __check_mecab_dict_path(self):
"""check path to dict of Mecab in system environment
"""
Expand All @@ -78,16 +104,13 @@ def __check_mecab_dict_path(self):
logger.error("{}".format(mecab_dic_cmd))
raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config command")
if path_mecab_dict == '':
raise SystemError(
'mecab dictionary path is not found with following command: {} You are not able to use additional dictionary. Still you are able to call mecab default dictionary'.format(mecab_dic_cmd)
)
raise SystemError("""mecab dictionary path is not found with following command: {}
You are not able to use additional dictionary.
Still you are able to call mecab default dictionary""".format(mecab_dic_cmd))

return path_mecab_dict

def __check_mecab_libexe(self):
"""* What you can do
"""

mecab_libexe_cmd = "echo `{} --libexecdir`".format(os.path.join(self._path_mecab_config, 'mecab-config'))

try:
Expand All @@ -100,35 +123,40 @@ def __check_mecab_libexe(self):
logger.error("{}".format(mecab_libexe_cmd))
raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config --libexecdir")
if path_mecab_libexe == '':
raise SystemError('Mecab config is not callable with following command: {} You are not able to compile your user dictionary. Still, you are able to use default mecab dictionary.'.format(mecab_libexe_cmd))
raise SystemError("""Mecab config is not callable with following command: {}
You are not able to compile your user dictionary.
Still, you are able to use default mecab dictionary.""".format(mecab_libexe_cmd))

return path_mecab_libexe

def __CallMecab(self):
"""* What you can do
"""
if self._dictType == 'neologd':
if self._path_dictionary is not None and self._mecab_dictionary_path is None:
logger.debug('Use dictionary you specified.')
cmMecabInitialize = '-d {}'.format(self._path_dictionary)
elif self._dictType == 'neologd':
# use neologd
logger.debug('Use neologd additional dictionary')
cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd"))

elif self._dictType == 'all':
logger.debug('Use neologd additional dictionary')
pathUserDict = self.__CompileUserdict()
cmMecabInitialize = '-u {} -d {}'.format(pathUserDict,
os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd"))
elif self._dictType == 'ipadic':
logger.debug('Use ipadic additional dictionary')
elif self._dictType == 'ipadic' or self._dictType == 'ipaddic':
# use ipadic
logger.debug('Use ipadic dictionary')
cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "ipadic"))

elif self._dictType == 'user':
logger.debug('Use User dictionary')
pathUserDict = self.__CompileUserdict()
cmMecabInitialize = '-u {}'.format(pathUserDict)

elif six.PY2 is False and self._dictType == 'jumandic':
# use jumandic. This is impossible to call in Python2.x
logger.debug('Use jumandic dictionary')
cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "jumandic"))
elif six.PY2 and self._dictType == 'jumandic':
raise Exception('In python2.x, impossible to call jumandic.')
else:
logger.debug('Use no default dictionary')
cmMecabInitialize = ''

# execute compile if user dictionary is given
if self._pathUserDictCsv is not None:
logger.debug('Use User dictionary')
pathUserDict = self.__CompileUserdict()
cmMecabInitialize += ' -u {}'.format(pathUserDict)

if six.PY2:
cmMecabCall = "-Ochasen {}".format(cmMecabInitialize)
else:
Expand Down Expand Up @@ -246,7 +274,7 @@ def tokenize(self, sentence,
else:
pass

### decide normalization function depending on dictType
# decide normalization function depending on dictType
if func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid:
normalized_sentence = neologdn.normalize(sentence)
elif func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid == False:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
else:
raise NotImplementedError()

version = '1.5'
version = '1.6'
name = 'JapaneseTokenizer'
short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization'

Expand Down
14 changes: 13 additions & 1 deletion test/Dockerfile-dev
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ ENV MECAB_VERSION 0.996
ENV IPADIC_VERSION 2.7.0-20070801
ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE
ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
ENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM
ENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip
ENV build_deps 'curl git bash file sudo openssh gcc make build-base'
ENV dependencies 'openssl'

Expand Down Expand Up @@ -38,10 +40,20 @@ RUN apk add --update --no-cache ${build_deps} \
# Install Neologd
&& git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \
&& mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \
# Install jumandic
&& curl -SL -o jumandic.tar.gz ${jumandic_url} \
&& tar zxf jumandic.tar.gz \
&& cd mecab-jumandic-7.0-20130310 \
&& ./configure --with-charset=utf8 \
&& make \
&& make install \
# delete dictionary files
&& cd \
&& rm -rf \
mecab-${MECAB_VERSION}* \
mecab-${IPADIC_VERSION}* \
mecab-ipadic-neologd
mecab-ipadic-neologd \
mecab-jumandic-7.0-20130310

# general
RUN apk --no-cache add vim \
Expand Down
30 changes: 22 additions & 8 deletions test/test_mecab_wrapper_python2.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,32 @@ def test_default_parse(self):
for morph in parsed_obj:
assert isinstance(morph, string_types)


def test_init_userdict(self):
# test when user dictionary is called
mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
is_ok = False
for morph in parsed_obj:
if u'さくらまな' == morph:
is_ok = True
else:
pass
assert is_ok

def test_parse_jumandic(self):
with self.assertRaises(Exception):
mecab_obj = MecabWrapper(dictType='jumandic')
assert isinstance(mecab_obj, MecabWrapper)

def test_init_alldict(self):
"""* Test case
- すべての辞書を利用した場合の動作を確認する
"""
mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)

res = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
assert isinstance(res, list)
assert u'さくらまな' in res
with self.assertRaises(Exception):
mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)


if __name__ == '__main__':
unittest.main()
unittest.main()
49 changes: 35 additions & 14 deletions test/test_mecab_wrapper_python3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@ def setUp(self):
self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

def test_neologd_parse(self):
"""* Test case
- neologd辞書で正しく分割できることを確認する
"""
# test using neologd dictionary
mecab_obj = MecabWrapper(dictType='neologd')
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence)
self.assertTrue(parsed_obj, TokenizedSenetence)
Expand All @@ -31,9 +29,7 @@ def test_neologd_parse(self):
self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object()))

def test_default_parse(self):
"""* Test case
- デフォルトの状態で動作を確認する
"""
# test default status
dictType = "ipadic"
mecab_obj = MecabWrapper(dictType=dictType)
assert isinstance(mecab_obj, MecabWrapper)
Expand All @@ -48,19 +44,44 @@ def test_default_parse(self):
for morph in parsed_obj:
assert isinstance(morph, str)

def test_init_userdict(self):
"""* Test case
- すべての辞書を利用した場合の動作を確認する
"""
mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
def test_parse_jumandic(self):
mecab_obj = MecabWrapper(dictType='jumandic')
assert isinstance(mecab_obj, MecabWrapper)

res = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
assert isinstance(res, list)
assert 'さくらまな' in res
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
assert isinstance(parsed_obj, TokenizedSenetence)
for tokenized_obj in parsed_obj.tokenized_objects:
if tokenized_obj.word_stem == '女優':
# ドメイン:文化・芸術 is special output only in Jumandic
assert 'ドメイン:文化・芸術' in tokenized_obj.analyzed_line

def test_parse_userdic(self):
pass

def test_parse_dictionary_path(self):
# put path to dictionary and parse sentence.
path_default_ipadic = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
if os.path.exists(path_default_ipadic):
mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_default_ipadic)
assert mecab_obj._path_dictionary == path_default_ipadic
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
assert isinstance(parsed_obj, TokenizedSenetence)

def test_init_userdict(self):
# test that a word from the user dictionary appears in the tokenized result when used with ipadic.
mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
assert isinstance(parsed_obj, TokenizedSenetence)
is_ok = False
for tokenized_obj in parsed_obj.tokenized_objects:
if tokenized_obj.word_stem == 'さくらまな':
is_ok = True
assert is_ok


if __name__ == '__main__':
unittest.main()



10 changes: 9 additions & 1 deletion travis-mecab-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,13 @@ make
sudo make install
sudo ldconfig

wget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM'
tar zxfv jumandic.tar.gz
cd mecab-jumandic-7.0-20130310
./configure --with-charset=utf8
make
sudo make install
sudo ldconfig

cd $base_dir
rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801
rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310

0 comments on commit 3bdfb6b

Please sign in to comment.