Skip to content

Commit

Permalink
Support Korea morphological analyzer through mecab (#1860)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Use the libmecab embedded within ijma. The dictionary is generated by
following the instructions at
https://bitbucket.org/eunjeon/mecab-ko-dic

Issue link:#1228

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
  • Loading branch information
yingfeng authored Sep 11, 2024
1 parent 55b0977 commit 81d7c7f
Show file tree
Hide file tree
Showing 10 changed files with 285 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/references/http_api_reference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,7 @@ curl --request POST \
- `"chinese"`: Simplified Chinese
- `"tradition"`: Traditional Chinese
- `"japanese"`: Japanese
- `"korea"`: Korean
- `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram)
- Parameter settings for a secondary index:
- `"type"`: `"secondary"`
Expand Down
1 change: 1 addition & 0 deletions docs/references/pysdk_api_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,7 @@ An `IndexInfo` structure contains three fields,`column_name`, `index_type`, and
- `"chinese"`: Simplified Chinese
- `"tradition"`: Traditional Chinese
- `"japanese"`: Japanese
- `"korea"`: Korean
- `"ngram"`: [N-gram](https://en.wikipedia.org/wiki/N-gram)
- Parameter settings for a secondary index:
No parameters are required. For now, use an empty list `[]`.
Expand Down
22 changes: 22 additions & 0 deletions src/common/analyzer/analyzer_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import stemmer;
import chinese_analyzer;
import traditional_chinese_analyzer;
import japanese_analyzer;
import korea_analyzer;
import standard_analyzer;
import ngram_analyzer;
import logger;
Expand Down Expand Up @@ -150,6 +151,27 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
}
return {MakeUnique<JapaneseAnalyzer>(*reinterpret_cast<JapaneseAnalyzer *>(prototype)), Status::OK()};
}
// Korea analyzer: a loaded prototype is kept in cache_ and every request
// (including the one that creates the prototype) returns a copy of it.
case Str2Int(KOREA.data()): {
Analyzer *prototype = cache_[KOREA].get();
if (prototype == nullptr) {
// No prototype cached yet: resolve the resource directory, then build and load one.
String path;
Config *config = InfinityContext::instance().config();
if (config == nullptr) {
// InfinityContext has not been initialized.
// Fall back to the hard-coded default resource directory.
path = "/var/infinity/resource";
} else {
path = config->ResourcePath();
}
UniquePtr<KoreaAnalyzer> analyzer = MakeUnique<KoreaAnalyzer>(std::move(path));
Status load_status = analyzer->Load();
if (!load_status.ok()) {
// Loading failed: nothing is cached and the load status is returned to the caller.
return {nullptr, load_status};
}
// Keep a raw pointer for the copy below; ownership moves into cache_.
prototype = analyzer.get();
cache_[KOREA] = std::move(analyzer);
}
// Hand out a copy-constructed analyzer rather than the cached prototype itself.
return {MakeUnique<KoreaAnalyzer>(*reinterpret_cast<KoreaAnalyzer *>(prototype)), Status::OK()};
}
case Str2Int(STANDARD.data()): {
UniquePtr<StandardAnalyzer> analyzer = MakeUnique<StandardAnalyzer>();
Language lang = STEM_LANG_ENGLISH;
Expand Down
1 change: 1 addition & 0 deletions src/common/analyzer/analyzer_pool.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public:
static constexpr std::string_view CHINESE = "chinese";
static constexpr std::string_view TRADITIONALCHINESE = "tradition";
static constexpr std::string_view JAPANESE = "japanese";
static constexpr std::string_view KOREA = "korea";
static constexpr std::string_view STANDARD = "standard";
static constexpr std::string_view NGRAM = "ngram";

Expand Down
3 changes: 2 additions & 1 deletion src/common/analyzer/ijma.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ module;
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <ijma.h>
#include <mecab_wrapper.h>
#pragma clang diagnostic pop

export module ijma;
Expand All @@ -26,6 +27,6 @@ export namespace jma {

using jma::Analyzer;
using jma::Knowledge;
using jma::MeCab;
using jma::Sentence;

} // namespace jma
73 changes: 73 additions & 0 deletions src/common/analyzer/korea_analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <mecab_wrapper.h>
#pragma clang diagnostic pop

#include <cstring>
#include <filesystem>
#include <iostream>
import stl;
import term;
import analyzer;
import common_analyzer;
import logger;
import status;
import ijma;

module korea_analyzer;

namespace fs = std::filesystem;

namespace infinity {
// Dictionary directory, relative to the base path given to the KoreaAnalyzer constructor.
static const String KNOWLEDGE_PATH = "mecab/ko-dic";

/// Prepares an analyzer rooted at `base_path`.
///
/// Only the MeCab option string ("-d <base_path>/mecab/ko-dic") is computed here;
/// the MeCab instance itself is created later by Load().
KoreaAnalyzer::KoreaAnalyzer(const String &base_path) {
    cjk_ = true;
    own_mecab_ = true;

    const fs::path dict_dir = fs::path(base_path) / KNOWLEDGE_PATH;
    knowledge_path_ = "-d " + dict_dir.string();
}

/// Copy constructor: creates a non-owning analyzer that reuses the MeCab
/// instance already loaded by `other`.
///
/// Without copying the pointer, the copy would keep the default null `mecab_`
/// and dereference it on the first Parse()/NextToken() call.
///
/// NOTE(review): the copy borrows `other`'s MeCab instance (own_mecab_ is false),
/// so `other` has to outlive the copy, and the two share the tokenizer's cursor
/// state — confirm copies are not used concurrently.
KoreaAnalyzer::KoreaAnalyzer(const KoreaAnalyzer &other) {
    cjk_ = true;
    knowledge_path_ = other.knowledge_path_;
    mecab_ = other.mecab_;
    own_mecab_ = false; // borrowed: the destructor must not delete it
    SetCaseSensitive(false);
}

/// Frees the MeCab instance, but only when this analyzer owns it.
KoreaAnalyzer::~KoreaAnalyzer() {
    if (!own_mecab_) {
        return;
    }
    delete mecab_;
}

/// Creates the MeCab instance from the option string computed in the constructor.
///
/// @return Status::OK() on success; Status::InvalidAnalyzerFile when the MeCab
///         wrapper reports a failure through std::logic_error.
///
/// After a successful call this analyzer owns the instance it created, so a
/// repeated Load() (or a Load() on a non-owning copy) no longer leaks it.
Status KoreaAnalyzer::Load() {
    jma::MeCab *loaded = nullptr;
    try {
        loaded = new jma::MeCab(knowledge_path_);
    } catch (const std::logic_error &) { // catch by reference: no copy, no slicing
        return Status::InvalidAnalyzerFile("Failed to load Korea analyzer");
    }

    // Release an instance created by an earlier Load(); never free a borrowed one.
    if (own_mecab_) {
        delete mecab_;
    }
    mecab_ = loaded;
    own_mecab_ = true;

    SetCaseSensitive(false);

    return Status::OK();
}

} // namespace infinity
80 changes: 80 additions & 0 deletions src/common/analyzer/korea_analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

export module korea_analyzer;

import stl;
import ijma;
import term;
import common_analyzer;
import status;

namespace infinity {

/// Analyzer that tokenizes text through the jma::MeCab wrapper.
///
/// The constructor only records where the dictionary lives; Load() creates the
/// MeCab instance that Parse()/NextToken() dereference.
export class KoreaAnalyzer : public CommonLanguageAnalyzer {
public:
    /// @param path resource base directory used to locate the dictionary.
    KoreaAnalyzer(const String &path);

    KoreaAnalyzer(const KoreaAnalyzer &other);

    ~KoreaAnalyzer();

    /// Creates the MeCab instance; returns a non-OK status on failure.
    Status Load();

protected:
    /// Feeds `input` to MeCab and rewinds the token cursor.
    void Parse(const String &input) override {
        mecab_->SetSentence(input);
        local_offset_ = -1;

        ResetToken();
    }

    /// Advances to the next node and publishes it as the current token.
    /// @return false (after resetting the token state) once the sentence is exhausted.
    bool NextToken() override {
        if (!DoNext()) {
            ResetToken();
            return false;
        }

        mecab_->GetToken(token_str_);
        token_ = token_str_.c_str();
        len_ = token_str_.size();
        offset_ = local_offset_;
        is_index_ = true;
        return true;
    }

    /// Delegates to MeCab's classification of the current node.
    bool IsAlpha() override { return mecab_->IsAlpha(); }

    /// No token is ever reported as a special character.
    bool IsSpecialChar() override { return false; }

private:
    /// Moves MeCab one node forward and bumps the token offset.
    /// @return false when MeCab already reports the end of the sentence.
    // NOTE(review): the node reached by Next() may itself be the end-of-sentence
    // node, in which case NextToken() reports it as a token — confirm intended.
    bool DoNext() {
        if (mecab_->IsEnd()) {
            return false;
        }
        mecab_->Next();
        ++local_offset_;
        return true;
    }

    // MeCab option string ("-d <dictionary dir>").
    String knowledge_path_;

    // Tokenizer backend; null until Load() has run.
    jma::MeCab *mecab_{nullptr};

    // Whether the destructor is responsible for deleting mecab_.
    bool own_mecab_;

    // Backing storage for the token_ pointer handed to the base class.
    String token_str_;
};
} // namespace infinity
File renamed without changes.
38 changes: 38 additions & 0 deletions third_party/ijma/include/mecab_wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include <string>
#include <vector>

#include "mecab.h"

namespace jma {

/// Thin wrapper around a ::MeCab::Tagger that also keeps a cursor (`node_`)
/// over the nodes of the most recently parsed sentence.
///
/// The wrapper owns the tagger and deletes it in its destructor, so copying is
/// disabled: an implicit copy would delete the same tagger twice.
class MeCab {
public:
    /// @param option MeCab option string passed to ::MeCab::createTagger
    ///        (for example "-d <dictionary dir>").
    explicit MeCab(const std::string &option);

    ~MeCab();

    MeCab(const MeCab &) = delete;
    MeCab &operator=(const MeCab &) = delete;

    /// Parses `str` and appends the space-separated pieces of the tagger's
    /// output to `out`. A `str_len` of 0 means "use strlen(str)".
    /// @return false when the tagger returns no output.
    bool Parse(std::vector<std::string> &out, const char *str, size_t str_len = 0);

    /// Parses `str` into a node list and positions the cursor on its first node.
    void SetSentence(const std::string &str);

    /// True when the current node is at least 2 bytes long and its feature starts with "SL".
    bool IsAlpha() const;

    /// True when the current node's feature starts with "SC".
    bool IsSpecial() const;

    /// True when there is no current node or the current node is the end-of-sentence node.
    bool IsEnd() const;

    /// Moves the cursor to the following node; requires a current node.
    void Next();

    /// Stores the current node's surface text in `out`.
    void GetToken(std::string &out);

    /// Returns the current node's feature string.
    std::string GetFeature();

private:
    ::MeCab::Tagger *tagger_{nullptr};  // owned; released in the destructor
    const ::MeCab::Node *node_{nullptr}; // cursor into the last parsed sentence
    char buf_[1024];                    // scratch buffer used by GetToken
};

} // namespace jma
67 changes: 67 additions & 0 deletions third_party/ijma/src/mecab_wrapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "mecab_wrapper.h"

#include <cstring>
#include <stdexcept>

#include "mecab.h"

namespace jma {

/// Creates the underlying tagger from a MeCab option string.
///
/// ::MeCab::createTagger signals failure (e.g. a missing dictionary) by
/// returning a null pointer. That is turned into std::logic_error here, the
/// exception type KoreaAnalyzer::Load() catches, so a bad dictionary path is
/// reported at load time rather than crashing on the first parse.
///
/// @throws std::logic_error when the tagger cannot be created.
MeCab::MeCab(const std::string &option) : tagger_(::MeCab::createTagger(option.c_str())) {
    if (tagger_ == nullptr) {
        throw std::logic_error("MeCab: failed to create tagger with option: " + option);
    }
}

/// Releases the tagger created in the constructor.
MeCab::~MeCab() {
    delete tagger_;
}

/// Runs the tagger over `str` and appends the pieces of its textual output to `out`.
///
/// Leading blanks (space, newline, carriage return, tab) before a piece are
/// skipped; a piece ends at the next space or at the end of the output.
///
/// @param str     input text
/// @param str_len length of `str`; 0 means "measure it with strlen"
/// @return false when the tagger produced no output, true otherwise
bool MeCab::Parse(std::vector<std::string> &out, const char *str, size_t str_len) {
    const size_t input_len = (str_len != 0) ? str_len : strlen(str);

    const char *cursor = tagger_->parse(str, input_len);
    if (cursor == nullptr) {
        return false;
    }

    const auto is_blank = [](char c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; };

    while (*cursor != '\0') {
        if (is_blank(*cursor)) {
            ++cursor;
            continue;
        }
        const char *space = strchr(cursor, ' ');
        if (space == nullptr) {
            // Last piece: everything up to the terminating NUL.
            out.emplace_back(cursor);
            break;
        }
        out.emplace_back(cursor, space);
        cursor = space + 1;
    }
    return true;
}

/// Parses `str` into a node list and points the cursor at the node the tagger returns.
// NOTE(review): the nodes presumably reference `str`'s buffer rather than a copy,
// so `str` should stay alive while the nodes are iterated — confirm against MeCab docs.
void MeCab::SetSentence(const std::string &str) {
    const char *text = str.c_str();
    const size_t text_len = str.size();
    node_ = tagger_->parseToNode(text, text_len);
}

/// True when the current node is at least two bytes long and its feature
/// string begins with "SL".
bool MeCab::IsAlpha() const {
    const char *const tag = node_->feature;
    return node_->length >= 2 && tag[0] == 'S' && tag[1] == 'L';
}

/// True when the current node's feature string begins with "SC".
bool MeCab::IsSpecial() const {
    const char *const tag = node_->feature;
    if (tag[0] != 'S') {
        return false;
    }
    return tag[1] == 'C';
}

/// True when iteration is finished: either there is no current node or the
/// current node is the end-of-sentence marker.
bool MeCab::IsEnd() const {
    return node_ == nullptr || node_->stat == MECAB_EOS_NODE;
}

void MeCab::Next() {
// assert(node_);
node_ = node_->next;
}

/// Stores the surface text of the current node in `out`.
///
/// `surface` is not terminated at the token boundary; the token is exactly
/// `length` bytes long. Copying by length avoids the earlier strcpy into the
/// fixed 1024-byte scratch buffer, which copied the whole remainder of the
/// sentence and overflowed on long input.
void MeCab::GetToken(std::string &out) {
    out.assign(node_->surface, node_->length);
}

/// Returns a copy of the current node's feature string.
std::string MeCab::GetFeature() {
    return std::string(node_->feature);
}

} // namespace jma

0 comments on commit 81d7c7f

Please sign in to comment.