From 6ff3a41cfaf7d1b7d185e8b55e78465dfd41e1b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=CC=88rg=20Prante?= Date: Mon, 22 Feb 2016 17:30:40 +0100 Subject: [PATCH] adding rule-based number formatter as token filter --- README.md | 3 +- build.gradle | 7 +- .../icu/ICUNumberFormatTokenFilter.java | 61 ++++++++++++++ .../icu/IcuAnalysisBinderProcessor.java | 1 + .../IcuNumberFormatTokenFilterFactory.java | 80 ++++++++++++++++++ .../analysis/icu/IcuNumberFormatTests.java | 82 +++++++++++++++++++ .../index/analysis/icu/icu_numberformat.json | 24 ++++++ 7 files changed, 251 insertions(+), 7 deletions(-) create mode 100644 src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java create mode 100644 src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java create mode 100644 src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java create mode 100644 src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json diff --git a/README.md b/README.md index d0790ce8..0e92047f 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ A plugin that consists of a compilation of useful Elasticsearch plugins related | Elasticsearch version | Plugin | Release date | | ------------------------ | ------------- | -------------| +| 2.2.0 | 2.2.0.1 | Feb 22, 2016 | | 2.2.0 | 2.2.0.0 | Feb 8, 2016 | | 2.1.1 | 2.1.1.2 | Dec 30, 2015 | | 2.1.1 | 2.1.1.0 | Dec 21, 2015 | @@ -49,7 +50,7 @@ A plugin that consists of a compilation of useful Elasticsearch plugins related ### Elasticsearch 2.x - ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-plugin-bundle/2.2.0.0/elasticsearch-plugin-bundle-2.2.0.0-plugin.zip + ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-plugin-bundle/2.2.0.1/elasticsearch-plugin-bundle-2.2.0.1-plugin.zip ### Elasticsearch 1.x diff --git a/build.gradle b/build.gradle index e74481ac..1999f323 100644 --- a/build.gradle +++ b/build.gradle @@ -1,6 +1,6 @@ def xbibGroup = 'org.xbib.elasticsearch.plugin' -def xbibVersion = '2.2.0.0' +def xbibVersion = '2.2.0.1' group = xbibGroup version = xbibVersion @@ -84,11 +84,6 @@ dependencies { wagon 'org.apache.maven.wagon:wagon-ssh-external:2.10' } -compileJava { - sourceCompatibility = 1.7 - targetCompatibility = 1.7 -} - tasks.withType(JavaCompile) { options.compilerArgs << "-Xlint:unchecked" << "-Xlint:deprecation" } diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java new file mode 100644 index 00000000..d53a9083 --- /dev/null +++ b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2016 Jörg Prante + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published + * by the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, see http://www.gnu.org/licenses + * or write to the Free Software Foundation, Inc., 51 Franklin Street, + * Fifth Floor, Boston, MA 02110-1301 USA. + * + * The interactive user interfaces in modified source and object code + * versions of this program must display Appropriate Legal Notices, + * as required under Section 5 of the GNU Affero General Public License. + * + */ +package org.xbib.elasticsearch.index.analysis.icu; + +import com.ibm.icu.text.NumberFormat; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import java.io.IOException; +import java.text.ParsePosition; + +public final class IcuNumberFormatTokenFilter extends TokenFilter { + + private final NumberFormat numberFormat; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) { + super(input); + this.numberFormat = numberFormat; + } + + @Override + public final boolean incrementToken() throws IOException { + if (!input.incrementToken()) { + return false; + } else { + String s = termAtt.toString(); + ParsePosition parsePosition = new ParsePosition(0); + Number result = numberFormat.parse(s, parsePosition); + if (parsePosition.getIndex() > 0) { + // zehn-tausend -> zehntausend + // one hundred thousand -> onehundredthousand + s = numberFormat.format(result).replaceAll("[\u00AD\u0020]",""); + } + termAtt.setEmpty().append(s); + return true; + } + } +} diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuAnalysisBinderProcessor.java b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuAnalysisBinderProcessor.java index 5c818408..5cf1a235 100644 --- a/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuAnalysisBinderProcessor.java +++ b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuAnalysisBinderProcessor.java @@ -43,6 +43,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("icu_numberformat", IcuNumberFormatTokenFilterFactory.class); } @Override diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java new file mode 100644 index 00000000..94da30b5 --- /dev/null +++ b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016 Jörg Prante + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published + * by the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, see http://www.gnu.org/licenses + * or write to the Free Software Foundation, Inc., 51 Franklin Street, + * Fifth Floor, Boston, MA 02110-1301 USA. + * + * The interactive user interfaces in modified source and object code + * versions of this program must display Appropriate Legal Notices, + * as required under Section 5 of the GNU Affero General Public License. + * + */ +package org.xbib.elasticsearch.index.analysis.icu; + +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RuleBasedNumberFormat; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.settings.IndexSettingsService; + +import java.util.Locale; + +public class IcuNumberFormatTokenFilterFactory extends AbstractTokenFilterFactory { + + private final NumberFormat numberFormat; + + @Inject + public IcuNumberFormatTokenFilterFactory(Index index, + IndexSettingsService indexSettingsService, + @Assisted String name, + @Assisted Settings settings) { + super(index, indexSettingsService.indexSettings(), name, settings); + Locale locale = settings.get("locale") != null ? new Locale(settings.get("locale")) : Locale.getDefault(); + String formatStr = settings.get("format", "SPELLOUT"); + int format; + switch (formatStr.toUpperCase()) { + case "SPELLOUT" : + format = RuleBasedNumberFormat.SPELLOUT; + break; + case "DURATION" : + format = RuleBasedNumberFormat.DURATION; + break; + case "NUMBERING_SYSTEM" : + format = RuleBasedNumberFormat.NUMBERING_SYSTEM; + break; + case "NUMBERSTYLE" : + format = RuleBasedNumberFormat.NUMBERSTYLE; + break; + case "ORDINAL" : + format = RuleBasedNumberFormat.ORDINAL; + break; + default: format = RuleBasedNumberFormat.SPELLOUT; + break; + } + RuleBasedNumberFormat ruleBasedNumberFormat = new RuleBasedNumberFormat(locale, format); + // RBNF parsing is incredibly slow when lenient is enabled but the only method to parse compound number words + ruleBasedNumberFormat.setLenientParseMode(settings.getAsBoolean("lenient", true)); + ruleBasedNumberFormat.setGroupingUsed(settings.getAsBoolean("grouping", true)); + this.numberFormat = ruleBasedNumberFormat; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new IcuNumberFormatTokenFilter(tokenStream, numberFormat); + } +} \ No newline at end of file diff --git a/src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java b/src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java new file mode 100644 index 00000000..afd30a03 --- /dev/null +++ b/src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java @@ -0,0 +1,82 @@ +package org.xbib.elasticsearch.index.analysis.icu; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.junit.Assert; +import org.junit.Test; +import org.xbib.elasticsearch.MapperTestUtils; + +import java.io.IOException; +import java.io.StringReader; + +public class IcuNumberFormatTests extends Assert { + + @Test + public void testGermanNumberFormat() throws IOException { + + String source = "Muss Rudi Völler fünftausend oder 10000 EUR Strafe zahlen?"; + + String[] expected = { + "Muss", + "Rudi", + "Völler", + "fünftausend", + "oder", + "zehntausend", + "EUR", + "Strafe", + "zahlen" + }; + AnalysisService analysisService = + MapperTestUtils.analysisService("/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json"); + Tokenizer tokenizer = analysisService.tokenizer("my_tokenizer").create(); + tokenizer.setReader(new StringReader(source)); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("spellout_de"); + TokenStream tokenStream = tokenFilter.create(tokenizer); + assertSimpleTSOutput(tokenStream, expected); + } + + @Test + public void testAmericanEnglish() throws IOException { + + String source = "You will never get 100,000 US dollars of salary per year."; + + String[] expected = { + "You", + "will", + "never", + "get", + "onehundredthousand", + "US", + "dollars", + "of", + "salary", + "per", + "year" + }; + AnalysisService analysisService = + MapperTestUtils.analysisService("/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json"); + Tokenizer tokenizer = analysisService.tokenizer("my_tokenizer").create(); + tokenizer.setReader(new StringReader(source)); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("spellout_en"); + TokenStream tokenStream = tokenFilter.create(tokenizer); + assertSimpleTSOutput(tokenStream, expected); + } + + private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { + stream.reset(); + CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); + assertNotNull(termAttr); + int i = 0; + while (stream.incrementToken()) { + assertTrue(i < expected.length); + assertEquals(expected[i], termAttr.toString()); + i++; + } + assertEquals(i, expected.length); + stream.close(); + } +} diff --git a/src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json b/src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json new file mode 100644 index 00000000..56f2257d --- /dev/null +++ b/src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json @@ -0,0 +1,24 @@ +{ + "index":{ + "analysis":{ + "filter" : { + "spellout_de" : { + "type" : "icu_numberformat", + "locale" : "de", + "format" : "spellout" + }, + "spellout_en" : { + "type" : "icu_numberformat", + "locale" : "en_US", + "format" : "spellout" + } + }, + "tokenizer" : { + "my_tokenizer" : { + "type" : "icu_tokenizer", + "filter" : "spellout_de" + } + } + } + } +} \ No newline at end of file