adding rule-based number formatter as token filter

jprante · Feb 22, 2016 · 6ff3a41 · 6ff3a41
1 parent 4290c54
commit 6ff3a41
Show file tree

Hide file tree

Showing 7 changed files with 251 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -28,6 +28,7 @@ A plugin that consists of a compilation of useful Elasticsearch plugins related
 
 | Elasticsearch version    | Plugin        | Release date |
 | ------------------------ | ------------- | -------------|
+| 2.2.0                    | 2.2.0.1       | Feb 22, 2016 |
 | 2.2.0                    | 2.2.0.0       | Feb  8, 2016 |
 | 2.1.1                    | 2.1.1.2       | Dec 30, 2015 |
 | 2.1.1                    | 2.1.1.0       | Dec 21, 2015 |
@@ -49,7 +50,7 @@ A plugin that consists of a compilation of useful Elasticsearch plugins related
 
 ### Elasticsearch 2.x
 
-    ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-plugin-bundle/2.2.0.0/elasticsearch-plugin-bundle-2.2.0.0-plugin.zip
+    ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-plugin-bundle/2.2.0.1/elasticsearch-plugin-bundle-2.2.0.1-plugin.zip
 
 ### Elasticsearch 1.x
 

diff --git a/build.gradle b/build.gradle
@@ -1,6 +1,6 @@
 
 def xbibGroup = 'org.xbib.elasticsearch.plugin'
-def xbibVersion = '2.2.0.0'
+def xbibVersion = '2.2.0.1'
 
 group = xbibGroup
 version = xbibVersion
@@ -84,11 +84,6 @@ dependencies {
     wagon 'org.apache.maven.wagon:wagon-ssh-external:2.10'
 }
 
-compileJava {
-    sourceCompatibility = 1.7
-    targetCompatibility = 1.7
-}
-
 tasks.withType(JavaCompile) {
     options.compilerArgs << "-Xlint:unchecked" << "-Xlint:deprecation"
 }

diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2016 Jörg Prante
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program; if not, see http://www.gnu.org/licenses
+ * or write to the Free Software Foundation, Inc., 51 Franklin Street,
+ * Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The interactive user interfaces in modified source and object code
+ * versions of this program must display Appropriate Legal Notices,
+ * as required under Section 5 of the GNU Affero General Public License.
+ *
+ */
+package org.xbib.elasticsearch.index.analysis.icu;
+
+import com.ibm.icu.text.NumberFormat;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.IOException;
+import java.text.ParsePosition;
+
+public final class IcuNumberFormatTokenFilter extends TokenFilter {
+
+    private final NumberFormat numberFormat;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) {
+        super(input);
+        this.numberFormat = numberFormat;
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+        if (!input.incrementToken()) {
+            return false;
+        } else {
+            String s = termAtt.toString();
+            ParsePosition parsePosition = new ParsePosition(0);
+            Number result = numberFormat.parse(s, parsePosition);
+            if (parsePosition.getIndex() > 0) {
+                // zehn-tausend -> zehntausend
+                // one hundred thousand -> onehundredthousand
+                s = numberFormat.format(result).replaceAll("[\u00AD\u0020]","");
+            }
+            termAtt.setEmpty().append(s);
+            return true;
+        }
+    }
+}
diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuAnalysisBinderProcessor.java b/src/main/java/org/xbib/elasticsearch/index/analysis/icu/IcuAnalysisBinderProcessor.java
@@ -43,6 +43,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
         tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class);
+        tokenFiltersBindings.processTokenFilter("icu_numberformat", IcuNumberFormatTokenFilterFactory.class);
     }
 
     @Override

diff --git a/...ain/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java b/...ain/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2016 Jörg Prante
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program; if not, see http://www.gnu.org/licenses
+ * or write to the Free Software Foundation, Inc., 51 Franklin Street,
+ * Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The interactive user interfaces in modified source and object code
+ * versions of this program must display Appropriate Legal Notices,
+ * as required under Section 5 of the GNU Affero General Public License.
+ *
+ */
+package org.xbib.elasticsearch.index.analysis.icu;
+
+import com.ibm.icu.text.NumberFormat;
+import com.ibm.icu.text.RuleBasedNumberFormat;
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.settings.IndexSettingsService;
+
+import java.util.Locale;
+
+/**
+ * Factory for {@link IcuNumberFormatTokenFilter} instances backed by an ICU
+ * {@link RuleBasedNumberFormat}.
+ * <p>
+ * Recognized settings:
+ * <ul>
+ * <li>{@code locale} - locale name such as {@code de} or {@code en_US}; defaults to the JVM
+ * default locale</li>
+ * <li>{@code format} - one of {@code SPELLOUT}, {@code DURATION}, {@code NUMBERING_SYSTEM},
+ * {@code NUMBERSTYLE}, {@code ORDINAL} (case-insensitive); unknown values fall back to
+ * {@code SPELLOUT}</li>
+ * <li>{@code lenient} - lenient parse mode, default {@code true}</li>
+ * <li>{@code grouping} - grouping used, default {@code true}</li>
+ * </ul>
+ */
+public class IcuNumberFormatTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    private final Locale locale;
+
+    private final int format;
+
+    private final boolean lenient;
+
+    private final boolean grouping;
+
+    @Inject
+    public IcuNumberFormatTokenFilterFactory(Index index,
+                                             IndexSettingsService indexSettingsService,
+                                             @Assisted String name,
+                                             @Assisted Settings settings) {
+        super(index, indexSettingsService.indexSettings(), name, settings);
+        this.locale = parseLocale(settings.get("locale"));
+        this.format = parseFormat(settings.get("format", "SPELLOUT"));
+        // Lenient RBNF parsing is very slow, but it is the only way to parse compound number words.
+        this.lenient = settings.getAsBoolean("lenient", true);
+        this.grouping = settings.getAsBoolean("grouping", true);
+    }
+
+    /**
+     * Wraps the given stream in a number format filter.
+     * <p>
+     * A fresh number format is built for every filter because ICU documents number formats as
+     * not synchronized; sharing one mutable instance across all token streams created by this
+     * factory would allow concurrent unsynchronized use.
+     *
+     * @param tokenStream the stream to wrap
+     * @return the filtering token stream
+     */
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new IcuNumberFormatTokenFilter(tokenStream, newNumberFormat());
+    }
+
+    /** Builds a new, unshared number format from the configured settings. */
+    private NumberFormat newNumberFormat() {
+        RuleBasedNumberFormat ruleBasedNumberFormat = new RuleBasedNumberFormat(locale, format);
+        ruleBasedNumberFormat.setLenientParseMode(lenient);
+        ruleBasedNumberFormat.setGroupingUsed(grouping);
+        return ruleBasedNumberFormat;
+    }
+
+    /**
+     * Resolves the {@code locale} setting. Underscore separated names such as {@code en_US} are
+     * accepted; the single-argument {@link Locale} constructor would treat the whole string as a
+     * language code and lose the region.
+     */
+    private static Locale parseLocale(String localeSetting) {
+        if (localeSetting == null) {
+            return Locale.getDefault();
+        }
+        return Locale.forLanguageTag(localeSetting.replace('_', '-'));
+    }
+
+    /**
+     * Maps the {@code format} setting to a {@link RuleBasedNumberFormat} format selector.
+     * The name is upper-cased with {@link Locale#ROOT} so that matching does not depend on the
+     * JVM default locale (for example the Turkish dotted capital I).
+     */
+    private static int parseFormat(String formatSetting) {
+        switch (formatSetting.toUpperCase(Locale.ROOT)) {
+            case "DURATION":
+                return RuleBasedNumberFormat.DURATION;
+            case "NUMBERING_SYSTEM":
+                return RuleBasedNumberFormat.NUMBERING_SYSTEM;
+            case "NUMBERSTYLE":
+                return RuleBasedNumberFormat.NUMBERSTYLE;
+            case "ORDINAL":
+                return RuleBasedNumberFormat.ORDINAL;
+            case "SPELLOUT":
+            default:
+                // Unknown names keep falling back to SPELLOUT, as before.
+                return RuleBasedNumberFormat.SPELLOUT;
+        }
+    }
+}
diff --git a/src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java b/src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java
@@ -0,0 +1,82 @@
+package org.xbib.elasticsearch.index.analysis.icu;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.index.analysis.AnalysisService;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.junit.Assert;
+import org.junit.Test;
+import org.xbib.elasticsearch.MapperTestUtils;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Tests for the {@code icu_numberformat} token filter using the analysis settings in
+ * {@code icu_numberformat.json}.
+ */
+public class IcuNumberFormatTests extends Assert {
+
+    /** Classpath location of the analysis settings shared by all tests. */
+    private static final String SETTINGS =
+            "/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json";
+
+    @Test
+    public void testGermanNumberFormat() throws IOException {
+
+        String source = "Muss Rudi Völler fünftausend oder 10000 EUR Strafe zahlen?";
+
+        String[] expected = {
+                "Muss",
+                "Rudi",
+                "Völler",
+                "fünftausend",
+                "oder",
+                "zehntausend",
+                "EUR",
+                "Strafe",
+                "zahlen"
+        };
+        assertFilterOutput("spellout_de", source, expected);
+    }
+
+    @Test
+    public void testAmericanEnglish() throws IOException {
+
+        String source = "You will never get 100,000 US dollars of salary per year.";
+
+        String[] expected = {
+                "You",
+                "will",
+                "never",
+                "get",
+                "onehundredthousand",
+                "US",
+                "dollars",
+                "of",
+                "salary",
+                "per",
+                "year"
+        };
+        assertFilterOutput("spellout_en", source, expected);
+    }
+
+    /**
+     * Tokenizes {@code source} with {@code my_tokenizer}, applies the named token filter and
+     * checks the resulting terms.
+     */
+    private void assertFilterOutput(String filterName, String source, String[] expected) throws IOException {
+        AnalysisService analysisService = MapperTestUtils.analysisService(SETTINGS);
+        Tokenizer tokenizer = analysisService.tokenizer("my_tokenizer").create();
+        tokenizer.setReader(new StringReader(source));
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter(filterName);
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    /**
+     * Consumes the stream following the reset / incrementToken / end / close workflow and
+     * asserts that it yields exactly the expected terms, in order.
+     */
+    private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
+        // try-with-resources closes the stream even when an assertion fails
+        try (TokenStream ts = stream) {
+            ts.reset();
+            CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
+            assertNotNull(termAttr);
+            int i = 0;
+            while (ts.incrementToken()) {
+                assertTrue("unexpected extra token: " + termAttr, i < expected.length);
+                assertEquals(expected[i], termAttr.toString());
+                i++;
+            }
+            ts.end();
+            // expected value first, so a failure message reports the counts the right way round
+            assertEquals(expected.length, i);
+        }
+    }
+}
diff --git a/src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json b/src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json
@@ -0,0 +1,24 @@
+{
+    "index":{
+        "analysis":{
+            "filter" : {
+                "spellout_de" : {
+                  "type" : "icu_numberformat",
+                  "locale" : "de",
+                  "format" : "spellout"
+                },
+                "spellout_en" : {
+                  "type" : "icu_numberformat",
+                  "locale" : "en_US",
+                  "format" : "spellout"
+                }
+            },
+            "tokenizer" : {
+                "my_tokenizer" : {
+                  "type" : "icu_tokenizer",
+                  "filter" : "spellout_de"
+                }
+            }
+        }
+    }
+}