Skip to content

Commit

Permalink
adding rule-based number formatter as token filter
Browse files Browse the repository at this point in the history
  • Loading branch information
jprante committed Feb 22, 2016
1 parent 4290c54 commit 6ff3a41
Show file tree
Hide file tree
Showing 7 changed files with 251 additions and 7 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ A plugin that consists of a compilation of useful Elasticsearch plugins related

| Elasticsearch version | Plugin | Release date |
| ------------------------ | ------------- | -------------|
| 2.2.0 | 2.2.0.1 | Feb 22, 2016 |
| 2.2.0 | 2.2.0.0 | Feb 8, 2016 |
| 2.1.1 | 2.1.1.2 | Dec 30, 2015 |
| 2.1.1 | 2.1.1.0 | Dec 21, 2015 |
Expand All @@ -49,7 +50,7 @@ A plugin that consists of a compilation of useful Elasticsearch plugins related

### Elasticsearch 2.x

./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-plugin-bundle/2.2.0.0/elasticsearch-plugin-bundle-2.2.0.0-plugin.zip
./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-plugin-bundle/2.2.0.1/elasticsearch-plugin-bundle-2.2.0.1-plugin.zip

### Elasticsearch 1.x

Expand Down
7 changes: 1 addition & 6 deletions build.gradle
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

def xbibGroup = 'org.xbib.elasticsearch.plugin'
def xbibVersion = '2.2.0.0'
def xbibVersion = '2.2.0.1'

group = xbibGroup
version = xbibVersion
Expand Down Expand Up @@ -84,11 +84,6 @@ dependencies {
wagon 'org.apache.maven.wagon:wagon-ssh-external:2.10'
}

compileJava {
sourceCompatibility = 1.7
targetCompatibility = 1.7
}

tasks.withType(JavaCompile) {
options.compilerArgs << "-Xlint:unchecked" << "-Xlint:deprecation"
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Copyright (C) 2016 Jörg Prante
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses
* or write to the Free Software Foundation, Inc., 51 Franklin Street,
* Fifth Floor, Boston, MA 02110-1301 USA.
*
* The interactive user interfaces in modified source and object code
* versions of this program must display Appropriate Legal Notices,
* as required under Section 5 of the GNU Affero General Public License.
*
*/
package org.xbib.elasticsearch.index.analysis.icu;

import com.ibm.icu.text.NumberFormat;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.text.ParsePosition;
import java.util.regex.Pattern;

/**
 * Token filter that passes every term through an ICU {@link NumberFormat}.
 *
 * <p>If the format can parse a leading part of the term as a number, the term is
 * replaced by the formatted number with all soft hyphens (U+00AD) and spaces
 * (U+0020) removed, e.g. {@code zehn-tausend -> zehntausend} or
 * {@code one hundred thousand -> onehundredthousand}. Terms that cannot be
 * parsed are passed through unchanged.
 *
 * <p>NOTE(review): the supplied format is used as-is and is not copied or
 * synchronized here; ICU documents number formats as not synchronized, so
 * confirm that callers do not share one instance between threads.
 */
public final class IcuNumberFormatTokenFilter extends TokenFilter {

    /** Soft hyphen and space, the separators stripped from the formatted number. */
    private static final Pattern SEPARATORS = Pattern.compile("[\u00AD\u0020]");

    private final NumberFormat numberFormat;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * @param input        the token stream to filter
     * @param numberFormat the format used both to parse a term and to format the parsed number
     */
    public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) {
        super(input);
        this.numberFormat = numberFormat;
    }

    /**
     * Advances the wrapped stream and rewrites the current term if it parses as a number.
     *
     * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        String term = termAtt.toString();
        ParsePosition parsePosition = new ParsePosition(0);
        Number number = numberFormat.parse(term, parsePosition);
        // An index that moved past 0 means at least a prefix of the term was parsed.
        // NOTE(review): a partially parsed term is still replaced as a whole, which drops
        // the unparsed rest; confirm whether full consumption should be required.
        if (parsePosition.getIndex() > 0) {
            // zehn-tausend -> zehntausend
            // one hundred thousand -> onehundredthousand
            String spelled = SEPARATORS.matcher(numberFormat.format(number)).replaceAll("");
            termAtt.setEmpty().append(spelled);
        }
        // Unparsed terms are left untouched; there is nothing to rewrite.
        return true;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("icu_normalizer", IcuNormalizerTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("icu_folding", IcuFoldingTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("icu_transform", IcuTransformTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("icu_numberformat", IcuNumberFormatTokenFilterFactory.class);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (C) 2016 Jörg Prante
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses
* or write to the Free Software Foundation, Inc., 51 Franklin Street,
* Fifth Floor, Boston, MA 02110-1301 USA.
*
* The interactive user interfaces in modified source and object code
* versions of this program must display Appropriate Legal Notices,
* as required under Section 5 of the GNU Affero General Public License.
*
*/
package org.xbib.elasticsearch.index.analysis.icu;

import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.RuleBasedNumberFormat;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.settings.IndexSettingsService;

import java.util.Locale;

/**
 * Factory for {@link IcuNumberFormatTokenFilter}, configured from the filter settings.
 *
 * <p>Recognized settings:
 * <ul>
 * <li>{@code locale} - locale name such as {@code de}, {@code en_US} or {@code en-US};
 *     the JVM default locale is used when absent</li>
 * <li>{@code format} - one of {@code SPELLOUT}, {@code DURATION}, {@code NUMBERING_SYSTEM},
 *     {@code NUMBERSTYLE}, {@code ORDINAL} (case-insensitive); anything else, and the
 *     default, is {@code SPELLOUT}</li>
 * <li>{@code lenient} - lenient parse mode, default {@code true}</li>
 * <li>{@code grouping} - grouping used, default {@code true}</li>
 * </ul>
 */
public class IcuNumberFormatTokenFilterFactory extends AbstractTokenFilterFactory {

    private final Locale locale;

    private final int format;

    private final boolean lenient;

    private final boolean grouping;

    @Inject
    public IcuNumberFormatTokenFilterFactory(Index index,
                                             IndexSettingsService indexSettingsService,
                                             @Assisted String name,
                                             @Assisted Settings settings) {
        super(index, indexSettingsService.indexSettings(), name, settings);
        this.locale = parseLocale(settings.get("locale"));
        this.format = parseFormat(settings.get("format", "SPELLOUT"));
        this.lenient = settings.getAsBoolean("lenient", true);
        this.grouping = settings.getAsBoolean("grouping", true);
    }

    /**
     * Wraps the given stream in a number format filter.
     *
     * <p>Every call gets its own formatter: ICU number formats are documented as not
     * synchronized, so one instance must not be shared by token streams that may be
     * consumed on different threads.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new IcuNumberFormatTokenFilter(tokenStream, newNumberFormat());
    }

    /** Builds a fresh rule-based formatter from the configured settings. */
    private NumberFormat newNumberFormat() {
        RuleBasedNumberFormat ruleBasedNumberFormat = new RuleBasedNumberFormat(locale, format);
        // Original author's note: RBNF parsing is incredibly slow when lenient mode is enabled,
        // but lenient mode is the only way to parse compound number words.
        ruleBasedNumberFormat.setLenientParseMode(lenient);
        ruleBasedNumberFormat.setGroupingUsed(grouping);
        return ruleBasedNumberFormat;
    }

    /**
     * Resolves a locale name. Underscore-separated names ({@code en_US}) are accepted in
     * addition to language tags ({@code en-US}) so that language and country end up in
     * their own fields instead of the whole string being taken as the language.
     */
    private static Locale parseLocale(String localeName) {
        if (localeName == null) {
            return Locale.getDefault();
        }
        return Locale.forLanguageTag(localeName.replace('_', '-'));
    }

    /** Maps a format name to the matching {@link RuleBasedNumberFormat} constant. */
    private static int parseFormat(String formatName) {
        // Locale.ROOT keeps the upper-casing independent of the JVM default locale
        // (with a Turkish default, "duration" would otherwise not match "DURATION").
        switch (formatName.toUpperCase(Locale.ROOT)) {
            case "DURATION":
                return RuleBasedNumberFormat.DURATION;
            case "NUMBERING_SYSTEM":
                return RuleBasedNumberFormat.NUMBERING_SYSTEM;
            case "NUMBERSTYLE":
                return RuleBasedNumberFormat.NUMBERSTYLE;
            case "ORDINAL":
                return RuleBasedNumberFormat.ORDINAL;
            case "SPELLOUT":
            default:
                // unknown names fall back to spell-out, as before
                return RuleBasedNumberFormat.SPELLOUT;
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package org.xbib.elasticsearch.index.analysis.icu;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.junit.Assert;
import org.junit.Test;
import org.xbib.elasticsearch.MapperTestUtils;

import java.io.IOException;
import java.io.StringReader;

/**
 * Tests for the {@code icu_numberformat} token filter using the filters
 * {@code spellout_de} and {@code spellout_en} from {@code icu_numberformat.json}.
 */
public class IcuNumberFormatTests extends Assert {

    /** Classpath location of the analysis settings shared by all tests. */
    private static final String SETTINGS = "/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json";

    @Test
    public void testGermanNumberFormat() throws IOException {

        String source = "Muss Rudi Völler fünftausend oder 10000 EUR Strafe zahlen?";

        String[] expected = {
                "Muss",
                "Rudi",
                "Völler",
                "fünftausend",
                "oder",
                "zehntausend",
                "EUR",
                "Strafe",
                "zahlen"
        };
        assertSimpleTSOutput(analyze("spellout_de", source), expected);
    }

    @Test
    public void testAmericanEnglish() throws IOException {

        String source = "You will never get 100,000 US dollars of salary per year.";

        String[] expected = {
                "You",
                "will",
                "never",
                "get",
                "onehundredthousand",
                "US",
                "dollars",
                "of",
                "salary",
                "per",
                "year"
        };
        assertSimpleTSOutput(analyze("spellout_en", source), expected);
    }

    /**
     * Tokenizes {@code source} with {@code my_tokenizer} and wraps the result in the
     * named token filter.
     */
    private TokenStream analyze(String filterName, String source) throws IOException {
        AnalysisService analysisService = MapperTestUtils.analysisService(SETTINGS);
        Tokenizer tokenizer = analysisService.tokenizer("my_tokenizer").create();
        tokenizer.setReader(new StringReader(source));
        TokenFilterFactory tokenFilter = analysisService.tokenFilter(filterName);
        return tokenFilter.create(tokenizer);
    }

    /**
     * Consumes the stream and asserts that it yields exactly the expected terms in order.
     * The stream is always closed, also when an assertion fails.
     */
    private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        try {
            stream.reset();
            CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
            assertNotNull(termAttr);
            int i = 0;
            while (stream.incrementToken()) {
                assertTrue(i < expected.length);
                assertEquals(expected[i], termAttr.toString());
                i++;
            }
            // the consumer workflow requires end() once the stream is exhausted
            stream.end();
            assertEquals(expected.length, i);
        } finally {
            stream.close();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"index":{
"analysis":{
"filter" : {
"spellout_de" : {
"type" : "icu_numberformat",
"locale" : "de",
"format" : "spellout"
},
"spellout_en" : {
"type" : "icu_numberformat",
"locale" : "en_US",
"format" : "spellout"
}
},
"tokenizer" : {
"my_tokenizer" : {
"type" : "icu_tokenizer",
"filter" : "spellout_de"
}
}
}
}
}

0 comments on commit 6ff3a41

Please sign in to comment.