-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding rule-based number formatter as token filter
- Loading branch information
Showing
7 changed files
with
251 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
61 changes: 61 additions & 0 deletions
61
src/main/java/org/xbib/elasticsearch/index/analysis/icu/ICUNumberFormatTokenFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
/* | ||
* Copyright (C) 2016 Jörg Prante | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License as published | ||
* by the Free Software Foundation; either version 3 of the License, or | ||
* (at your option) any later version. | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License | ||
* along with this program; if not, see http://www.gnu.org/licenses | ||
* or write to the Free Software Foundation, Inc., 51 Franklin Street, | ||
* Fifth Floor, Boston, MA 02110-1301 USA. | ||
* | ||
* The interactive user interfaces in modified source and object code | ||
* versions of this program must display Appropriate Legal Notices, | ||
* as required under Section 5 of the GNU Affero General Public License. | ||
* | ||
*/ | ||
package org.xbib.elasticsearch.index.analysis.icu; | ||
|
||
import com.ibm.icu.text.NumberFormat; | ||
import org.apache.lucene.analysis.TokenFilter; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
|
||
import java.io.IOException; | ||
import java.text.ParsePosition; | ||
|
||
public final class IcuNumberFormatTokenFilter extends TokenFilter { | ||
|
||
private final NumberFormat numberFormat; | ||
|
||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
|
||
public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) { | ||
super(input); | ||
this.numberFormat = numberFormat; | ||
} | ||
|
||
@Override | ||
public final boolean incrementToken() throws IOException { | ||
if (!input.incrementToken()) { | ||
return false; | ||
} else { | ||
String s = termAtt.toString(); | ||
ParsePosition parsePosition = new ParsePosition(0); | ||
Number result = numberFormat.parse(s, parsePosition); | ||
if (parsePosition.getIndex() > 0) { | ||
// zehn-tausend -> zehntausend | ||
// one hundred thousand -> onehundredthousand | ||
s = numberFormat.format(result).replaceAll("[\u00AD\u0020]",""); | ||
} | ||
termAtt.setEmpty().append(s); | ||
return true; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
80 changes: 80 additions & 0 deletions
80
...ain/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
/* | ||
* Copyright (C) 2016 Jörg Prante | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License as published | ||
* by the Free Software Foundation; either version 3 of the License, or | ||
* (at your option) any later version. | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License | ||
* along with this program; if not, see http://www.gnu.org/licenses | ||
* or write to the Free Software Foundation, Inc., 51 Franklin Street, | ||
* Fifth Floor, Boston, MA 02110-1301 USA. | ||
* | ||
* The interactive user interfaces in modified source and object code | ||
* versions of this program must display Appropriate Legal Notices, | ||
* as required under Section 5 of the GNU Affero General Public License. | ||
* | ||
*/ | ||
package org.xbib.elasticsearch.index.analysis.icu; | ||
|
||
import com.ibm.icu.text.NumberFormat; | ||
import com.ibm.icu.text.RuleBasedNumberFormat; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.inject.assistedinject.Assisted; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.Index; | ||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; | ||
import org.elasticsearch.index.settings.IndexSettingsService; | ||
|
||
import java.util.Locale; | ||
|
||
public class IcuNumberFormatTokenFilterFactory extends AbstractTokenFilterFactory { | ||
|
||
private final NumberFormat numberFormat; | ||
|
||
@Inject | ||
public IcuNumberFormatTokenFilterFactory(Index index, | ||
IndexSettingsService indexSettingsService, | ||
@Assisted String name, | ||
@Assisted Settings settings) { | ||
super(index, indexSettingsService.indexSettings(), name, settings); | ||
Locale locale = settings.get("locale") != null ? new Locale(settings.get("locale")) : Locale.getDefault(); | ||
String formatStr = settings.get("format", "SPELLOUT"); | ||
int format; | ||
switch (formatStr.toUpperCase()) { | ||
case "SPELLOUT" : | ||
format = RuleBasedNumberFormat.SPELLOUT; | ||
break; | ||
case "DURATION" : | ||
format = RuleBasedNumberFormat.DURATION; | ||
break; | ||
case "NUMBERING_SYSTEM" : | ||
format = RuleBasedNumberFormat.NUMBERING_SYSTEM; | ||
break; | ||
case "NUMBERSTYLE" : | ||
format = RuleBasedNumberFormat.NUMBERSTYLE; | ||
break; | ||
case "ORDINAL" : | ||
format = RuleBasedNumberFormat.ORDINAL; | ||
break; | ||
default: format = RuleBasedNumberFormat.SPELLOUT; | ||
break; | ||
} | ||
RuleBasedNumberFormat ruleBasedNumberFormat = new RuleBasedNumberFormat(locale, format); | ||
// RBNF parsing is incredibly slow when lenient is enabled but the only method to parse compound number words | ||
ruleBasedNumberFormat.setLenientParseMode(settings.getAsBoolean("lenient", true)); | ||
ruleBasedNumberFormat.setGroupingUsed(settings.getAsBoolean("grouping", true)); | ||
this.numberFormat = ruleBasedNumberFormat; | ||
} | ||
|
||
@Override | ||
public TokenStream create(TokenStream tokenStream) { | ||
return new IcuNumberFormatTokenFilter(tokenStream, numberFormat); | ||
} | ||
} |
82 changes: 82 additions & 0 deletions
82
src/test/java/org/xbib/elasticsearch/index/analysis/icu/IcuNumberFormatTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package org.xbib.elasticsearch.index.analysis.icu; | ||
|
||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.elasticsearch.index.analysis.AnalysisService; | ||
import org.elasticsearch.index.analysis.TokenFilterFactory; | ||
import org.junit.Assert; | ||
import org.junit.Test; | ||
import org.xbib.elasticsearch.MapperTestUtils; | ||
|
||
import java.io.IOException; | ||
import java.io.StringReader; | ||
|
||
public class IcuNumberFormatTests extends Assert { | ||
|
||
@Test | ||
public void testGermanNumberFormat() throws IOException { | ||
|
||
String source = "Muss Rudi Völler fünftausend oder 10000 EUR Strafe zahlen?"; | ||
|
||
String[] expected = { | ||
"Muss", | ||
"Rudi", | ||
"Völler", | ||
"fünftausend", | ||
"oder", | ||
"zehntausend", | ||
"EUR", | ||
"Strafe", | ||
"zahlen" | ||
}; | ||
AnalysisService analysisService = | ||
MapperTestUtils.analysisService("/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json"); | ||
Tokenizer tokenizer = analysisService.tokenizer("my_tokenizer").create(); | ||
tokenizer.setReader(new StringReader(source)); | ||
TokenFilterFactory tokenFilter = analysisService.tokenFilter("spellout_de"); | ||
TokenStream tokenStream = tokenFilter.create(tokenizer); | ||
assertSimpleTSOutput(tokenStream, expected); | ||
} | ||
|
||
@Test | ||
public void testAmericanEnglish() throws IOException { | ||
|
||
String source = "You will never get 100,000 US dollars of salary per year."; | ||
|
||
String[] expected = { | ||
"You", | ||
"will", | ||
"never", | ||
"get", | ||
"onehundredthousand", | ||
"US", | ||
"dollars", | ||
"of", | ||
"salary", | ||
"per", | ||
"year" | ||
}; | ||
AnalysisService analysisService = | ||
MapperTestUtils.analysisService("/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json"); | ||
Tokenizer tokenizer = analysisService.tokenizer("my_tokenizer").create(); | ||
tokenizer.setReader(new StringReader(source)); | ||
TokenFilterFactory tokenFilter = analysisService.tokenFilter("spellout_en"); | ||
TokenStream tokenStream = tokenFilter.create(tokenizer); | ||
assertSimpleTSOutput(tokenStream, expected); | ||
} | ||
|
||
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { | ||
stream.reset(); | ||
CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); | ||
assertNotNull(termAttr); | ||
int i = 0; | ||
while (stream.incrementToken()) { | ||
assertTrue(i < expected.length); | ||
assertEquals(expected[i], termAttr.toString()); | ||
i++; | ||
} | ||
assertEquals(i, expected.length); | ||
stream.close(); | ||
} | ||
} |
24 changes: 24 additions & 0 deletions
24
src/test/resources/org/xbib/elasticsearch/index/analysis/icu/icu_numberformat.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"index":{ | ||
"analysis":{ | ||
"filter" : { | ||
"spellout_de" : { | ||
"type" : "icu_numberformat", | ||
"locale" : "de", | ||
"format" : "spellout" | ||
}, | ||
"spellout_en" : { | ||
"type" : "icu_numberformat", | ||
"locale" : "en_US", | ||
"format" : "spellout" | ||
} | ||
}, | ||
"tokenizer" : { | ||
"my_tokenizer" : { | ||
"type" : "icu_tokenizer", | ||
"filter" : "spellout_de" | ||
} | ||
} | ||
} | ||
} | ||
} |