From 76a44a40de7fac7ede0fc15bf853cb72d02deb16 Mon Sep 17 00:00:00 2001 From: ahmetaa Date: Fri, 23 Nov 2018 00:17:21 +0300 Subject: [PATCH] ZemberekSpellCheck -> ZemberekSpellChecker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TurkishLinguist ve DummyTurkishLinguist silindi. Fix #7 Informal Analysis mekanizmasının eklentiye eklenmesi Fix #18 Tek başına yazılan harfler doğru kabul edilmeli. --- .../spellchecker/DummyTurkishLinguist.java | 26 ------ .../tools/spellchecker/TurkishLinguist.java | 10 --- .../spellchecker/TurkishSpellChecker.java | 11 ++- ...llCheck.java => ZemberekSpellChecker.java} | 89 ++++++++++++++++--- 4 files changed, 84 insertions(+), 52 deletions(-) delete mode 100644 libreoffice-tr-tools/src/lo/tr/tools/spellchecker/DummyTurkishLinguist.java delete mode 100644 libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishLinguist.java rename libreoffice-tr-tools/src/lo/tr/tools/spellchecker/{ZemberekSpellCheck.java => ZemberekSpellChecker.java} (52%) diff --git a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/DummyTurkishLinguist.java b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/DummyTurkishLinguist.java deleted file mode 100644 index 6d8d4c3..0000000 --- a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/DummyTurkishLinguist.java +++ /dev/null @@ -1,26 +0,0 @@ -package lo.tr.tools.spellchecker; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -class DummyTurkishLinguist implements TurkishLinguist { - - private Map> spellMap = new HashMap<>(); - - DummyTurkishLinguist() { - spellMap.put("yanlız", Arrays.asList("yalnız", "yanlı")); - spellMap.put("mrb", Arrays.asList("merhaba", "maraba")); - spellMap.put("keske", Arrays.asList("keşke", "keski", "keşkek")); - } - - public List getSuggestions(String s) { - return spellMap.containsKey(s) ? spellMap.get(s) : new ArrayList<>(0); - } - - public boolean isCorrect(String w) { - return !spellMap.containsKey(w); - } -} diff --git a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishLinguist.java b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishLinguist.java deleted file mode 100644 index f1ea68a..0000000 --- a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishLinguist.java +++ /dev/null @@ -1,10 +0,0 @@ -package lo.tr.tools.spellchecker; - -import java.util.List; - -public interface TurkishLinguist { - - boolean isCorrect(String w); - - List getSuggestions(String s); -} diff --git a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishSpellChecker.java b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishSpellChecker.java index a76686b..16e635a 100644 --- a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishSpellChecker.java +++ b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/TurkishSpellChecker.java @@ -5,6 +5,7 @@ import com.sun.star.beans.XPropertySet; import com.sun.star.lang.IllegalArgumentException; import com.sun.star.lang.Locale; +import com.sun.star.lang.XEventListener; import com.sun.star.lang.XInitialization; import com.sun.star.lang.XMultiServiceFactory; import com.sun.star.lang.XServiceDisplayName; @@ -36,6 +37,12 @@ public class TurkishSpellChecker extends ComponentBase implements XServiceDisplayName, XServiceInfo { + @Override + public void addEventListener(XEventListener xEventListener) { + System.out.println(xEventListener); + super.addEventListener(xEventListener); + } + static final String[] EMPTY_STRING_ARRAY = new String[0]; private static String serviceName = TurkishSpellChecker.class.getName(); private static Locale turkishLocale = new Locale("tr", "TR", ""); @@ -46,8 +53,8 @@ public class TurkishSpellChecker extends ComponentBase implements "lo.tr.tools.spellchecker.TurkishSpellChecker" }; - private static ZemberekSpellCheck spellChecker = - ZemberekSpellCheck.getInstance(); + private static ZemberekSpellChecker spellChecker = + ZemberekSpellChecker.getInstance(); PropChgHelperSpell propertyChangeHelper; ArrayList eventListeners; diff --git a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellCheck.java b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellChecker.java similarity index 52% rename from libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellCheck.java rename to libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellChecker.java index de91b45..0a16e91 100644 --- a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellCheck.java +++ b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellChecker.java @@ -3,6 +3,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.LinkedHashSet; import java.util.List; import java.util.stream.Collectors; import zemberek.core.ScoredItem; @@ -10,36 +11,57 @@ import zemberek.lm.LmVocabulary; import zemberek.lm.NgramLanguageModel; import zemberek.morphology.TurkishMorphology; +import zemberek.morphology.analysis.InformalAnalysisConverter; +import zemberek.morphology.analysis.SingleAnalysis; +import zemberek.morphology.analysis.WordAnalysis; +import zemberek.morphology.analysis.WordAnalysisSurfaceFormatter; +import zemberek.morphology.analysis.WordAnalysisSurfaceFormatter.CaseType; +import zemberek.morphology.generator.WordGenerator; +import zemberek.morphology.lexicon.RootLexicon; import zemberek.normalization.TurkishSpellChecker; -public class ZemberekSpellCheck implements TurkishLinguist { +public class ZemberekSpellChecker { - public static ZemberekSpellCheck instance = new ZemberekSpellCheck(); + public static ZemberekSpellChecker instance = new ZemberekSpellChecker(); private TurkishMorphology morphology; private TurkishSpellChecker spellChecker; private NgramLanguageModel uniGramLanguageModel; + private InformalAnalysisConverter informalConverter; - private ZemberekSpellCheck() { - this.morphology = TurkishMorphology.createWithDefaults(); + private ZemberekSpellChecker() { + this.morphology = TurkishMorphology.builder() + .setLexicon(RootLexicon.getDefault()) + .useInformalAnalysis().build(); try { this.spellChecker = new TurkishSpellChecker(morphology); // add a predicate to the spell checker // so that informal or out of official Turkish dictionary words are not allowed. this.spellChecker.setAnalysisPredicate( - a -> !a.getDictionaryItem().hasAnyAttribute(RootAttribute.Ext, RootAttribute.Informal)); + a -> !a.getDictionaryItem() + .hasAnyAttribute(RootAttribute.Ext, RootAttribute.Informal) + && !a.containsInformalMorpheme()); this.uniGramLanguageModel = spellChecker.getUnigramLanguageModel(); - + this.informalConverter = new InformalAnalysisConverter(morphology.getWordGenerator()); } catch (IOException e) { e.printStackTrace(); } } - public static ZemberekSpellCheck getInstance() { + public static ZemberekSpellChecker getInstance() { return instance; } public boolean isCorrect(String w) { + + if (w == null || w.isEmpty()) { + return true; + } + + if (w.length() == 1) { + return true; + } + String input = removePunctuation(w); int indexOfDash = input.indexOf("-"); if (indexOfDash != -1) { @@ -54,14 +76,54 @@ public boolean isCorrect(String w) { public List getSuggestions(String s) { - List suggestions = new ArrayList<>(); - suggestions.addAll(splitWordSuggestions(s)); - suggestions.addAll(spellChecker.suggestForWord(removePunctuation(s))); - if (suggestions.size() > 7) { - return suggestions.subList(0, 7); + LinkedHashSet suggestions = new LinkedHashSet<>(splitWordSuggestions(s)); + String word = removePunctuation(s); + suggestions.addAll(informalWordSuggestions(word)); + suggestions.addAll(spellChecker.suggestForWord(word)); + + List result = new ArrayList<>(suggestions); + if (result.size() > 9) { + + return result.subList(0, 9); + } + return result; + } + + private static WordAnalysisSurfaceFormatter formatter = new WordAnalysisSurfaceFormatter(); + + private List informalWordSuggestions(String s) { + + CaseType caseType = formatter.guessCase(s); + + WordAnalysis a = morphology.analyze(s); + if (a.analysisCount() == 0) { + return Collections.emptyList(); } + List result = new ArrayList<>(1); + for (SingleAnalysis analysis : a) { + if (analysis.containsInformalMorpheme()) { + WordGenerator.Result res = informalConverter.convert(s, analysis); + String apostrophe = getApostrophe(s); + + if (formatter.canBeFormatted(analysis, caseType)) { + String formatted = formatter.formatToCase(res.analysis, caseType, apostrophe); + result.add(formatted); + } else { + result.add(res.surface); + } + } + } + return result; + } - return suggestions; + private String getApostrophe(String input) { + String apostrophe; + if (input.indexOf('’') > 0) { + apostrophe = "’"; + } else { + apostrophe = "'"; + } + return apostrophe; } private String removePunctuation(String s) { @@ -106,5 +168,4 @@ private List splitWordSuggestions(String s) { return suggestions.stream().map(a -> a.item).collect(Collectors.toList()); } - }