Bitişik kelimelerin ayrılmasında birden fazla öneri olması durumunda …

…sıralama için tek kelimelik dil modeli olasılıklarını kullandım. Ancak bu çok doğru çalışmıyor. Normalde 2 kelimelik dil modeli olasılıklarının kullanılması gerekirdi. Şu an için bu mümkün olmadığından bu şekilde idare edebiliriz.
COMU · Nov 7, 2018 · bb167db · bb167db
1 parent 9e89456
commit bb167db
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 11 deletions.
diff --git a/libreoffice-tr-tools/lib/runtime/zemberek-lo.jar b/libreoffice-tr-tools/lib/runtime/zemberek-lo.jar
diff --git a/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellCheck.java b/libreoffice-tr-tools/src/lo/tr/tools/spellchecker/ZemberekSpellCheck.java
@@ -1,8 +1,14 @@
 package lo.tr.tools.spellchecker;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
+import zemberek.core.ScoredItem;
 import zemberek.core.turkish.RootAttribute;
+import zemberek.lm.LmVocabulary;
+import zemberek.lm.NgramLanguageModel;
 import zemberek.morphology.TurkishMorphology;
 import zemberek.normalization.TurkishSpellChecker;
 
@@ -12,6 +18,7 @@ public class ZemberekSpellCheck implements TurkishLinguist {
 
   private TurkishMorphology morphology;
   private TurkishSpellChecker spellChecker;
+  private NgramLanguageModel uniGramLanguageModel;
 
   private ZemberekSpellCheck() {
     this.morphology = TurkishMorphology.createWithDefaults();
@@ -21,6 +28,8 @@ private ZemberekSpellCheck() {
       // so that informal or out of official Turkish dictionary words are not allowed.
       this.spellChecker.setAnalysisPredicate(
           a -> !a.getDictionaryItem().hasAnyAttribute(RootAttribute.Ext, RootAttribute.Informal));
+      this.uniGramLanguageModel = spellChecker.getUnigramLanguageModel();
+
     } catch (IOException e) {
       e.printStackTrace();
     }
@@ -31,33 +40,69 @@ public static ZemberekSpellCheck getInstance() {
   }
 
   public boolean isCorrect(String w) {
-    int indexOfDash = w.indexOf("-");
+    String input = removePunctuation(w); // strip trailing punctuation once; every check below uses this cleaned form
+    int indexOfDash = input.indexOf("-"); // index of the first dash, or -1 when there is none
     if (indexOfDash != -1) {
-      String w1 = w.substring(0, indexOfDash);
-      String w2 = w.substring(indexOfDash + 1);
+      String w1 = input.substring(0, indexOfDash); // part before the first dash
+      String w2 = input.substring(indexOfDash + 1); // everything after the first dash
       if (spellChecker.check(w1) && spellChecker.check(w2)) {
         return true;
       }
     }
-    return spellChecker.check(removePunctuation(w));
+    return spellChecker.check(input); // otherwise check the cleaned input as a single word
   }
 
   public List<String> getSuggestions(String s) {
+    // Spell-checker suggestions for the punctuation-stripped input come first; split-word candidates are appended.
     List<String> suggestions = spellChecker.suggestForWord(removePunctuation(s));
-    for (int i = 1; i <= s.length() - 2; i++) {
-      String s1 = s.substring(0, i);
-      String s2 = s.substring(i);
-      if (isCorrect(s1) && isCorrect(s2)) {
-        suggestions.add(0, s1 + " " + s2);
-      }
-    }
     if (suggestions.size() > 7) {
       return suggestions.subList(0, 7);
     }
+    suggestions.addAll(splitWordSuggestions(s)); // NOTE(review): not reached when the early return above fires (more than 7 suggestions); also receives raw s, not the punctuation-stripped form
     return suggestions;
   }
 
   private String removePunctuation(String s) {
     return s.replaceAll("\\p{Punct}+$", "");
   }
+
+  private List<String> splitWordSuggestions(String s) {
+    // Proposes up to three "left right" splits of s whose halves both pass isCorrect, best score first.
+    // Inputs shorter than 3 or longer than 25 characters are not split at all.
+    if (s.length() < 3 || s.length() > 25) {
+      return Collections.emptyList();
+    }
+
+    // Try every split point by brute force and rank the valid splits by uni-gram model scores.
+    // A higher order language model would be the correct tool for ranking,
+    // but it is not available here.
+    List<ScoredItem<String>> suggestions = new ArrayList<>(3);
+    LmVocabulary vocabulary = uniGramLanguageModel.getVocabulary();
+    // i stops at length - 2, so the right half always keeps at least two characters.
+    for (int i = 1; i < s.length() - 1; i++) {
+      String s1 = s.substring(0, i);
+      String s2 = s.substring(i);
+      if (isCorrect(s1) && isCorrect(s2)) {
+        float p1 = uniGramLanguageModel.getProbability(vocabulary.indexOf(s1));
+        float p2 = uniGramLanguageModel.getProbability(vocabulary.indexOf(s2));
+        suggestions.add(new ScoredItem<>(s1 + " " + s2, p1 + p2)); // score is the sum of both halves' values (presumably log-probabilities; confirm)
+      }
+    }
+
+    if (suggestions.size() == 0) {
+      return Collections.emptyList();
+    }
+
+    // Sort by score in descending order so the highest scored split comes first.
+    suggestions.sort((a, b) -> Float.compare(b.score, a.score));
+
+    // Keep only the top 3.
+    if (suggestions.size() > 3) {
+      suggestions = suggestions.subList(0, 3);
+    }
+    // Drop the scores and return only the suggestion strings.
+    return suggestions.stream().map(a -> a.item).collect(Collectors.toList());
+  }
+
+
 }