Grammar packer now records max source phrase length, big simplification

joshua-decoder · Apr 11, 2015 · fb16123 · fb16123
1 parent 45f2a19
commit fb16123
Show file tree

Hide file tree

Showing 8 changed files with 62 additions and 34 deletions.
diff --git a/src/joshua/decoder/ff/tm/AbstractGrammar.java b/src/joshua/decoder/ff/tm/AbstractGrammar.java
@@ -44,6 +44,21 @@ public abstract class AbstractGrammar implements Grammar {
    */
   protected int owner = -1;
 
+  /**
+   * Maximum source-side phrase length (-1 until set). Mostly used by the phrase-based decoder.
+   */
+  protected int maxSourcePhraseLength = -1;
+
+  /**
+   * Returns the maximum source phrase length recorded for this grammar.
+   * 
+   * @return the length in symbols (nonterminal + terminal), or -1 if it was never set.
+   */
+  @Override
+  public int getMaxSourcePhraseLength() {
+    return maxSourcePhraseLength;
+  }
+
   @Override
   public int getOwner() {
     return owner;

diff --git a/src/joshua/decoder/ff/tm/Grammar.java b/src/joshua/decoder/ff/tm/Grammar.java
@@ -92,6 +92,11 @@ public interface Grammar {
    * Return the grammar's owner.
    */
   int getOwner();
+
+  /**
+   * Return the maximum source phrase length (terminals + nonterminals).
+   */
+  int getMaxSourcePhraseLength();
 
   /**
    * Add an OOV rule for the requested word for the grammar.

diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -49,9 +49,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
 
   private GrammarReader<Rule> modelReader;
 
-  /* Maximum source phrase length */
-  protected int maxSourcePhraseLength = 0;
-
   /* Whether the grammar's rules contain regular expressions. */
   private boolean isRegexpGrammar = false;
 
@@ -228,16 +225,7 @@ public boolean isRegexpGrammar() {
   public void setRegexpGrammar(boolean value) {
     this.isRegexpGrammar = value;
   }
-
-  /**
-   * Returns the longest source phrase read.
-   * 
-   * @return the longest source phrase read (nonterminal + terminal symbols).
-   */
-  public int getMaxSourcePhraseLength() {
-    return maxSourcePhraseLength;
-  }
-
+
   /***
    * Takes an input word and creates an OOV rule in the current grammar for that word.
    * 

diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -1,7 +1,7 @@
 package joshua.decoder.ff.tm.packed;
 
 /***
- * This package implements Joshua's packed grammar structure, which enables the efficient loading
+ * This package implements Joshua's packed grammar structure, which enables the efficient loading	
  * and accessing of grammars. It is described in the paper:
  * 
  * @article{ganitkevitch2012joshua,
@@ -61,6 +61,7 @@
 import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
 import joshua.util.encoding.EncoderConfiguration;
 import joshua.util.encoding.FloatEncoder;
+import joshua.util.io.LineReader;
 
 public class PackedGrammar extends AbstractGrammar {
 
@@ -83,6 +84,13 @@ public PackedGrammar(String grammar_dir, int span_limit, String owner, String ty
     Decoder.LOG(1, String.format("Reading vocabulary: %s", vocabFile));
     Vocabulary.read(vocabFile);
 
+    // Read the config
+    String configFile = grammar_dir + File.separator + "config";
+    if (new File(configFile).exists()) {
+      Decoder.LOG(1, String.format("Reading packed config: %s", configFile));
+      readConfig(configFile);
+    }
+
     // Read the quantizer setup.
     Decoder.LOG(1, String.format("Reading encoder configuration: %s%sencoding", grammar_dir, File.separator));
     encoding = new EncoderConfiguration();
@@ -837,4 +845,12 @@ public boolean isRegexpGrammar() {
   public void addOOVRules(int word, List<FeatureFunction> featureFunctions) {
     throw new RuntimeException("PackedGrammar: I can't add OOV rules");
   }
+
+  private void readConfig(String config) throws IOException { // parses "key = value" lines
+    for (String line: new LineReader(config)) {
+      String[] tokens = line.split(" = ");
+      if (tokens.length >= 2 && tokens[0].equals("max-source-len")) // ignore other lines
+        this.maxSourcePhraseLength = Integer.parseInt(tokens[1].trim());
+    }
+  }
 }
diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/joshua/decoder/phrase/PhraseTable.java
@@ -24,7 +24,6 @@ public class PhraseTable implements Grammar {
 
   private JoshuaConfiguration config;
   private Grammar backend;
-  private int maxSourcePhraseLength = -1;
 
   /**
    * Chain to the super with a number of defaults. For example, we only use a single nonterminal,
@@ -42,12 +41,11 @@ public PhraseTable(String grammarFile, String owner, JoshuaConfiguration config,
 
     if (new File(grammarFile).isDirectory()) {
       this.backend = new PackedGrammar(grammarFile, spanLimit, owner, "moses", config);
-      if (maxSource == -1) {
-        System.err.println("FATAL: Using a packed grammar for a phrase table backend requires");
-        System.err.println("       you to specify -max-source-len in the tm line");
+      if (this.backend.getMaxSourcePhraseLength() == -1) {
+        System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you");
+        System.err.println("       packed the grammar with Joshua 6.0.2 or greater");
         System.exit(-1);
       }
-      setMaxSourcePhraseLength(maxSource);
 
     } else {
       this.backend = new MemoryBasedBatchGrammar("moses", grammarFile, owner, "[X]", spanLimit, config);
@@ -59,28 +57,20 @@ public PhraseTable(String owner, JoshuaConfiguration config) {
 
     this.backend = new MemoryBasedBatchGrammar(owner, config);
   }
-
-  /**
-   * The maximum length of a source phrase found in the grammar.
-   * 
-   * @param max
-   */
-  public void setMaxSourcePhraseLength(int max) {
-    this.maxSourcePhraseLength = max;
-  }
 
   /**
-   * Returns the longest source phrase read, subtracting off the nonterminal that was added.
-   * The {@link MemoryBasedBatchGrammar} computes this as the grammar is read in, whereas
-   * a {@link PackedGrammar} has to be told the maximum length via the config file.
+   * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
+   * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value is read
+   * from the packed grammar's config file (written by the packer as of Joshua 6.0.2).
    * 
    * @return
    */
+  @Override
   public int getMaxSourcePhraseLength() {
     if (backend instanceof MemoryBasedBatchGrammar)
-      return ((MemoryBasedBatchGrammar)backend).getMaxSourcePhraseLength() - 1;
+      return this.backend.getMaxSourcePhraseLength() - 1;
     else
-      return this.maxSourcePhraseLength;
+      return this.backend.getMaxSourcePhraseLength();
   }
 
   /**

diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
@@ -4,6 +4,7 @@
 import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.nio.ByteBuffer;
@@ -51,6 +52,8 @@ public class GrammarPacker {
 
   private String dump;
 
+  private int max_source_len;
+
   static {
     SLICE_SIZE = 1000000;
     DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
@@ -65,6 +68,7 @@ public GrammarPacker(String grammar_filename, String config_filename, String out
     this.output = output_filename;
     this.dump = featuredump_filename;
     this.grammarAlignments = grammar_alignments;
+    this.max_source_len = 0;
 
     // TODO: Always open encoder config? This is debatable.
     this.types = new FeatureTypeAnalyzer(true);
@@ -152,6 +156,13 @@ public void pack() throws IOException {
     logger.info("Writing vocab.");
     Vocabulary.write(output + File.separator + "vocabulary");
 
+    String configFile = output + File.separator + "config";
+    logger.info(String.format("Writing config to '%s'", configFile));
+    // Write config options
+    FileWriter config = new FileWriter(configFile);
+    config.write(String.format("max-source-len = %d\n", max_source_len));
+    config.close();
+
     // Read previously written encoder configuration to match up to changed
     // vocabulary id's.
     logger.info("Reading encoding.");
@@ -204,6 +215,8 @@ private void explore(LineReader grammar) {
       String[] source = fields.get(0).split("\\s");
       String[] target = fields.get(1).split("\\s");
       String[] features = fields.get(2).split("\\s");
+
+      max_source_len = Math.max(max_source_len, source.length);
 
       Vocabulary.id(lhs);
       try {

diff --git a/test/decoder/phrase/constrained/config b/test/decoder/phrase/constrained/config
@@ -1,5 +1,5 @@
 tm = moses pt 0 ../decode/rules.1.gz
-tm = thrax pt 0 glue.grammar
+
 lm = kenlm 5 true false 100 ../decode/lm.1.gz
 
 mark-oovs = false

diff --git a/test/decoder/phrase/decode/rules.packed/config b/test/decoder/phrase/decode/rules.packed/config
@@ -0,0 +1 @@
+max-source-len = 5