From fb161236d1a2b92189e86cd412c36dc641cc246a Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 10 Apr 2015 20:10:07 -0400 Subject: [PATCH] Grammar packer now records max source phrase length, big simplification --- src/joshua/decoder/ff/tm/AbstractGrammar.java | 15 ++++++++++ src/joshua/decoder/ff/tm/Grammar.java | 5 ++++ .../hash_based/MemoryBasedBatchGrammar.java | 14 +--------- .../decoder/ff/tm/packed/PackedGrammar.java | 18 +++++++++++- src/joshua/decoder/phrase/PhraseTable.java | 28 ++++++------------- src/joshua/tools/GrammarPacker.java | 13 +++++++++ test/decoder/phrase/constrained/config | 2 +- .../decoder/phrase/decode/rules.packed/config | 1 + 8 files changed, 62 insertions(+), 34 deletions(-) create mode 100644 test/decoder/phrase/decode/rules.packed/config diff --git a/src/joshua/decoder/ff/tm/AbstractGrammar.java b/src/joshua/decoder/ff/tm/AbstractGrammar.java index eac172056..65c792eb7 100644 --- a/src/joshua/decoder/ff/tm/AbstractGrammar.java +++ b/src/joshua/decoder/ff/tm/AbstractGrammar.java @@ -44,6 +44,21 @@ public abstract class AbstractGrammar implements Grammar { */ protected int owner = -1; + /* + * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder. + */ + protected int maxSourcePhraseLength = -1; + + /** + * Returns the longest source phrase read. + * + * @return the longest source phrase read (nonterminal + terminal symbols). + */ + @Override + public int getMaxSourcePhraseLength() { + return maxSourcePhraseLength; + } + @Override public int getOwner() { return owner; diff --git a/src/joshua/decoder/ff/tm/Grammar.java b/src/joshua/decoder/ff/tm/Grammar.java index f9a14cbe9..72850bbca 100644 --- a/src/joshua/decoder/ff/tm/Grammar.java +++ b/src/joshua/decoder/ff/tm/Grammar.java @@ -92,6 +92,11 @@ public interface Grammar { * Return the grammar's owner. */ int getOwner(); + + /** + * Return the maximum source phrase length (terminals + nonterminals). 
+ */ + int getMaxSourcePhraseLength(); /** * Add an OOV rule for the requested word for the grammar. diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java index 79bc70e4f..33e56bdd3 100644 --- a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java +++ b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java @@ -49,9 +49,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar { private GrammarReader modelReader; - /* Maximum source phrase length */ - protected int maxSourcePhraseLength = 0; - /* Whether the grammar's rules contain regular expressions. */ private boolean isRegexpGrammar = false; @@ -228,16 +225,7 @@ public boolean isRegexpGrammar() { public void setRegexpGrammar(boolean value) { this.isRegexpGrammar = value; } - - /** - * Returns the longest source phrase read. - * - * @return the longest source phrase read (nonterminal + terminal symbols). - */ - public int getMaxSourcePhraseLength() { - return maxSourcePhraseLength; - } - + /*** * Takes an input word and creates an OOV rule in the current grammar for that word. * diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java index 6fef54f4e..0bc9e79ef 100644 --- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java +++ b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java @@ -1,7 +1,7 @@ package joshua.decoder.ff.tm.packed; /*** - * This package implements Joshua's packed grammar structure, which enables the efficient loading + * This package implements Joshua's packed grammar structure, which enables the efficient loading * and accessing of grammars. 
It is described in the paper: * * @article{ganitkevitch2012joshua, @@ -61,6 +61,7 @@ import joshua.decoder.ff.tm.hash_based.ExtensionIterator; import joshua.util.encoding.EncoderConfiguration; import joshua.util.encoding.FloatEncoder; +import joshua.util.io.LineReader; public class PackedGrammar extends AbstractGrammar { @@ -83,6 +84,13 @@ public PackedGrammar(String grammar_dir, int span_limit, String owner, String ty Decoder.LOG(1, String.format("Reading vocabulary: %s", vocabFile)); Vocabulary.read(vocabFile); + // Read the config + String configFile = grammar_dir + File.separator + "config"; + if (new File(configFile).exists()) { + Decoder.LOG(1, String.format("Reading packed config: %s", configFile)); + readConfig(configFile); + } + // Read the quantizer setup. Decoder.LOG(1, String.format("Reading encoder configuration: %s%sencoding", grammar_dir, File.separator)); encoding = new EncoderConfiguration(); @@ -837,4 +845,12 @@ public boolean isRegexpGrammar() { public void addOOVRules(int word, List featureFunctions) { throw new RuntimeException("PackedGrammar: I can't add OOV rules"); } + + private void readConfig(String config) throws IOException { + for (String line: new LineReader(config)) { + String[] tokens = line.split(" = "); + if (tokens[0].equals("max-source-len")) + this.maxSourcePhraseLength = Integer.parseInt(tokens[1]); + } + } } diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/joshua/decoder/phrase/PhraseTable.java index fd8577d49..ef694a3f9 100644 --- a/src/joshua/decoder/phrase/PhraseTable.java +++ b/src/joshua/decoder/phrase/PhraseTable.java @@ -24,7 +24,6 @@ public class PhraseTable implements Grammar { private JoshuaConfiguration config; private Grammar backend; - private int maxSourcePhraseLength = -1; /** * Chain to the super with a number of defaults. 
For example, we only use a single nonterminal, @@ -42,12 +41,11 @@ public PhraseTable(String grammarFile, String owner, JoshuaConfiguration config, if (new File(grammarFile).isDirectory()) { this.backend = new PackedGrammar(grammarFile, spanLimit, owner, "moses", config); - if (maxSource == -1) { - System.err.println("FATAL: Using a packed grammar for a phrase table backend requires"); - System.err.println(" you to specify -max-source-len in the tm line"); + if (this.backend.getMaxSourcePhraseLength() == -1) { + System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you"); + System.err.println(" packed the grammar with Joshua 6.0.2 or greater"); System.exit(-1); } - setMaxSourcePhraseLength(maxSource); } else { this.backend = new MemoryBasedBatchGrammar("moses", grammarFile, owner, "[X]", spanLimit, config); @@ -59,28 +57,20 @@ public PhraseTable(String owner, JoshuaConfiguration config) { this.backend = new MemoryBasedBatchGrammar(owner, config); } - - /** - * The maximum length of a source phrase found in the grammar. - * - * @param max - */ - public void setMaxSourcePhraseLength(int max) { - this.maxSourcePhraseLength = max; - } /** - * Returns the longest source phrase read, subtracting off the nonterminal that was added. - * The {@link MemoryBasedBatchGrammar} computes this as the grammar is read in, whereas - * a {@link PackedGrammar} has to be told the maximum length via the config file. + * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1 + * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value is read + * from the config file written by the grammar packer (Joshua 6.0.2+).
* * @return */ + @Override public int getMaxSourcePhraseLength() { if (backend instanceof MemoryBasedBatchGrammar) - return ((MemoryBasedBatchGrammar)backend).getMaxSourcePhraseLength() - 1; + return this.backend.getMaxSourcePhraseLength() - 1; else - return this.maxSourcePhraseLength; + return this.backend.getMaxSourcePhraseLength(); } /** diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java index 351f3c6b9..72280e274 100644 --- a/src/joshua/tools/GrammarPacker.java +++ b/src/joshua/tools/GrammarPacker.java @@ -4,6 +4,7 @@ import java.io.DataOutputStream; import java.io.File; import java.io.FileOutputStream; +import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.nio.ByteBuffer; @@ -51,6 +52,8 @@ public class GrammarPacker { private String dump; + private int max_source_len; + static { SLICE_SIZE = 1000000; DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8); @@ -65,6 +68,7 @@ public GrammarPacker(String grammar_filename, String config_filename, String out this.output = output_filename; this.dump = featuredump_filename; this.grammarAlignments = grammar_alignments; + this.max_source_len = 0; // TODO: Always open encoder config? This is debatable. this.types = new FeatureTypeAnalyzer(true); @@ -152,6 +156,13 @@ public void pack() throws IOException { logger.info("Writing vocab."); Vocabulary.write(output + File.separator + "vocabulary"); + String configFile = output + File.separator + "config"; + logger.info(String.format("Writing config to '%s'", configFile)); + // Write config options + FileWriter config = new FileWriter(configFile); + config.write(String.format("max-source-len = %d\n", max_source_len)); + config.close(); + // Read previously written encoder configuration to match up to changed // vocabulary id's. 
logger.info("Reading encoding."); @@ -204,6 +215,8 @@ private void explore(LineReader grammar) { String[] source = fields.get(0).split("\\s"); String[] target = fields.get(1).split("\\s"); String[] features = fields.get(2).split("\\s"); + + max_source_len = Math.max(max_source_len, source.length); Vocabulary.id(lhs); try { diff --git a/test/decoder/phrase/constrained/config b/test/decoder/phrase/constrained/config index 2b6eae4a5..3979bd6d3 100644 --- a/test/decoder/phrase/constrained/config +++ b/test/decoder/phrase/constrained/config @@ -1,5 +1,5 @@ tm = moses pt 0 ../decode/rules.1.gz -tm = thrax pt 0 glue.grammar + lm = kenlm 5 true false 100 ../decode/lm.1.gz mark-oovs = false diff --git a/test/decoder/phrase/decode/rules.packed/config b/test/decoder/phrase/decode/rules.packed/config new file mode 100644 index 000000000..287da2d63 --- /dev/null +++ b/test/decoder/phrase/decode/rules.packed/config @@ -0,0 +1 @@ +max-source-len = 5