Skip to content

Commit

Permalink
Grammar packer now records max source phrase length, big simplification
Browse files Browse the repository at this point in the history
  • Loading branch information
mjpost committed Apr 11, 2015
1 parent 45f2a19 commit fb16123
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 34 deletions.
15 changes: 15 additions & 0 deletions src/joshua/decoder/ff/tm/AbstractGrammar.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,21 @@ public abstract class AbstractGrammar implements Grammar {
*/
protected int owner = -1;

/**
 * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder.
 * Starts at -1, meaning no length has been recorded yet; PhraseTable treats a packed
 * grammar that still reports -1 as a fatal error.
 */
protected int maxSourcePhraseLength = -1;

/**
 * Reports the length of the longest source-side phrase recorded for this grammar.
 *
 * @return the maximum source phrase length, counting both nonterminal and terminal symbols
 */
@Override
public int getMaxSourcePhraseLength() {
  return this.maxSourcePhraseLength;
}

@Override
public int getOwner() {
return owner;
Expand Down
5 changes: 5 additions & 0 deletions src/joshua/decoder/ff/tm/Grammar.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ public interface Grammar {
* Return the grammar's owner.
*/
int getOwner();

/**
 * Return the maximum source phrase length (terminals + nonterminals).
 *
 * @return the length of the longest source-side phrase known to this grammar
 */
int getMaxSourcePhraseLength();

/**
* Add an OOV rule for the requested word for the grammar.
Expand Down
14 changes: 1 addition & 13 deletions src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {

private GrammarReader<Rule> modelReader;

/* Maximum source phrase length */
protected int maxSourcePhraseLength = 0;

/* Whether the grammar's rules contain regular expressions. */
private boolean isRegexpGrammar = false;

Expand Down Expand Up @@ -228,16 +225,7 @@ public boolean isRegexpGrammar() {
/**
 * Sets the flag recording whether this grammar's rules contain regular expressions.
 *
 * @param value the new value of the flag
 */
public void setRegexpGrammar(boolean value) {
  isRegexpGrammar = value;
}

/**
 * Returns the length of the longest source phrase read.
 *
 * @return the longest source phrase length (nonterminal + terminal symbols)
 */
public int getMaxSourcePhraseLength() {
return maxSourcePhraseLength;
}


/***
* Takes an input word and creates an OOV rule in the current grammar for that word.
*
Expand Down
18 changes: 17 additions & 1 deletion src/joshua/decoder/ff/tm/packed/PackedGrammar.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package joshua.decoder.ff.tm.packed;

/***
* This package implements Joshua's packed grammar structure, which enables the efficient loading
* This package implements Joshua's packed grammar structure, which enables the efficient loading
* and accessing of grammars. It is described in the paper:
*
* @article{ganitkevitch2012joshua,
Expand Down Expand Up @@ -61,6 +61,7 @@
import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
import joshua.util.encoding.EncoderConfiguration;
import joshua.util.encoding.FloatEncoder;
import joshua.util.io.LineReader;

public class PackedGrammar extends AbstractGrammar {

Expand All @@ -83,6 +84,13 @@ public PackedGrammar(String grammar_dir, int span_limit, String owner, String ty
Decoder.LOG(1, String.format("Reading vocabulary: %s", vocabFile));
Vocabulary.read(vocabFile);

// Read the config
String configFile = grammar_dir + File.separator + "config";
if (new File(configFile).exists()) {
Decoder.LOG(1, String.format("Reading packed config: %s", configFile));
readConfig(configFile);
}

// Read the quantizer setup.
Decoder.LOG(1, String.format("Reading encoder configuration: %s%sencoding", grammar_dir, File.separator));
encoding = new EncoderConfiguration();
Expand Down Expand Up @@ -837,4 +845,12 @@ public boolean isRegexpGrammar() {
public void addOOVRules(int word, List<FeatureFunction> featureFunctions) {
  // Adding rules to a packed grammar is not supported, so this always fails.
  // UnsupportedOperationException is a RuntimeException, so existing callers that catch
  // RuntimeException keep working.
  throw new UnsupportedOperationException("PackedGrammar: I can't add OOV rules");
}

/**
 * Reads settings from the packed grammar's config file. Each line is expected to have the
 * form {@code key = value}; whitespace around the key and the value is ignored. Only the
 * {@code max-source-len} key is recognized, and it sets {@link #maxSourcePhraseLength}.
 * Lines without an {@code =} and lines with other keys are skipped.
 *
 * @param config path of the config file to read
 * @throws IOException declared for failures while reading the file
 * @throws NumberFormatException if the {@code max-source-len} value is not an integer
 */
private void readConfig(String config) throws IOException {
  for (String line : new LineReader(config)) {
    // Split on the first '=' only and trim both sides, so "key=value", "key = value"
    // and values with trailing whitespace are all accepted.
    String[] tokens = line.split("=", 2);
    if (tokens.length < 2)
      continue;

    String key = tokens[0].trim();
    String value = tokens[1].trim();
    if (key.equals("max-source-len"))
      this.maxSourcePhraseLength = Integer.parseInt(value);
  }
}
}
28 changes: 9 additions & 19 deletions src/joshua/decoder/phrase/PhraseTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ public class PhraseTable implements Grammar {

private JoshuaConfiguration config;
private Grammar backend;
private int maxSourcePhraseLength = -1;

/**
* Chain to the super with a number of defaults. For example, we only use a single nonterminal,
Expand All @@ -42,12 +41,11 @@ public PhraseTable(String grammarFile, String owner, JoshuaConfiguration config,

if (new File(grammarFile).isDirectory()) {
this.backend = new PackedGrammar(grammarFile, spanLimit, owner, "moses", config);
if (maxSource == -1) {
System.err.println("FATAL: Using a packed grammar for a phrase table backend requires");
System.err.println(" you to specify -max-source-len in the tm line");
if (this.backend.getMaxSourcePhraseLength() == -1) {
System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you");
System.err.println(" packed the grammar with Joshua 6.0.2 or greater");
System.exit(-1);
}
setMaxSourcePhraseLength(maxSource);

} else {
this.backend = new MemoryBasedBatchGrammar("moses", grammarFile, owner, "[X]", spanLimit, config);
Expand All @@ -59,28 +57,20 @@ public PhraseTable(String owner, JoshuaConfiguration config) {

this.backend = new MemoryBasedBatchGrammar(owner, config);
}

/**
 * Sets the maximum length of a source phrase found in the grammar.
 *
 * @param max the maximum source phrase length to record
 */
public void setMaxSourcePhraseLength(int max) {
this.maxSourcePhraseLength = max;
}

/**
* Returns the longest source phrase read, subtracting off the nonterminal that was added.
* The {@link MemoryBasedBatchGrammar} computes this as the grammar is read in, whereas
* a {@link PackedGrammar} has to be told the maximum length via the config file.
* Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
* since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value is read
* from the packed grammar's config file (written by the grammar packer in Joshua 6.0.2+).
*
* @return
*/
@Override
public int getMaxSourcePhraseLength() {
if (backend instanceof MemoryBasedBatchGrammar)
return ((MemoryBasedBatchGrammar)backend).getMaxSourcePhraseLength() - 1;
return this.backend.getMaxSourcePhraseLength() - 1;
else
return this.maxSourcePhraseLength;
return this.backend.getMaxSourcePhraseLength();
}

/**
Expand Down
13 changes: 13 additions & 0 deletions src/joshua/tools/GrammarPacker.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -51,6 +52,8 @@ public class GrammarPacker {

private String dump;

/* Largest number of whitespace-separated source-side tokens seen in any rule so far;
 * written to the packed grammar's config file as "max-source-len". */
private int max_source_len;

static {
SLICE_SIZE = 1000000;
DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
Expand All @@ -65,6 +68,7 @@ public GrammarPacker(String grammar_filename, String config_filename, String out
this.output = output_filename;
this.dump = featuredump_filename;
this.grammarAlignments = grammar_alignments;
this.max_source_len = 0;

// TODO: Always open encoder config? This is debatable.
this.types = new FeatureTypeAnalyzer(true);
Expand Down Expand Up @@ -152,6 +156,13 @@ public void pack() throws IOException {
logger.info("Writing vocab.");
Vocabulary.write(output + File.separator + "vocabulary");

String configFile = output + File.separator + "config";
logger.info(String.format("Writing config to '%s'", configFile));
// Write config options
FileWriter config = new FileWriter(configFile);
config.write(String.format("max-source-len = %d\n", max_source_len));
config.close();

// Read previously written encoder configuration to match up to changed
// vocabulary id's.
logger.info("Reading encoding.");
Expand Down Expand Up @@ -204,6 +215,8 @@ private void explore(LineReader grammar) {
String[] source = fields.get(0).split("\\s");
String[] target = fields.get(1).split("\\s");
String[] features = fields.get(2).split("\\s");

max_source_len = Math.max(max_source_len, source.length);

Vocabulary.id(lhs);
try {
Expand Down
2 changes: 1 addition & 1 deletion test/decoder/phrase/constrained/config
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
tm = moses pt 0 ../decode/rules.1.gz
tm = thrax pt 0 glue.grammar

lm = kenlm 5 true false 100 ../decode/lm.1.gz

mark-oovs = false
Expand Down
1 change: 1 addition & 0 deletions test/decoder/phrase/decode/rules.packed/config
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
max-source-len = 5

0 comments on commit fb16123

Please sign in to comment.