kenlm update
- Fix case where "foo bar baz" appears but "bar baz" does not.  Previously probing silently returned the wrong answer and trie silently broke.  
- More aggressive recombination: if "baz quux" is never followed by any word, then do not include "bar" in the state.  
- kenlm assumes that "foo bar" is present if "foo bar baz" is.  This is now checked.  
- Binary format version number bump because the format has changed to support the above.  
- Lower memory consumption for trie building.  But building takes longer, in order to ensure correct handling of blanks and aggressive recombination.  
- Fix progress bar newlines on trie building.

Agrees with SRI's 1-best outputs on the WMT 10 evaluation set.  
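
For illustration, a toy sketch (hypothetical code, not part of this commit) of the prefix-consistency rule in the third bullet: every n-gram's context must itself be present as an (n-1)-gram, while a missing suffix such as "bar baz" remains legal and is handled with blanks instead.

#include <set>
#include <string>
#include <vector>

// Returns true if every n-gram's context (all words but the last) is itself
// present.  "foo bar baz" without "foo bar" would fail this check.
bool ContextsPresent(const std::set<std::vector<std::string> > &ngrams) {
  for (std::set<std::vector<std::string> >::const_iterator i = ngrams.begin();
       i != ngrams.end(); ++i) {
    if (i->size() < 2) continue;
    std::vector<std::string> context(i->begin(), i->end() - 1);
    if (!ngrams.count(context)) return false;
  }
  return true;
}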



git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3847 1f5c12ca-751b-0410-a591-d2e778427230
heafield committed Jan 25, 2011
1 parent 46bc5bc commit 8c13881
Showing 35 changed files with 1,158 additions and 446 deletions.
2 changes: 0 additions & 2 deletions kenlm/Makefile.am
@@ -13,14 +13,12 @@ libkenlm_la_SOURCES = \
lm/read_arpa.cc \
lm/virtual_interface.cc \
lm/vocab.cc \
util/string_piece.cc \
util/scoped.cc \
util/murmur_hash.cc \
util/mmap.cc \
util/file_piece.cc \
util/ersatz_progress.cc \
util/exception.cc \
util/string_piece.cc \
util/bit_packing.cc

query_SOURCES = lm/ngram_query.cc
73 changes: 43 additions & 30 deletions kenlm/lm/binary_format.cc
@@ -18,8 +18,8 @@ namespace lm {
namespace ngram {
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 1\n\0";
const long int kMagicVersion = 1;
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 4\n\0";
const long int kMagicVersion = 4;

// Test values.
struct Sanity {
@@ -76,6 +76,45 @@ void WriteHeader(void *to, const Parameters &params) {
}

} // namespace

uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total = TotalHeaderSize(order) + memory_size;
backing.vocab.reset(util::MapZeroedWrite(config.write_mmap, total, backing.file), total, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
} else {
backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.vocab.get());
}
}

uint8_t *GrowForSearch(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params;
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
WriteHeader(backing.vocab.get(), params);

// Grow the file to accommodate the search, using zeros.
if (-1 == ftruncate(backing.file.get(), backing.vocab.size() + memory_size))
UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (backing.vocab.size() + memory_size) << " failed");

// We're skipping over the header and vocab for the search space mmap. mmap requires page-aligned offsets, so round the offset down to a page boundary.
off_t page_size = sysconf(_SC_PAGE_SIZE);
off_t alignment_cruft = backing.vocab.size() % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), backing.vocab.size() - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);

return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
} else {
backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.search.get());
}
}

namespace detail {

bool IsBinaryFormat(int fd) {
@@ -128,7 +167,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);

util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.memory);
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);

if (config.enumerate_vocab && !params.fixed.has_vocabulary)
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
@@ -137,33 +176,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary file to vocab words");
}
return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(params.counts.size());
}

uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total_map = TotalHeaderSize(counts.size()) + memory_size;
// Write out an mmap file.
backing.memory.reset(util::MapZeroedWrite(config.write_mmap, total_map, backing.file), total_map, util::scoped_memory::MMAP_ALLOCATED);

Parameters params;
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;

WriteHeader(backing.memory.get(), params);

if (params.fixed.has_vocabulary) {
if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary file " << config.write_mmap << " to vocab words");
}
return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(counts.size());
} else {
backing.memory.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.memory.get());
}
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
}

void ComplainAboutARPA(const Config &config, ModelType model_type) {
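As a reference point, a standalone sketch (not the commit's code; the 10000-byte size is made up) of the page-alignment arithmetic GrowForSearch performs above:

#include <unistd.h>
#include <sys/types.h>

int main() {
  off_t page_size = sysconf(_SC_PAGE_SIZE);        // e.g. 4096
  off_t vocab_end = 10000;                         // hypothetical header + vocab bytes
  off_t alignment_cruft = vocab_end % page_size;   // 10000 % 4096 = 1808
  off_t map_offset = vocab_end - alignment_cruft;  // 8192, page aligned
  // GrowForSearch maps alignment_cruft + memory_size bytes at map_offset and
  // returns base + alignment_cruft, so the search structure starts at vocab_end.
  (void)map_offset;
  return 0;
}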
22 changes: 10 additions & 12 deletions kenlm/lm/binary_format.hh
@@ -35,10 +35,16 @@ struct Parameters {
struct Backing {
// File behind memory, if any.
util::scoped_fd file;
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
util::scoped_memory vocab;
// Raw block of memory backing the language model data structures
util::scoped_memory memory;
util::scoped_memory search;
};

uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing);

namespace detail {

bool IsBinaryFormat(int fd);
@@ -49,8 +55,6 @@ void MatchCheck(ModelType model_type, const Parameters &params);

uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing);

uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing);

void ComplainAboutARPA(const Config &config, ModelType model_type);

} // namespace detail
@@ -61,26 +65,20 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to) {
Backing &backing = to.MutableBacking();
backing.file.reset(util::OpenReadOrThrow(file));

Parameters params;

try {
if (detail::IsBinaryFormat(backing.file.get())) {
Parameters params;
detail::ReadHeader(backing.file.get(), params);
detail::MatchCheck(To::kModelType, params);
// Replace the probing_multiplier.
// Replace the run-time configured probing_multiplier with the one in the file.
Config new_config(config);
new_config.probing_multiplier = params.fixed.probing_multiplier;
std::size_t memory_size = To::Size(params.counts, new_config);
uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
to.InitializeFromBinary(start, params, new_config, backing.file.get());
} else {
detail::ComplainAboutARPA(config, To::kModelType);
util::FilePiece f(backing.file.release(), file, config.messages);
ReadARPACounts(f, params.counts);
std::size_t memory_size = To::Size(params.counts, config);
uint8_t *start = detail::SetupZeroed(config, To::kModelType, params.counts, memory_size, backing);

to.InitializeFromARPA(file, f, start, params, config);
to.InitializeFromARPA(file, config);
}
} catch (util::Exception &e) {
e << " in file " << file;
53 changes: 53 additions & 0 deletions kenlm/lm/blank.hh
@@ -0,0 +1,53 @@
#ifndef LM_BLANK__
#define LM_BLANK__

#include <limits>

#include <inttypes.h>
#include <math.h>

namespace lm {
namespace ngram {

/* Suppose "foo bar" appears with zero backoff but there is no trigram
* beginning with these words. Then, when scoring "foo bar", the model could
* return out_state containing "bar" or even null context if "bar" also has no
* backoff and is never followed by another word. Then the backoff is set to
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so
* backoff can be properly charged.
* The two constants differ only in the sign bit because the backoff is in fact
* zero in either case.
*/
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;

inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}

// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret;
compare.f = kNoExtensionBackoff;
interpret.f = backoff;
return compare.i != interpret.i;
}

/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
* "baz quux" (because they were pruned). 1.2% of n-grams generated by SRI
* with default settings on the benchmark data set are like this. Since search
* proceeds by finding "quux", "baz quux", "bar baz quux", and finally
* "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
* inserted. The blanks have probability kBlankProb and backoff kBlankBackoff.
* A blank is recognized by kBlankProb in the probability field; kBlankBackoff
* must be 0 so that inference assesses zero backoff from these blanks.
*/
const float kBlankProb = -std::numeric_limits<float>::infinity();
const float kBlankBackoff = kNoExtensionBackoff;

} // namespace ngram
} // namespace lm
#endif // LM_BLANK__
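
A standalone demonstration (hypothetical code, not part of the commit) of why HasExtension above compares bit patterns instead of using ==: IEEE 754 defines -0.0f == 0.0f as true, so only the sign bit distinguishes the two flag values.

#include <cassert>
#include <cstring>
#include <inttypes.h>

int main() {
  const float no_extension = -0.0f, extension = 0.0f;
  assert(no_extension == extension);   // equal under float comparison
  uint32_t no_bits, ext_bits;
  std::memcpy(&no_bits, &no_extension, sizeof(no_bits));
  std::memcpy(&ext_bits, &extension, sizeof(ext_bits));
  assert(no_bits != ext_bits);         // 0x80000000 vs 0x00000000
  return 0;
}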
12 changes: 6 additions & 6 deletions kenlm/lm/build_binary.cc
@@ -21,10 +21,10 @@ void Usage(const char *name) {
"memory and is still faster than SRI or IRST. Building the trie format uses an\n"
"on-disk sort to save memory.\n"
"-t is the temporary directory prefix. Default is the output file name.\n"
"-m is the amount of memory to use, in MB. Default is 1024MB (1GB).\n\n"
"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n\n"
/*"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
"It uses more memory than trie and is also slower, so there's no real reason to\n"
"use it.\n\n"
"use it.\n\n"*/
"See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
"Passing only an input file will print memory usage of each data structure.\n"
"If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";
@@ -52,13 +52,13 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::size_t probing_size = ProbingModel::Size(counts, config);
// probing is always largest so use it to determine number of columns.
long int length = std::max<long int>(5, lrint(ceil(log10(probing_size))));
std::cout << "Memory usage:\ntype ";
std::cout << "Memory estimate:\ntype ";
// right align bytes.
for (long int i = 0; i < length - 5; ++i) std::cout << ' ';
std::cout << "bytes\n"
"probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n"
"sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
/* "sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
}

} // namespace ngram
4 changes: 2 additions & 2 deletions kenlm/lm/config.hh
@@ -39,7 +39,7 @@ struct Config {
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
// for sorted variant.
// If you find yourself setting this to a low number, consider using the
// Sorted version instead which has lower memory consumption.
// TrieModel which has lower memory consumption.
float probing_multiplier;

// Amount of memory to use for building. The actual memory usage will be
@@ -53,7 +53,7 @@
// defaults to input file name.
const char *temporary_directory_prefix;

// Level of complaining to do when an ARPA instead of a binary format.
// Level of complaining to do when loading from ARPA instead of binary format.
typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;
ARPALoadComplain arpa_complain;

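Worked example of the time formula in the probing comment above: a probing_multiplier of 1.5 gives 1.5 / (1.5 - 1) = 3 expected probes per lookup, while 2.0 gives 2.0 / 1.0 = 2, trading memory for speed.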
14 changes: 14 additions & 0 deletions kenlm/lm/max_order.hh
@@ -0,0 +1,14 @@
#ifndef LM_MAX_ORDER__
#define LM_MAX_ORDER__
namespace lm {
namespace ngram {
// If you need higher order, change this and recompile.
// Having this limit means that State can be
// (kMaxOrder - 1) * sizeof(float) bytes instead of
// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
const unsigned char kMaxOrder = 6;

} // namespace ngram
} // namespace lm

#endif // LM_MAX_ORDER__
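
A hypothetical illustration (invented names, not the library's actual State) of the layout the comment above describes: with a compile-time kMaxOrder, per-hypothesis state can hold its backoffs inline rather than behind a heap pointer.

const unsigned char kMaxOrder = 6;

// Inline layout enabled by the compile-time bound: fixed size, no allocation.
struct InlineState {
  float backoff[kMaxOrder - 1];  // (kMaxOrder - 1) * sizeof(float) bytes
  unsigned char valid_length;    // entries actually in use
};

// The alternative the comment rejects: a pointer plus malloc overhead.
struct PointerState {
  float *backoff;
  unsigned char valid_length;
};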