kenlm update
- Fix case where "foo bar baz" appears but "bar baz" does not.  Previously probing silently returned the wrong answer and trie silently broke.  
- More aggressive recombination: if "baz quux" is never followed by any word, then do not include "bar" in the state.  
- kenlm assumes that "foo bar" is present if "foo bar baz" is.  This is now checked.  
- Binary format version number bump because the format has changed to support the above.  
- Lower memory consumption for trie building.  But building takes longer, in order to ensure correct handling of blanks and aggressive recombination.  
- Fix progress bar newlines on trie building.

Agrees with SRI's 1-best outputs on the WMT 10 evaluation set.  
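
For illustration, a toy sketch (hypothetical code, not part of this commit) of the prefix-consistency rule in the third bullet: every n-gram's context must itself be present as an (n-1)-gram, while a missing suffix such as "bar baz" remains legal and is handled with blanks instead.

#include <set>
#include <string>
#include <vector>

// Returns true if every n-gram's context (all words but the last) is itself
// present.  "foo bar baz" without "foo bar" would fail this check.
bool ContextsPresent(const std::set<std::vector<std::string> > &ngrams) {
  for (std::set<std::vector<std::string> >::const_iterator i = ngrams.begin();
       i != ngrams.end(); ++i) {
    if (i->size() < 2) continue;
    std::vector<std::string> context(i->begin(), i->end() - 1);
    if (!ngrams.count(context)) return false;
  }
  return true;
}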



git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3847 1f5c12ca-751b-0410-a591-d2e778427230
heafield committed Jan 25, 2011
1 parent 46bc5bc commit 8c13881
Showing 35 changed files with 1,158 additions and 446 deletions.
2 changes: 0 additions & 2 deletions kenlm/Makefile.am
@@ -13,14 +13,12 @@ libkenlm_la_SOURCES = \
lm/read_arpa.cc \
lm/virtual_interface.cc \
lm/vocab.cc \
util/string_piece.cc \
util/scoped.cc \
util/murmur_hash.cc \
util/mmap.cc \
util/file_piece.cc \
util/ersatz_progress.cc \
util/exception.cc \
util/string_piece.cc \
util/bit_packing.cc

query_SOURCES = lm/ngram_query.cc
73 changes: 43 additions & 30 deletions kenlm/lm/binary_format.cc
@@ -18,8 +18,8 @@ namespace lm {
namespace ngram {
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 1\n\0";
const long int kMagicVersion = 1;
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 4\n\0";
const long int kMagicVersion = 4;

// Test values.
struct Sanity {
@@ -76,6 +76,45 @@ void WriteHeader(void *to, const Parameters &params) {
}

} // namespace

uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total = TotalHeaderSize(order) + memory_size;
backing.vocab.reset(util::MapZeroedWrite(config.write_mmap, total, backing.file), total, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
} else {
backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.vocab.get());
}
}

uint8_t *GrowForSearch(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params;
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
WriteHeader(backing.vocab.get(), params);

// Grow the file to accommodate the search, using zeros.
if (-1 == ftruncate(backing.file.get(), backing.vocab.size() + memory_size))
UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (backing.vocab.size() + memory_size) << " failed");

// We're skipping over the header and vocab for the search space mmap. mmap requires page-aligned offsets, so round the offset down to a page boundary.
off_t page_size = sysconf(_SC_PAGE_SIZE);
off_t alignment_cruft = backing.vocab.size() % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), backing.vocab.size() - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);

return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
} else {
backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.search.get());
}
}

namespace detail {

bool IsBinaryFormat(int fd) {
@@ -128,7 +167,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);

util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.memory);
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);

if (config.enumerate_vocab && !params.fixed.has_vocabulary)
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
@@ -137,33 +176,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary file to vocab words");
}
return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(params.counts.size());
}

uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
std::size_t total_map = TotalHeaderSize(counts.size()) + memory_size;
// Write out an mmap file.
backing.memory.reset(util::MapZeroedWrite(config.write_mmap, total_map, backing.file), total_map, util::scoped_memory::MMAP_ALLOCATED);

Parameters params;
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;

WriteHeader(backing.memory.get(), params);

if (params.fixed.has_vocabulary) {
if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary file " << config.write_mmap << " to vocab words");
}
return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(counts.size());
} else {
backing.memory.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.memory.get());
}
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
}

void ComplainAboutARPA(const Config &config, ModelType model_type) {
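As a reference point, a standalone sketch (not the commit's code; the 10000-byte size is made up) of the page-alignment arithmetic GrowForSearch performs above:

#include <unistd.h>
#include <sys/types.h>

int main() {
  off_t page_size = sysconf(_SC_PAGE_SIZE);        // e.g. 4096
  off_t vocab_end = 10000;                         // hypothetical header + vocab bytes
  off_t alignment_cruft = vocab_end % page_size;   // 10000 % 4096 = 1808
  off_t map_offset = vocab_end - alignment_cruft;  // 8192, page aligned
  // GrowForSearch maps alignment_cruft + memory_size bytes at map_offset and
  // returns base + alignment_cruft, so the search structure starts at vocab_end.
  (void)map_offset;
  return 0;
}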
22 changes: 10 additions & 12 deletions kenlm/lm/binary_format.hh
@@ -35,10 +35,16 @@ struct Parameters {
struct Backing {
// File behind memory, if any.
util::scoped_fd file;
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
util::scoped_memory vocab;
// Raw block of memory backing the language model data structures
util::scoped_memory memory;
util::scoped_memory search;
};

uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing);

namespace detail {

bool IsBinaryFormat(int fd);
@@ -49,8 +55,6 @@ void MatchCheck(ModelType model_type, const Parameters &params);

uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing);

uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing);

void ComplainAboutARPA(const Config &config, ModelType model_type);

} // namespace detail
@@ -61,26 +65,20 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to) {
Backing &backing = to.MutableBacking();
backing.file.reset(util::OpenReadOrThrow(file));

Parameters params;

try {
if (detail::IsBinaryFormat(backing.file.get())) {
Parameters params;
detail::ReadHeader(backing.file.get(), params);
detail::MatchCheck(To::kModelType, params);
// Replace the probing_multiplier.
// Replace the run-time configured probing_multiplier with the one in the file.
Config new_config(config);
new_config.probing_multiplier = params.fixed.probing_multiplier;
std::size_t memory_size = To::Size(params.counts, new_config);
uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
to.InitializeFromBinary(start, params, new_config, backing.file.get());
} else {
detail::ComplainAboutARPA(config, To::kModelType);
util::FilePiece f(backing.file.release(), file, config.messages);
ReadARPACounts(f, params.counts);
std::size_t memory_size = To::Size(params.counts, config);
uint8_t *start = detail::SetupZeroed(config, To::kModelType, params.counts, memory_size, backing);

to.InitializeFromARPA(file, f, start, params, config);
to.InitializeFromARPA(file, config);
}
} catch (util::Exception &e) {
e << " in file " << file;
53 changes: 53 additions & 0 deletions kenlm/lm/blank.hh
@@ -0,0 +1,53 @@
#ifndef LM_BLANK__
#define LM_BLANK__

#include <limits>

#include <inttypes.h>
#include <math.h>

namespace lm {
namespace ngram {

/* Suppose "foo bar" appears with zero backoff but there is no trigram
* beginning with these words. Then, when scoring "foo bar", the model could
* return out_state containing "bar" or even null context if "bar" also has no
* backoff and is never followed by another word. Then the backoff is set to
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so
* backoff can be properly charged.
* The two constants differ only in the sign bit because the backoff is in fact
* zero in either case.
*/
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;

inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}

// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret;
compare.f = kNoExtensionBackoff;
interpret.f = backoff;
return compare.i != interpret.i;
}

/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
* "baz quux" (because they were pruned). 1.2% of n-grams generated by SRI
* with default settings on the benchmark data set are like this. Since search
* proceeds by finding "quux", "baz quux", "bar baz quux", and finally
* "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
* inserted. The blanks have probability kBlankProb and backoff kBlankBackoff.
* A blank is recognized by kBlankProb in the probability field; kBlankBackoff
* must be 0 so that inference assesses zero backoff from these blanks.
*/
const float kBlankProb = -std::numeric_limits<float>::infinity();
const float kBlankBackoff = kNoExtensionBackoff;

} // namespace ngram
} // namespace lm
#endif // LM_BLANK__
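
A standalone demonstration (hypothetical code, not part of the commit) of why HasExtension above compares bit patterns instead of using ==: IEEE 754 defines -0.0f == 0.0f as true, so only the sign bit distinguishes the two flag values.

#include <cassert>
#include <cstring>
#include <inttypes.h>

int main() {
  const float no_extension = -0.0f, extension = 0.0f;
  assert(no_extension == extension);   // equal under float comparison
  uint32_t no_bits, ext_bits;
  std::memcpy(&no_bits, &no_extension, sizeof(no_bits));
  std::memcpy(&ext_bits, &extension, sizeof(ext_bits));
  assert(no_bits != ext_bits);         // 0x80000000 vs 0x00000000
  return 0;
}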
12 changes: 6 additions & 6 deletions kenlm/lm/build_binary.cc
@@ -21,10 +21,10 @@ void Usage(const char *name) {
"memory and is still faster than SRI or IRST. Building the trie format uses an\n"
"on-disk sort to save memory.\n"
"-t is the temporary directory prefix. Default is the output file name.\n"
"-m is the amount of memory to use, in MB. Default is 1024MB (1GB).\n\n"
"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
"-m limits memory use for sorting. Measured in MB. Default is 1024MB.\n\n"
/*"sorted is like probing but uses a sorted uniform map instead of a hash table.\n"
"It uses more memory than trie and is also slower, so there's no real reason to\n"
"use it.\n\n"
"use it.\n\n"*/
"See http://kheafield.com/code/kenlm/benchmark/ for data structure benchmarks.\n"
"Passing only an input file will print memory usage of each data structure.\n"
"If the ARPA file does not have <unk>, -u sets <unk>'s probability; default 0.0.\n";
@@ -52,13 +52,13 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::size_t probing_size = ProbingModel::Size(counts, config);
// probing is always largest so use it to determine number of columns.
long int length = std::max<long int>(5, lrint(ceil(log10(probing_size))));
std::cout << "Memory usage:\ntype ";
std::cout << "Memory estimate:\ntype ";
// right align bytes.
for (long int i = 0; i < length - 5; ++i) std::cout << ' ';
std::cout << "bytes\n"
"probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n"
"sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
/* "sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
}

} // namespace ngram
4 changes: 2 additions & 2 deletions kenlm/lm/config.hh
@@ -39,7 +39,7 @@ struct Config {
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
// for sorted variant.
// If you find yourself setting this to a low number, consider using the
// Sorted version instead which has lower memory consumption.
// TrieModel which has lower memory consumption.
float probing_multiplier;

// Amount of memory to use for building. The actual memory usage will be
@@ -53,7 +53,7 @@
// defaults to input file name.
const char *temporary_directory_prefix;

// Level of complaining to do when an ARPA instead of a binary format.
// Level of complaining to do when loading from ARPA instead of binary format.
typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;
ARPALoadComplain arpa_complain;

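Worked example of the time formula in the probing comment above: a probing_multiplier of 1.5 gives 1.5 / (1.5 - 1) = 3 expected probes per lookup, while 2.0 gives 2.0 / 1.0 = 2, trading memory for speed.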
14 changes: 14 additions & 0 deletions kenlm/lm/max_order.hh
@@ -0,0 +1,14 @@
#ifndef LM_MAX_ORDER__
#define LM_MAX_ORDER__
namespace lm {
namespace ngram {
// If you need higher order, change this and recompile.
// Having this limit means that State can be
// (kMaxOrder - 1) * sizeof(float) bytes instead of
// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
const unsigned char kMaxOrder = 6;

} // namespace ngram
} // namespace lm

#endif // LM_MAX_ORDER__
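
A hypothetical illustration (invented names, not the library's actual State) of the layout the comment above describes: with a compile-time kMaxOrder, per-hypothesis state can hold its backoffs inline rather than behind a heap pointer.

const unsigned char kMaxOrder = 6;

// Inline layout enabled by the compile-time bound: fixed size, no allocation.
struct InlineState {
  float backoff[kMaxOrder - 1];  // (kMaxOrder - 1) * sizeof(float) bytes
  unsigned char valid_length;    // entries actually in use
};

// The alternative the comment rejects: a pointer plus malloc overhead.
struct PointerState {
  float *backoff;
  unsigned char valid_length;
};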