Skip to content

Commit

Permalink
Merge branch 'moses-svn'
Browse files Browse the repository at this point in the history
  • Loading branch information
obo committed Feb 3, 2011
2 parents bfe0915 + 83b9e97 commit c3fe381
Show file tree
Hide file tree
Showing 180 changed files with 5,392 additions and 2,227 deletions.
4 changes: 2 additions & 2 deletions CreateOnDisk/src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ bin_PROGRAMS = CreateOnDiskPt
CreateOnDiskPt_SOURCES = Main.cpp
AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES -DUSE_HYPO_POOL -I$(top_srcdir)/moses/src $(BOOST_CPPFLAGS)

CreateOnDiskPt_LDADD = -L$(top_srcdir)/OnDiskPt/src -L$(top_srcdir)/moses/src -lOnDiskPt -lmoses
CreateOnDiskPt_DEPENDENCIES = $(top_srcdir)/OnDiskPt/src/libOnDiskPt.a $(top_srcdir)/moses/src/libmoses.la
CreateOnDiskPt_LDADD = -L$(top_srcdir)/OnDiskPt/src -L$(top_srcdir)/moses/src -lOnDiskPt -lmoses $(BOOST_THREAD_LDFLAGS) $(BOOST_THREAD_LIBS) @KENLM_LDFLAGS@
CreateOnDiskPt_DEPENDENCIES = $(top_srcdir)/OnDiskPt/src/libOnDiskPt.a $(top_srcdir)/moses/src/libmoses.la @KENLM_DEPS@



2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ endif
if WITH_SERVER
SERVER = server
endif
SUBDIRS = moses/src moses-chart/src OnDiskPt/src kenlm moses-cmd/src misc moses-chart-cmd/src CreateOnDisk/src $(MERT) $(SERVER)
SUBDIRS = kenlm moses/src moses-chart/src OnDiskPt/src moses-cmd/src misc moses-chart-cmd/src CreateOnDisk/src $(MERT) $(SERVER)
7 changes: 7 additions & 0 deletions OnDiskPt/OnDiskPt.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,14 @@
isa = PBXProject;
buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "OnDiskPt" */;
compatibilityVersion = "Xcode 3.1";
developmentRegion = English;
hasScannedForEncodings = 1;
knownRegions = (
English,
Japanese,
French,
German,
);
mainGroup = 08FB7794FE84155DC02AAC07 /* OnDiskPt */;
projectDirPath = "";
projectRoot = "";
Expand Down
6 changes: 3 additions & 3 deletions config.h.in
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
/* config.h.in. Generated from configure.in by autoheader. */

/* define if the Boost library is available */
/* Defined if the requested minimum BOOST version is satisfied */
#undef HAVE_BOOST

/* define if the Boost::Thread library is available */
#undef HAVE_BOOST_THREAD
/* Define to 1 if you have <boost/thread.hpp> */
#undef HAVE_BOOST_THREAD_HPP

/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H
Expand Down
26 changes: 20 additions & 6 deletions configure.in
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ AC_ARG_WITH(zlib,
[with_zlib=no]
)

AC_ARG_WITH(tcmalloc,
[AC_HELP_STRING([--with-tcmalloc], [(optional) link with tcmalloc; default is no])],
[with_tcmalloc=$withval],
[with_tcmalloc=no],
)

AM_CONDITIONAL([INTERNAL_LM], false)
AM_CONDITIONAL([SRI_LM], false)
AM_CONDITIONAL([IRST_LM], false)
Expand All @@ -106,13 +112,13 @@ fi
if test "x$with_threads" = 'xyes' || test "x$enable_boost" = 'xyes'
then
AC_MSG_NOTICE([Using Boost library])
AX_BOOST_BASE([1.36.0])
BOOST_REQUIRE([1.36.0])
fi

if test "x$with_threads" = 'xyes'
then
AC_MSG_NOTICE([Building threaded moses])
AX_BOOST_THREAD
BOOST_THREADS
CPPFLAGS="$CPPFLAGS -DWITH_THREADS"
AM_CONDITIONAL([WITH_THREADS],true)
else
Expand Down Expand Up @@ -197,12 +203,16 @@ then
[AC_DEFINE([HAVE_KENLM], [], [flag for KENLM])],
[AC_MSG_ERROR([Cannot find KEN-LM in ${PWD}/kenlm])])

LIB_KENLM="-lkenlm"
LDFLAGS="$LDFLAGS -L${PWD}/kenlm"
LIBS="$LIBS $LIB_KENLM"
FMTLIBS="$FMTLIBS libkenlm.a"
KENLM_LDFLAGS="-L\$(top_srcdir)/kenlm -lkenlm -lz"
KENLM_DEPS="\$(top_srcdir)/kenlm/libkenlm.la"
FMTLIBS="$FMTLIBS libkenlm.la"
AM_CONDITIONAL([KEN_LM], true)
else
KENLM_LDFLAGS=""
KENLM_DEPS=""
fi
AC_SUBST(KENLM_LDFLAGS)
AC_SUBST(KENLM_DEPS)

if test "x$with_randlm" != 'xno'
then
Expand All @@ -222,6 +232,10 @@ then
AM_CONDITIONAL([RAND_LM], true)
fi

if test "x$with_tcmalloc" != 'xno'
then
AC_CHECK_LIB([tcmalloc], [malloc], [], [AC_MSG_ERROR([Cannot find tcmalloc])])
fi


AM_CONDITIONAL([WITH_MERT],false)
Expand Down
12 changes: 4 additions & 8 deletions kenlm/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
lib_LIBRARIES = libkenlm.a
lib_LTLIBRARIES = libkenlm.la
bin_PROGRAMS = query build_binary

AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS)
libkenlm_a_SOURCES = \
libkenlm_la_SOURCES = \
lm/lm_exception.cc \
lm/config.cc \
lm/model.cc \
Expand All @@ -13,21 +13,17 @@ libkenlm_a_SOURCES = \
lm/read_arpa.cc \
lm/virtual_interface.cc \
lm/vocab.cc \
util/string_piece.cc \
util/scoped.cc \
util/murmur_hash.cc \
util/mmap.cc \
util/file_piece.cc \
util/ersatz_progress.cc \
util/exception.cc \
util/string_piece.cc \
util/bit_packing.cc

query_SOURCES = lm/ngram_query.cc
query_DEPENDENCIES = libkenlm.a
query_LDADD = -L$(top_srcdir)/kenlm -lkenlm -lz
query_LDADD = libkenlm.la

build_binary_SOURCES = lm/build_binary.cc
build_binary_DEPENDENCIES = libkenlm.a
build_binary_LDADD = -L$(top_srcdir)/kenlm -lkenlm -lz
build_binary_LDADD = libkenlm.la

12 changes: 5 additions & 7 deletions kenlm/README
Original file line number Diff line number Diff line change
@@ -1,24 +1,22 @@
Language model inference code by Kenneth Heafield <infer at kheafield.com>
The official website is http://kheafield.com/code/mt/infer.html . If you're a decoder developer, please download the latest version from there instead of copying from another decoder.
The official website is http://kheafield.com/code/mt/infer.html . If you're a decoder developer, please download the latest version from there instead of copying from Moses.

This documentation is directed at decoder developers.

Currently, it loads an ARPA file in 2/3 the time SRI takes and uses 6.5 GB when SRI takes 11 GB. These are compared to the default SRI build (i.e. without their smaller structures). I'm working on optimizing this even further.

Binary format via mmap is supported. Run ./build_binary to make one then pass the binary file name instead.

Currently, it assumes POSIX APIs for errno, strerror_r, open, close, mmap, munmap, ftruncate, fstat, and read. This is tested on Linux and the non-UNIX Mac OS X. I welcome submissions porting (via #ifdef) to other systems (e.g. Windows) but proudly have no machine on which to test it.

A brief note to Mac OS X users: your gcc is too old to recognize the pack pragma. The warning effectively means that, on 64-bit machines, the model will use 16 bytes instead of 12 bytes per n-gram of maximum order (those of lower order are already 16 bytes) in the probing and sorted models. The trie is not impacted by this.

It does not depend on Boost or ICU. However, if you use Boost and/or ICU in the rest of your code, you should define HAVE_BOOST and/or HAVE_ICU in util/string_piece.hh. Defining HAVE_BOOST will let you hash StringPiece. Defining HAVE_ICU will use ICU's StringPiece to prevent a conflict with the one provided here. By the way, ICU's StringPiece is buggy and I reported this bug: http://bugs.icu-project.org/trac/ticket/7924 .
It does not depend on Boost or ICU. However, if you use Boost and/or ICU in the rest of your code, you should define HAVE_BOOST and/or HAVE_ICU in util/have.hh. Defining HAVE_BOOST will let you hash StringPiece. Defining HAVE_ICU will use ICU's StringPiece to prevent a conflict with the one provided here.

The recommended way to use this:
Copy the code and distribute with your decoder.
Set HAVE_ICU and HAVE_BOOST at the top of util/string_piece.hh as instructed above.
Set HAVE_ICU and HAVE_BOOST at the top of util/have.hh as instructed above.
Look at compile.sh and reimplement using your build system.
Use either the interface in lm/ngram.hh or lm/virtual_interface.hh
Interface documentation is in comments of lm/virtual_interface.hh (including for lm/ngram.hh).
Use either the interface in lm/model.hh or lm/virtual_interface.hh
Interface documentation is in comments of lm/virtual_interface.hh (including for lm/model.hh).

I recommend copying the code and distributing it with your decoder. However, please send improvements to me so that they can be integrated into the package.

Expand Down
11 changes: 7 additions & 4 deletions kenlm/kenlm.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
1EBB16EA126C158600AE6102 /* scoped.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D2126C158600AE6102 /* scoped.hh */; };
1EBB16EB126C158600AE6102 /* sorted_uniform_test.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16D3126C158600AE6102 /* sorted_uniform_test.cc */; };
1EBB16EC126C158600AE6102 /* sorted_uniform.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D4126C158600AE6102 /* sorted_uniform.hh */; };
1EBB16ED126C158600AE6102 /* string_piece.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB16D5126C158600AE6102 /* string_piece.cc */; };
1EBB16EE126C158600AE6102 /* string_piece.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB16D6126C158600AE6102 /* string_piece.hh */; };
1EBB1717126C15C500AE6102 /* facade.hh in Headers */ = {isa = PBXBuildFile; fileRef = 1EBB1708126C15C500AE6102 /* facade.hh */; };
1EBB171A126C15C500AE6102 /* ngram_query.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1EBB170B126C15C500AE6102 /* ngram_query.cc */; };
Expand Down Expand Up @@ -106,7 +105,6 @@
1EBB16D2126C158600AE6102 /* scoped.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = scoped.hh; path = util/scoped.hh; sourceTree = "<group>"; };
1EBB16D3126C158600AE6102 /* sorted_uniform_test.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = sorted_uniform_test.cc; path = util/sorted_uniform_test.cc; sourceTree = "<group>"; };
1EBB16D4126C158600AE6102 /* sorted_uniform.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = sorted_uniform.hh; path = util/sorted_uniform.hh; sourceTree = "<group>"; };
1EBB16D5126C158600AE6102 /* string_piece.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = string_piece.cc; path = util/string_piece.cc; sourceTree = "<group>"; };
1EBB16D6126C158600AE6102 /* string_piece.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = string_piece.hh; path = util/string_piece.hh; sourceTree = "<group>"; };
1EBB1708126C15C500AE6102 /* facade.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = facade.hh; path = lm/facade.hh; sourceTree = "<group>"; };
1EBB170B126C15C500AE6102 /* ngram_query.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ngram_query.cc; path = lm/ngram_query.cc; sourceTree = "<group>"; };
Expand Down Expand Up @@ -198,7 +196,6 @@
1EBB16D2126C158600AE6102 /* scoped.hh */,
1EBB16D3126C158600AE6102 /* sorted_uniform_test.cc */,
1EBB16D4126C158600AE6102 /* sorted_uniform.hh */,
1EBB16D5126C158600AE6102 /* string_piece.cc */,
1EBB16D6126C158600AE6102 /* string_piece.hh */,
1E2B85C112555DB1000770D6 /* lm_exception.cc */,
1E2B85C212555DB1000770D6 /* lm_exception.hh */,
Expand Down Expand Up @@ -287,7 +284,14 @@
isa = PBXProject;
buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "kenlm" */;
compatibilityVersion = "Xcode 3.1";
developmentRegion = English;
hasScannedForEncodings = 1;
knownRegions = (
English,
Japanese,
French,
German,
);
mainGroup = 08FB7794FE84155DC02AAC07 /* kenlm */;
projectDirPath = "";
projectRoot = "";
Expand All @@ -314,7 +318,6 @@
1EBB16E6126C158600AE6102 /* probing_hash_table_test.cc in Sources */,
1EBB16E9126C158600AE6102 /* scoped.cc in Sources */,
1EBB16EB126C158600AE6102 /* sorted_uniform_test.cc in Sources */,
1EBB16ED126C158600AE6102 /* string_piece.cc in Sources */,
1EBB171A126C15C500AE6102 /* ngram_query.cc in Sources */,
1EBB171C126C15C500AE6102 /* read_arpa.cc in Sources */,
1EBB171E126C15C500AE6102 /* sri_test.cc in Sources */,
Expand Down
74 changes: 43 additions & 31 deletions kenlm/lm/binary_format.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ namespace lm {
namespace ngram {
namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 1\n\0";
const long int kMagicVersion = 1;
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 4\n\0";
const long int kMagicVersion = 4;

// Test values.
struct Sanity {
Expand Down Expand Up @@ -76,6 +76,45 @@ void WriteHeader(void *to, const Parameters &params) {
}

} // namespace

// Allocate the memory behind the vocabulary lookup table (backing.vocab) and
// return the address at which the table itself should start.
//
// config      - config.write_mmap selects between a file-backed mapping
//               (non-null) and an anonymous mapping (null).
// order       - passed to TotalHeaderSize to size the header region.
// memory_size - number of bytes required by the vocabulary lookup table.
// backing     - receives the mapping in backing.vocab; when writing a file,
//               backing.file is handed to util::MapZeroedWrite as well.
//
// In the file-backed case the mapping is TotalHeaderSize(order) bytes larger
// than memory_size and the returned pointer skips over that leading region.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) {
  if (!config.write_mmap) {
    // No output file requested: the table gets an anonymous mapping with no header in front of it.
    backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
    return reinterpret_cast<uint8_t*>(backing.vocab.get());
  }

  // Header and vocabulary table share a single mapping of the output file.
  const std::size_t header_bytes = TotalHeaderSize(order);
  const std::size_t mapped_bytes = header_bytes + memory_size;
  backing.vocab.reset(
      util::MapZeroedWrite(config.write_mmap, mapped_bytes, backing.file),
      mapped_bytes,
      util::scoped_memory::MMAP_ALLOCATED);
  return reinterpret_cast<uint8_t*>(backing.vocab.get()) + header_bytes;
}

uint8_t *GrowForSearch(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) {
if (config.write_mmap) {
// header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params;
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;
WriteHeader(backing.vocab.get(), params);

// Grow the file to accomodate the search, using zeros.
if (-1 == ftruncate(backing.file.get(), backing.vocab.size() + memory_size))
UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (backing.vocab.size() + memory_size) << " failed");

// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
off_t page_size = sysconf(_SC_PAGE_SIZE);
off_t alignment_cruft = backing.vocab.size() % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), backing.vocab.size() - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);

return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
} else {
backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.search.get());
}
}

namespace detail {

bool IsBinaryFormat(int fd) {
Expand Down Expand Up @@ -128,7 +167,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);

util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.memory);
util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search);

if (config.enumerate_vocab && !params.fixed.has_vocabulary)
UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
Expand All @@ -137,34 +176,7 @@ uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t
if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary file to vocab words");
}
return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(params.counts.size());
}

uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing) {
if (config.probing_multiplier <= 1.0) UTIL_THROW(FormatLoadException, "probing multiplier must be > 1.0");
if (config.write_mmap) {
std::size_t total_map = TotalHeaderSize(counts.size()) + memory_size;
// Write out an mmap file.
backing.memory.reset(util::MapZeroedWrite(config.write_mmap, total_map, backing.file), total_map, util::scoped_memory::MMAP_ALLOCATED);

Parameters params;
params.counts = counts;
params.fixed.order = counts.size();
params.fixed.probing_multiplier = config.probing_multiplier;
params.fixed.model_type = model_type;
params.fixed.has_vocabulary = config.include_vocab;

WriteHeader(backing.memory.get(), params);

if (params.fixed.has_vocabulary) {
if ((off_t)-1 == lseek(backing.file.get(), total_map, SEEK_SET))
UTIL_THROW(util::ErrnoException, "Failed to seek in binary file " << config.write_mmap << " to vocab words");
}
return reinterpret_cast<uint8_t*>(backing.memory.get()) + TotalHeaderSize(counts.size());
} else {
backing.memory.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
return reinterpret_cast<uint8_t*>(backing.memory.get());
}
return reinterpret_cast<uint8_t*>(backing.search.get()) + TotalHeaderSize(params.counts.size());
}

void ComplainAboutARPA(const Config &config, ModelType model_type) {
Expand Down
29 changes: 15 additions & 14 deletions kenlm/lm/binary_format.hh
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@ struct Parameters {
struct Backing {
// File behind memory, if any.
util::scoped_fd file;
// Vocabulary lookup table. Not to be confused with the vocab words themselves.
util::scoped_memory vocab;
// Raw block of memory backing the language model data structures
util::scoped_memory memory;
util::scoped_memory search;
};

uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing);

namespace detail {

bool IsBinaryFormat(int fd);
Expand All @@ -49,8 +55,6 @@ void MatchCheck(ModelType model_type, const Parameters &params);

uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing);

uint8_t *SetupZeroed(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, std::size_t memory_size, Backing &backing);

void ComplainAboutARPA(const Config &config, ModelType model_type);

} // namespace detail
Expand All @@ -61,23 +65,20 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to)
Backing &backing = to.MutableBacking();
backing.file.reset(util::OpenReadOrThrow(file));

Parameters params;

try {
if (detail::IsBinaryFormat(backing.file.get())) {
Parameters params;
detail::ReadHeader(backing.file.get(), params);
detail::MatchCheck(To::kModelType, params);
std::size_t memory_size = To::Size(params.counts, config);
uint8_t *start = detail::SetupBinary(config, params, memory_size, backing);
to.InitializeFromBinary(start, params, config, backing.file.get());
// Replace the run-time configured probing_multiplier with the one in the file.
Config new_config(config);
new_config.probing_multiplier = params.fixed.probing_multiplier;
std::size_t memory_size = To::Size(params.counts, new_config);
uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
to.InitializeFromBinary(start, params, new_config, backing.file.get());
} else {
detail::ComplainAboutARPA(config, To::kModelType);
util::FilePiece f(backing.file.release(), file, config.messages);
ReadARPACounts(f, params.counts);
std::size_t memory_size = To::Size(params.counts, config);
uint8_t *start = detail::SetupZeroed(config, To::kModelType, params.counts, memory_size, backing);

to.InitializeFromARPA(file, f, start, params, config);
to.InitializeFromARPA(file, config);
}
} catch (util::Exception &e) {
e << " in file " << file;
Expand Down
Loading

0 comments on commit c3fe381

Please sign in to comment.