Skip to content

Commit

Permalink
kenlm:
Browse files Browse the repository at this point in the history
Fix ./configure failing to find lm/model.hh (introduced in 3849)
Remove some cruft from read_arpa
Avoid some error messages inside progress bars
FilePiece correctness (did not impact existing code)



git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3859 1f5c12ca-751b-0410-a591-d2e778427230
  • Loading branch information
heafield committed Jan 28, 2011
1 parent 678b182 commit 17adc3f
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 92 deletions.
76 changes: 12 additions & 64 deletions kenlm/lm/read_arpa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) {
return true;
}

template <class F> void GenericReadARPACounts(F &in, std::vector<uint64_t> &number) {
} // namespace

void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
number.clear();
StringPiece line;
if (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, run\nzcat " << in.FileName() << " |kenlm/build_binary /dev/stdin " << in.FileName() << ".binary\nIf this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
}
UTIL_THROW(FormatLoadException, "First line was \"" << static_cast<int>(line.data()[1]) << "\" not blank");
}
Expand All @@ -49,66 +51,14 @@ template <class F> void GenericReadARPACounts(F &in, std::vector<uint64_t> &numb
}
}

template <class F> void GenericReadNGramHeader(F &in, unsigned int length) {
StringPiece line;
void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
StringPiece line;
while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
std::stringstream expected;
expected << '\\' << length << "-grams:";
if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead");
}

template <class F> void GenericReadEnd(F &in) {
StringPiece line;
do {
line = in.ReadLine();
} while (IsEntirelyWhiteSpace(line));
if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);
}

// Adapter that lets the Generic* ARPA readers run on top of a std::istream by
// offering ReadLine(), ReadFloat() and FileName() members.
class FakeFilePiece {
  public:
    // Keeps a reference to the stream (the caller must keep it alive) and
    // switches it to throwing mode for eof, bad and fail conditions.
    // NOTE(review): in throwing mode the stream raises std::ios_base::failure,
    // while ReadLine() below only lists util::EndOfFileException in its
    // exception specification -- confirm this mismatch is acceptable.
    explicit FakeFilePiece(std::istream &in) : in_(in) {
      in_.exceptions(std::ios::eofbit | std::ios::badbit | std::ios::failbit);
    }

    // Reads the next line into the internal buffer and returns a StringPiece
    // built from that buffer; the buffer is overwritten by the next call.
    StringPiece ReadLine() throw(util::EndOfFileException) {
      std::getline(in_, line_buffer_);
      return StringPiece(line_buffer_);
    }

    // Extracts one float using the stream's formatted input operator.
    float ReadFloat() {
      float value;
      in_ >> value;
      return value;
    }

    // Only used when building error messages; the real file name is not known
    // here, so a placeholder is returned.
    const char *FileName() const {
      return "$file";
    }

  private:
    std::istream &in_;
    // Backing storage for the StringPiece handed out by ReadLine().
    std::string line_buffer_;
};

} // namespace

// util::FilePiece overload: forwards straight to the shared template
// implementation GenericReadARPACounts.
void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
GenericReadARPACounts(in, number);
}
// std::istream overload: wraps the stream in a FakeFilePiece so the same
// template implementation (GenericReadARPACounts) can be reused.
void ReadARPACounts(std::istream &in, std::vector<uint64_t> &number) {
FakeFilePiece fake(in);
GenericReadARPACounts(fake, number);
}
// util::FilePiece overload: forwards straight to the shared template
// implementation GenericReadNGramHeader.
void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
GenericReadNGramHeader(in, length);
}
// std::istream overload: wraps the stream in a FakeFilePiece so the same
// template implementation (GenericReadNGramHeader) can be reused.
void ReadNGramHeader(std::istream &in, unsigned int length) {
FakeFilePiece fake(in);
GenericReadNGramHeader(fake, length);
}

void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
switch (in.get()) {
case '\t':
Expand Down Expand Up @@ -146,20 +96,18 @@ void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
}

void ReadEnd(util::FilePiece &in) {
GenericReadEnd(in);
StringPiece line;
do {
line = in.ReadLine();
} while (IsEntirelyWhiteSpace(line));
if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);

try {
while (true) {
line = in.ReadLine();
if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line);
}
} catch (const util::EndOfFileException &e) {
return;
}
}
void ReadEnd(std::istream &in) {
FakeFilePiece fake(in);
GenericReadEnd(fake);
} catch (const util::EndOfFileException &e) {}
}

} // namespace lm
3 changes: 0 additions & 3 deletions kenlm/lm/read_arpa.hh
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,12 @@
namespace lm {

void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
void ReadARPACounts(std::istream &in, std::vector<uint64_t> &number);
void ReadNGramHeader(util::FilePiece &in, unsigned int length);
void ReadNGramHeader(std::istream &in, unsigned int length);

void ReadBackoff(util::FilePiece &in, Prob &weights);
void ReadBackoff(util::FilePiece &in, ProbBackoff &weights);

void ReadEnd(util::FilePiece &in);
void ReadEnd(std::istream &in);

extern const bool kARPASpaces[256];

Expand Down
1 change: 1 addition & 0 deletions kenlm/lm/search_hashed.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ template <class MiddleT, class LongestT> template <class Voc> void TemplateHashe
} catch (util::ProbingSizeException &e) {
UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n");
}
ReadEnd(f);
}

template void TemplateHashedSearch<ProbingHashedSearch::Middle, ProbingHashedSearch::Longest>::InitializeFromARPA(const char *, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &, ProbingVocabulary &vocab, Backing &backing);
Expand Down
2 changes: 1 addition & 1 deletion kenlm/lm/search_trie.cc
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const st
}
// Sort full records by full n-gram.
EntryProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
// Tried __gnu_parallel::sort here but it took too much memory.
// parallel_sort uses too much RAM
std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), CompareRecords<EntryProxy>(order));
files.push_back(DiskFlush(begin, out_end, file_prefix, batch, order, weights_size));
WriteContextFile(begin, out_end, files.back(), entry_size, order);
Expand Down
33 changes: 24 additions & 9 deletions kenlm/util/file_piece.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,22 +79,22 @@ FilePiece::~FilePiece() {
}

StringPiece FilePiece::ReadLine(char delim) throw (GZException, EndOfFileException) {
const char *start = position_;
do {
for (const char *i = start; i < position_end_; ++i) {
size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (*i == delim) {
StringPiece ret(position_, i - position_);
position_ = i + 1;
return ret;
}
}
size_t skip = position_end_ - position_;
if (at_end_) {
if (position_ == position_end_) Shift();
return Consume(position_end_);
}
skip = position_end_ - position_;
Shift();
start = position_ + skip;
} while (!at_end_);
StringPiece ret(position_, position_end_ - position_);
position_ = position_end_;
return ret;
}
}

float FilePiece::ReadFloat() throw(GZException, EndOfFileException, ParseNumberException) {
Expand Down Expand Up @@ -186,6 +186,21 @@ template <class T> T FilePiece::ReadNumber() throw(GZException, EndOfFileExcepti
return ret;
}

// Scans forward from position_ for the first byte whose entry in the delim
// table (indexed by the byte's unsigned char value) is true.
// Returns a pointer to that byte, or position_end_ when at_end_ is set and no
// such byte was found.
const char *FilePiece::FindDelimiterOrEOF(const bool *delim) throw (GZException, EndOfFileException) {
// Count of bytes, measured from position_, that earlier passes already scanned.
size_t skip = 0;
while (true) {
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
}
if (at_end_) {
// Nothing at all is left between position_ and position_end_, so Shift() is
// called with at_end_ set.
// NOTE(review): presumably this is what raises EndOfFileException on
// exhausted input -- confirm against Shift().
if (position_ == position_end_) Shift();
return position_end_;
}
// Record progress as an offset from position_ rather than as a pointer, then
// call Shift() and resume scanning at position_ + skip.
// NOTE(review): presumably Shift() can change position_/position_end_ --
// confirm.
skip = position_end_ - position_;
Shift();
}
}

void FilePiece::Shift() throw(GZException, EndOfFileException) {
if (at_end_) {
progress_.Finished();
Expand Down
14 changes: 1 addition & 13 deletions kenlm/util/file_piece.hh
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,7 @@ class FilePiece {
return ret;
}

// Scans forward from position_ for the first byte whose entry in the delim
// table (indexed by the byte's unsigned char value, kSpaces by default) is
// true. Returns a pointer to that byte, or position_end_ if the loop below
// ends (at_end_ set) without finding one.
const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException) {
// First pass: whatever currently lies between position_ and position_end_.
for (const char *i = position_; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
}
// Not found yet: keep calling Shift() until at_end_ is set, each time
// scanning only from the offset that was already examined.
while (!at_end_) {
// Bytes already scanned, kept as an offset from position_ across Shift().
size_t skip = position_end_ - position_;
Shift();
for (const char *i = position_ + skip; i < position_end_; ++i) {
if (delim[static_cast<unsigned char>(*i)]) return i;
}
}
return position_end_;
}
const char *FindDelimiterOrEOF(const bool *delim = kSpaces) throw (GZException, EndOfFileException);

void Shift() throw (EndOfFileException, GZException);
// Backends to Shift().
Expand Down
2 changes: 0 additions & 2 deletions kenlm/util/have.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
#ifndef UTIL_HAVE__
#define UTIL_HAVE__

#include "../config.h"

#define HAVE_ZLIB

#endif // UTIL_HAVE__

0 comments on commit 17adc3f

Please sign in to comment.