-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Fix case where "foo bar baz" appears but "bar baz" does not. Previously probing silently returned the wrong answer and trie silently broke. - More aggressive recombination: if "baz quux" is never followed by any word, then do not include "bar" in the state. - kenlm assumes that "foo bar" is present if "foo bar baz" is. This is now checked. - Binary format version number bump because the format has changed to support the above. - Lower memory consumption during trie building, but building will take longer in order to ensure correct handling of blanks and aggressive recombination. - Fix progress bar newlines on trie building. Agrees with SRI's 1-best outputs on the WMT 10 evaluation set. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3847 1f5c12ca-751b-0410-a591-d2e778427230
- Loading branch information
heafield
committed
Jan 25, 2011
1 parent
46bc5bc
commit 8c13881
Showing
35 changed files
with
1,158 additions
and
446 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#ifndef LM_BLANK__ | ||
#define LM_BLANK__ | ||
|
||
#include <limits>

#include <inttypes.h>
#include <math.h>
#include <string.h>
|
||
namespace lm {
namespace ngram {

/* Suppose "foo bar" appears with zero backoff but there is no trigram
 * beginning with these words.  Then, when scoring "foo bar", the model could
 * return out_state containing "bar" or even null context if "bar" also has no
 * backoff and is never followed by another word.  Then the backoff is set to
 * kNoExtensionBackoff.  If the n-gram might be extended, then out_state must
 * contain the full n-gram, in which case kExtensionBackoff is set.  In any
 * case, if an n-gram has non-zero backoff, the full state is returned so
 * backoff can be properly charged.
 * These differ only in sign bit because the backoff is in fact zero in either
 * case.
 */
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;

/* Mark a backoff as belonging to an n-gram that may be extended.
 * The == comparison treats -0.0 and +0.0 as equal, so any zero backoff is
 * rewritten to kExtensionBackoff (positive zero).  Non-zero backoffs (and
 * NaN) are left untouched.
 */
inline void SetExtension(float &backoff) {
  if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}

/* Returns false only when backoff is bit-for-bit identical to
 * kNoExtensionBackoff (negative zero); returns true for every other value,
 * including positive zero.  The comparison has to be done on the raw bits
 * because -0.0 == 0.0 under floating-point comparison.
 * The bits are extracted with memcpy rather than by reading an inactive union
 * member, which is undefined behavior in C++.  Compilers turn these
 * fixed-size copies into plain register moves.
 */
inline bool HasExtension(const float &backoff) {
  // Local copy so the address taken below is of a local object.
  const float no_extension = kNoExtensionBackoff;
  uint32_t no_extension_bits, backoff_bits;
  // Like the union it replaces, this assumes float is 32 bits wide.
  memcpy(&no_extension_bits, &no_extension, sizeof(no_extension_bits));
  memcpy(&backoff_bits, &backoff, sizeof(backoff_bits));
  return no_extension_bits != backoff_bits;
}

/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
 * "baz quux" (because they were pruned).  1.2% of n-grams generated by SRI
 * with default settings on the benchmark data set are like this.  Since search
 * proceeds by finding "quux", "baz quux", "bar baz quux", and finally
 * "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
 * inserted.  The blanks have probability kBlankProb and backoff kBlankBackoff.
 * A blank is recognized by kBlankProb in the probability field; kBlankBackoff
 * must be 0 so that inference assesses zero backoff from these blanks.
 */
const float kBlankProb = -std::numeric_limits<float>::infinity();
const float kBlankBackoff = kNoExtensionBackoff;

} // namespace ngram
} // namespace lm
#endif // LM_BLANK__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#ifndef LM_MAX_ORDER__ | ||
#define LM_MAX_ORDER__ | ||
namespace lm {
namespace ngram {
// Maximum n-gram order supported at compile time.
// If you need higher order, change this and recompile.
// Having this limit means that State can be
//   (kMaxOrder - 1) * sizeof(float) bytes
// instead of
//   sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
const unsigned char kMaxOrder = 6;

} // namespace ngram
} // namespace lm
|
||
#endif // LM_MAX_ORDER__ |
Oops, something went wrong.