diff --git a/kenlm/lm/search_trie.cc b/kenlm/lm/search_trie.cc index 77f0d101..780f37f8 100644 --- a/kenlm/lm/search_trie.cc +++ b/kenlm/lm/search_trie.cc @@ -544,10 +544,10 @@ void ARPAToSortedFiles(util::FilePiece &f, const std::vector &counts, // Only use as much buffer as we need. size_t buffer_use = 0; for (unsigned int order = 2; order < counts.size(); ++order) { - buffer_use = std::max(buffer_use, (sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]); + buffer_use = std::max(buffer_use, (size_t)((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1])); } - buffer_use = std::max(buffer_use, (sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()); - buffer = std::min(buffer, buffer_use); + buffer_use = std::max(buffer_use, (size_t)((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back())); + buffer = std::min((size_t)buffer, buffer_use); util::scoped_memory mem; mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED); diff --git a/moses/src/PCNTools.h b/moses/src/PCNTools.h index ad572609..a9cf9831 100644 --- a/moses/src/PCNTools.h +++ b/moses/src/PCNTools.h @@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include /** A couple of utilities to read .pcn files. A python-compatible format - * for encoding confusion networks. + * for encoding confusion networks and word lattices. */ namespace PCN { @@ -36,8 +36,8 @@ namespace PCN { typedef std::vector CNCol; typedef std::vector CN; - /** Given a string ((('foo',0.1),('bar',0.9)),...) representation of a - * confusion net in PCN format, return a CN object + /** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) 
representation of a + * word lattice in PCN format, return a CN object representing the lattice */ CN parsePCN(const std::string& in); diff --git a/moses/src/WordLattice.cpp b/moses/src/WordLattice.cpp index ca5e8a16..93e53ab7 100644 --- a/moses/src/WordLattice.cpp +++ b/moses/src/WordLattice.cpp @@ -30,40 +30,33 @@ void WordLattice::Print(std::ostream& out) const { out<<"\n\n"; } -int WordLattice::Read(std::istream& in,const std::vector& factorOrder) +int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector& factorOrder, const std::string& debug_line) { - Clear(); - std::string line; - if(!getline(in,line)) return 0; - std::map meta=ProcessAndStripSGML(line); - if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); } size_t numLinkParams = StaticData::Instance().GetNumLinkParams(); size_t numLinkWeights = StaticData::Instance().GetNumInputScores(); size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength(); //when we have one more weight than params, we add a word count feature bool addRealWordCount = ((numLinkParams + 1) == numLinkWeights); - - PCN::CN cn = PCN::parsePCN(line); data.resize(cn.size()); next_nodes.resize(cn.size()); for(size_t i=0;i::iterator probsIterator; + std::vector::const_iterator probsIterator; data[i][j].second = std::vector(0); for(probsIterator = alt.first.second.begin(); probsIterator < alt.first.second.end(); probsIterator++) { IFVERBOSE(1) { @@ -114,6 +107,18 @@ int WordLattice::Read(std::istream& in,const std::vector& factorOrde return !cn.empty(); } +int WordLattice::Read(std::istream& in,const std::vector& factorOrder) +{ + Clear(); + std::string line; + if(!getline(in,line)) return 0; + std::map meta=ProcessAndStripSGML(line); + if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); } + + PCN::CN cn = PCN::parsePCN(line); + return InitializeFromPCNDataType(cn, factorOrder, line); +} + void WordLattice::GetAsEdgeMatrix(std::vector >& 
edges) const { edges.resize(data.size()+1,std::vector(data.size()+1, false)); diff --git a/moses/src/WordLattice.h b/moses/src/WordLattice.h index 0dca92dc..88be7bb4 100644 --- a/moses/src/WordLattice.h +++ b/moses/src/WordLattice.h @@ -3,6 +3,7 @@ #include #include "ConfusionNet.h" +#include "PCNTools.h" namespace Moses { @@ -23,6 +24,12 @@ class WordLattice: public ConfusionNet { // is it possible to get from the edge of the previous word range to the current word range virtual bool CanIGetFromAToB(size_t start, size_t end) const; + /** Given a lattice represented using the PCN::CN data type (topologically sorted adjacency list + * representation), initialize the WordLattice object + */ + int InitializeFromPCNDataType(const PCN::CN& cn, const std::vector& factorOrder, const std::string& debug_line = ""); + /** Read from PLF format (1 lattice per line) + */ int Read(std::istream& in,const std::vector& factorOrder); /** Convert internal representation into an edge matrix