Skip to content

Commit

Permalink
facilitate programmatic creation of word lattices
Browse files Browse the repository at this point in the history
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3848 1f5c12ca-751b-0410-a591-d2e778427230
  • Loading branch information
redpony committed Jan 25, 2011
1 parent 8c13881 commit 93caa3d
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 18 deletions.
6 changes: 3 additions & 3 deletions kenlm/lm/search_trie.cc
Original file line number Diff line number Diff line change
Expand Up @@ -544,10 +544,10 @@ void ARPAToSortedFiles(util::FilePiece &f, const std::vector<uint64_t> &counts,
// Only use as much buffer as we need.
size_t buffer_use = 0;
for (unsigned int order = 2; order < counts.size(); ++order) {
buffer_use = std::max(buffer_use, (sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]);
buffer_use = std::max(buffer_use, (size_t)((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]));
}
buffer_use = std::max(buffer_use, (sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back());
buffer = std::min(buffer, buffer_use);
buffer_use = std::max(buffer_use, (size_t)((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()));
buffer = std::min((size_t)buffer, buffer_use);

util::scoped_memory mem;
mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED);
Expand Down
6 changes: 3 additions & 3 deletions moses/src/PCNTools.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <cstdlib>

/** A couple of utilities to read .pcn files. A python-compatible format
* for encoding confusion networks.
* for encoding confusion networks and word lattices.
*/
namespace PCN {

typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;

/** Given a string ((('foo',0.1),('bar',0.9)),...) representation of a
* confusion net in PCN format, return a CN object
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
* word lattice in PCN format, return a CN object representing the lattice
*/
CN parsePCN(const std::string& in);

Expand Down
29 changes: 17 additions & 12 deletions moses/src/WordLattice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,40 +30,33 @@ void WordLattice::Print(std::ostream& out) const {
out<<"\n\n";
}

int WordLattice::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
int WordLattice::InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<FactorType>& factorOrder, const std::string& debug_line)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
std::map<std::string, std::string> meta=ProcessAndStripSGML(line);
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }
size_t numLinkParams = StaticData::Instance().GetNumLinkParams();
size_t numLinkWeights = StaticData::Instance().GetNumInputScores();
size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();

//when we have one more weight than params, we add a word count feature
bool addRealWordCount = ((numLinkParams + 1) == numLinkWeights);

PCN::CN cn = PCN::parsePCN(line);
data.resize(cn.size());
next_nodes.resize(cn.size());
for(size_t i=0;i<cn.size();++i) {
PCN::CNCol& col = cn[i];
const PCN::CNCol& col = cn[i];
if (col.empty()) return false;
data[i].resize(col.size());
next_nodes[i].resize(col.size());
for (size_t j=0;j<col.size();++j) {
PCN::CNAlt& alt = col[j];
const PCN::CNAlt& alt = col[j];


//check for correct number of link parameters
if (alt.first.second.size() != numLinkParams) {
TRACE_ERR("ERROR: need " << numLinkParams << " link parameters, found " << alt.first.second.size() << " while reading column " << i << " from " << line << "\n");
TRACE_ERR("ERROR: need " << numLinkParams << " link parameters, found " << alt.first.second.size() << " while reading column " << i << " from " << debug_line << "\n");
return false;
}

//check each element for bounds
std::vector<float>::iterator probsIterator;
std::vector<float>::const_iterator probsIterator;
data[i][j].second = std::vector<float>(0);
for(probsIterator = alt.first.second.begin(); probsIterator < alt.first.second.end(); probsIterator++) {
IFVERBOSE(1) {
Expand Down Expand Up @@ -114,6 +107,18 @@ int WordLattice::Read(std::istream& in,const std::vector<FactorType>& factorOrde
return !cn.empty();
}

int WordLattice::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
Clear();
std::string line;
if(!getline(in,line)) return 0;
std::map<std::string, std::string> meta=ProcessAndStripSGML(line);
if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); }

PCN::CN cn = PCN::parsePCN(line);
return InitializeFromPCNDataType(cn, factorOrder, line);
}

void WordLattice::GetAsEdgeMatrix(std::vector<std::vector<bool> >& edges) const
{
edges.resize(data.size()+1,std::vector<bool>(data.size()+1, false));
Expand Down
7 changes: 7 additions & 0 deletions moses/src/WordLattice.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <vector>
#include "ConfusionNet.h"
#include "PCNTools.h"

namespace Moses
{
Expand All @@ -23,6 +24,12 @@ class WordLattice: public ConfusionNet {
// is it possible to get from the edge of the previous word range to the current word range
virtual bool CanIGetFromAToB(size_t start, size_t end) const;

/** Given a lattice represented using the PCN::CN data type (topologically sorted adjacency list
* representation), initialize the WordLattice object
*/
int InitializeFromPCNDataType(const PCN::CN& cn, const std::vector<FactorType>& factorOrder, const std::string& debug_line = "");
/** Read from PLF format (1 lattice per line)
*/
int Read(std::istream& in,const std::vector<FactorType>& factorOrder);

/** Convert internal representation into an edge matrix
Expand Down

0 comments on commit 93caa3d

Please sign in to comment.