Skip to content

Commit

Permalink
When scoring phrase pairs, store copies of the active pairs' PHRASE objects
Browse files Browse the repository at this point in the history

instead of inserting them into a PhraseTable.  In a test on a 21GB
target-syntax extract file, this reduced user time from 195 to 120 mins.


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3777 1f5c12ca-751b-0410-a591-d2e778427230
  • Loading branch information
pjwilliams committed Dec 14, 2010
1 parent 15b2c88 commit f6d7379
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 54 deletions.
23 changes: 5 additions & 18 deletions scripts/training/phrase-extract/PhraseAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,17 @@ using namespace std;
extern Vocabulary vcbT;
extern Vocabulary vcbS;

extern PhraseTable phraseTableT;
extern PhraseTable phraseTableS;
extern bool hierarchicalFlag;

// Default constructor: marks both phrase IDs as "not yet assigned".
// 999999 is the sentinel that create() asserts on before storing real IDs
// and that clear() restores.
PhraseAlignment::PhraseAlignment()
:sourcePhraseId(999999)
,targetPhraseId(999999)
{}

// read in a phrase pair and store it
void PhraseAlignment::create( char line[], int lineID )
{
assert(phraseS.empty());
assert(phraseT.empty());

//cerr << "processing " << line;
vector< string > token = tokenize( line );
int item = 1;
PHRASE phraseS, phraseT;
for (int j=0; j<token.size(); j++)
{
if (token[j] == "|||") item++;
Expand Down Expand Up @@ -74,11 +69,6 @@ void PhraseAlignment::create( char line[], int lineID )

createAlignVec(phraseS.size(), phraseT.size());

assert(sourcePhraseId == 999999);
assert(targetPhraseId == 999999);
sourcePhraseId = phraseTableS.storeIfNew( phraseS );
targetPhraseId = phraseTableT.storeIfNew( phraseT );

if (item == 3)
{
count = 1.0;
Expand Down Expand Up @@ -107,9 +97,8 @@ void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)

// Reset this phrase pair so the object can be filled again by create(),
// which asserts that the phrases are empty / the IDs hold the sentinel.
// NOTE(review): this listing shows both the ID resets and the PHRASE clears;
// they look like the before/after sides of the same change -- confirm which
// set applies to the revision you are reading.
void PhraseAlignment::clear()
{
// put the phrase IDs back to the "unassigned" sentinel value
sourcePhraseId = 999999;
targetPhraseId = 999999;

// drop the stored source/target phrases and both alignment vectors
phraseS.clear();
phraseT.clear();
alignedToT.clear();
alignedToS.clear();
}
Expand All @@ -134,8 +123,6 @@ bool PhraseAlignment::match( const PhraseAlignment& other )
if (other.GetSource() != GetSource()) return false;
if (!hierarchicalFlag) return true;

PHRASE phraseT = phraseTableT.getPhrase( GetTarget() );

assert(phraseT.size() == alignedToT.size() + 1);
assert(alignedToT.size() == other.alignedToT.size());

Expand Down
14 changes: 6 additions & 8 deletions scripts/training/phrase-extract/PhraseAlignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,31 +7,29 @@
* Copyright 2010 __MyCompanyName__. All rights reserved.
*
*/
#include "tables-core.h"

#include <vector>
#include <set>

// data structure for a single phrase pair
// Holds one extracted phrase pair: its source and target phrase, its count,
// and the word alignment between the two sides.
// NOTE(review): both the int-ID members/accessors and the PHRASE
// members/accessors appear below; they look like the old and new variants of
// the same interface and cannot coexist (the accessors differ only in return
// type) -- confirm which variant the target revision keeps.
class PhraseAlignment
{
protected:
// phrase IDs as returned by PhraseTable::storeIfNew() in create()
int targetPhraseId, sourcePhraseId;
// source and target phrase stored directly in the object
PHRASE phraseS;
PHRASE phraseT;

// called from create() with the source and target phrase lengths
void createAlignVec(size_t sourceSize, size_t targetSize);
public:
// count of this phrase pair; counts of equal pairs are summed by the caller
float count;
// alignment sets, one std::set per position
// (presumably alignedToT is indexed by target position and alignedToS by
// source position -- TODO confirm in createAlignVec)
std::vector< std::set<size_t> > alignedToT;
std::vector< std::set<size_t> > alignedToS;

PhraseAlignment();

// parse one extract-file line (line text, line ID) into this object
void create( char*, int );
// reset the object so create() can be called on it again
void clear();
bool equals( const PhraseAlignment& );
bool match( const PhraseAlignment& );

// accessors returning the numeric phrase IDs
int GetTarget() const
{ return targetPhraseId; }
int GetSource() const
{ return sourcePhraseId; }

// accessors returning const references to the stored phrases (no copy)
const PHRASE &GetSource() const { return phraseS; }
const PHRASE &GetTarget() const { return phraseT; }
};
36 changes: 8 additions & 28 deletions scripts/training/phrase-extract/score.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,11 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines );
void processPhrasePairs( vector< PhraseAlignment > & );
PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > & );
void outputPhrasePair( vector< PhraseAlignment * > &, float );
double computeLexicalTranslation( PHRASE &, PHRASE &, PhraseAlignment * );
double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );

ofstream phraseTableFile;

LexicalTable lexTable;
PhraseTable phraseTableT;
PhraseTable phraseTableS;
bool inverseFlag = false;
bool hierarchicalFlag = false;
bool wordAlignmentFlag = false;
Expand Down Expand Up @@ -166,7 +164,6 @@ int main(int argc, char* argv[])
}

// loop through all extracted phrase translations
int lastSource = -1;
float lastCount = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
Expand Down Expand Up @@ -196,27 +193,20 @@ int main(int argc, char* argv[])
if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ))
{
lastPhrasePair->count += phrasePair.count;
phrasePair.clear();
continue;
}

// if new source phrase, process last batch
if (lastSource >= 0 && lastSource != phrasePair.GetSource()) {
if (lastPhrasePair != NULL &&
lastPhrasePair->GetSource() != phrasePair.GetSource()) {
processPhrasePairs( phrasePairsWithSameF );
for(int j=0;j<phrasePairsWithSameF.size();j++)
phrasePairsWithSameF[j].clear();
phrasePairsWithSameF.clear();
phraseTableT.clear();
phraseTableS.clear();
// process line again, since phrase tables flushed
phrasePair.clear();
phrasePair.create( line, i );
lastPhrasePair = NULL;
}

// add phrase pairs to list, it's now the last one
lastSource = phrasePair.GetSource();
phrasePairsWithSameF.push_back( phrasePair );
lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1];
lastPhrasePair = &phrasePairsWithSameF.back();
}
processPhrasePairs( phrasePairsWithSameF );
phraseTableFile.close();
Expand Down Expand Up @@ -275,16 +265,6 @@ void computeCountOfCounts( char* fileNameExtract, int maxLines )
continue;
}

// periodically house cleaning
if (phrasePair->GetSource() != lastPhrasePair->GetSource())
{
phraseTableT.clear(); // these would get too big
phraseTableS.clear(); // these would get too big
// process line again, since phrase tables flushed
phrasePair->clear();
phrasePair->create( line, lineNum );
}

int count = lastPhrasePair->count + 0.99999;
if(count <= GT_MAX)
countOfCounts[ count ]++;
Expand Down Expand Up @@ -382,8 +362,8 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount
count += phrasePair[i]->count;
}

PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->GetSource() );
PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->GetTarget() );
const PHRASE &phraseS = phrasePair[0]->GetSource();
const PHRASE &phraseT = phrasePair[0]->GetTarget();

// labels (if hierarchical)

Expand Down Expand Up @@ -470,7 +450,7 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount
phraseTableFile << endl;
}

double computeLexicalTranslation( PHRASE &phraseS, PHRASE &phraseT, PhraseAlignment *alignment ) {
double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) {
// lexical translation probability
double lexScore = 1.0;
int null = vcbS.getWordID("NULL");
Expand Down

0 comments on commit f6d7379

Please sign in to comment.