-
Notifications
You must be signed in to change notification settings - Fork 0
/
ChunkMT.php
138 lines (115 loc) · 5.13 KB
/
ChunkMT.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
<?php
// This is the full Syntactic Multi-System Hybrid Translator workflow all in one file
// It requires three parameters - the language model, input sentences, grammar file
// It is run with the following command:
// php ChunkMT.php <language model> <input sentences> <grammar>
// For example:
// php ChunkMT.php languageModel.binary inputSentences.txt eng_sm6.gr
//Configuration
// `require` rather than `include`: later steps read settings that are not
// defined in this file ($sourceLanguage, $targetLanguage, $writeStats --
// presumably set by config.php), so running without it only fails later.
require 'config.php';

// Validate the command line: all three positional arguments must be present
// and non-empty. On misuse, report on STDERR and exit with a non-zero status
// so that calling scripts can detect the failure.
if (!isset($argv[1], $argv[2], $argv[3]) || $argv[1] === "" || $argv[2] === "" || $argv[3] === "") {
    fwrite(STDERR, "Please provide file names for the language model, input sentences and grammar!\n");
    fwrite(STDERR, "Usage: php ChunkMT.php <language model> <input sentences> <grammar>\n");
    exit(1);
}

//Input parameters
$languageModel = $argv[1]; // language model file handed to ./queryKenLM.sh
$inputFile     = $argv[2]; // text file with the sentences to translate
$grammar       = $argv[3]; // grammar file handed to the Berkeley parser (-gr)
// Counters for the statistics report: the number of chunks processed overall
// and how many of the chosen hybrid chunks came from each translation system.
$totalChunks = $googleChunks = $bingChunks = $letsmtChunks = $yandexChunks = 0;
//Parse the input sentences
// Runs the Berkeley parser with the given grammar, reading the input file on
// stdin and writing the parse to "<input file>.parsed".
// Every path is passed through escapeshellarg() because it comes straight
// from the command line: unescaped, a path containing spaces or shell
// metacharacters would break the command or be executed by the shell.
shell_exec(
    'java -Xmx1024m -jar included/BerkeleyParser.jar'
    .' -gr '.escapeshellarg($grammar)
    .' < '.escapeshellarg($inputFile)
    .' > '.escapeshellarg($inputFile.'.parsed')
);
// Turn the parser output into the chunked text that gets translated.
// chunkAfile() is not defined in this file; presumably chunk.php provides it.
include 'chunk.php';
chunkAfile($inputFile.'.parsed', $inputFile.'.chunked');

// Reader for the chunked input sentences.
$inCh = fopen($inputFile.".chunked", "r");
if (!$inCh) {
    die("Can't open input file!");
}

// One writer per translation system plus one for the hybrid result.
// All of them are opened in append mode, so existing content is kept.
$outg = fopen("output.google.txt", "a"); // Google output sentences
if (!$outg) {
    die("Can't create output file!");
}
$outb = fopen("output.bing.txt", "a"); // Bing output sentences
if (!$outb) {
    die("Can't create output file!");
}
$outl = fopen("output.letsmt.txt", "a"); // LetsMT output sentences
if (!$outl) {
    die("Can't create output file!");
}
$outy = fopen("output.yandex.txt", "a"); // Yandex output sentences
if (!$outy) {
    die("Can't create output file!");
}
$outh = fopen("output.hybrid.txt", "a"); // Hybrid output sentences
if (!$outh) {
    die("Can't create output file!");
}

// Translation API wrappers used by the main loop.
include 'API/googleTranslate.php';
include 'API/bingTranslator.php';
include 'API/LetsMT.php';
include 'API/yandexTranslator.php';
//Process input file by line
// For every chunk (one per line of the chunked input) this loop:
//   1. translates the chunk with Google, Bing, LetsMT and Yandex,
//   2. appends each translation to that system's own output file,
//   3. asks ./queryKenLM.sh for the perplexity of each translation,
//   4. appends the translation with the lowest perplexity to the hybrid output.
// An empty input line is written to all outputs as a line break.
// The translateWith*() functions, $sourceLanguage, $targetLanguage and
// $writeStats are not defined in this file; presumably they come from the
// API/*.php includes and config.php.
if ($inCh) {
    // Quotes a string as one literal argument for a POSIX shell (the scorer is
    // a .sh script, so a POSIX shell is assumed). Done by hand instead of with
    // escapeshellarg() because that function can drop non-ASCII bytes depending
    // on the process locale, which would corrupt translated text before it is
    // scored. NUL bytes cannot be passed on a command line and are removed.
    $quoteForShell = function ($text) {
        $text = str_replace("\0", '', (string) $text);
        return "'" . str_replace("'", "'\\''", $text) . "'";
    };

    // Returns the perplexity reported by ./queryKenLM.sh as a float, or INF
    // when the command fails or prints no number, so that a failed query can
    // never be mistaken for the best score.
    // NOTE(review): assumes the script prints the perplexity as the first
    // number in its output -- confirm against queryKenLM.sh.
    $queryPerplexity = function ($text) use ($languageModel, $quoteForShell) {
        $raw = shell_exec('./queryKenLM.sh ' . $quoteForShell($languageModel) . ' ' . $quoteForShell($text));
        if (is_string($raw) && preg_match('/[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?/', $raw, $match)) {
            return (float) $match[0];
        }
        return INF;
    };

    while (($sourceSentence = fgets($inCh)) !== false) {
        // Strip the line ending first so that "\n" and "\r\n" are handled alike.
        $sourceSentence = str_replace(array("\r", "\n"), '', $sourceSentence);

        if ($sourceSentence === '') {
            // Empty line: emit a line break in every output file.
            foreach (array($outg, $outb, $outl, $outy, $outh) as $out) {
                fwrite($out, "\n");
            }
            continue;
        }

        //Translate with the APIs (index 0 = Google, 1 = Bing, 2 = LetsMT, 3 = Yandex)
        $translations = array(
            translateWithGoogle($sourceLanguage, $targetLanguage, $sourceSentence),
            translateWithBing($sourceLanguage, $targetLanguage, $sourceSentence),
            translateWithLetsMT($sourceSentence),
            translateWithYandex($sourceLanguage, $targetLanguage, $sourceSentence),
        );
        $systemOutputs = array($outg, $outb, $outl, $outy);

        $candidates = array();   // normalised single-line chunk per system
        $perplexities = array(); // float perplexity per system, INF = unusable
        foreach ($translations as $i => $translation) {
            // Chunks are joined with a single space in the output files.
            // The cast keeps trim() happy when an API returned null.
            $chunk = trim((string) $translation) . " ";
            fwrite($systemOutputs[$i], $chunk);

            // The hybrid output must stay on one line per sentence.
            $candidates[$i] = trim(str_replace(array("\r", "\n"), '', $chunk)) . " ";

            //Get the perplexities of the translations
            // An empty translation (e.g. a failed API call) is never scored,
            // so it cannot be preferred over a real translation.
            $perplexities[$i] = (trim($chunk) === '') ? INF : $queryPerplexity($chunk);
        }

        //Write the chunk with the smallest perplexity to the hybrid output
        // Numeric comparison; on a tie the earliest system in the list wins.
        $best = 0;
        foreach ($perplexities as $i => $perplexity) {
            if ($perplexity < $perplexities[$best]) {
                $best = $i;
            }
        }
        $outputString = $candidates[$best];
        fwrite($outh, $outputString);

        //Count chunks
        // Attribution is by text: when several systems produced the same
        // chunk, it is credited in the order LetsMT, Bing, Google, Yandex.
        $totalChunks++;
        if ($outputString === $candidates[2]) {
            $letsmtChunks++;
        } elseif ($outputString === $candidates[1]) {
            $bingChunks++;
        } elseif ($outputString === $candidates[0]) {
            $googleChunks++;
        } elseif ($outputString === $candidates[3]) {
            $yandexChunks++;
        }
    }

    foreach (array($inCh, $outg, $outb, $outl, $outy, $outh) as $handle) {
        fclose($handle);
    }

    // !empty() also covers a configuration that does not define $writeStats.
    if (!empty($writeStats)) {
        //Write chunk counts
        $outCount = fopen("stats.txt", "a") or die("Can't create output file!"); //Hybrid count
        fwrite($outCount, "Total chunk count: ".$totalChunks."\n");
        fwrite($outCount, "Google chunk count: ".$googleChunks."\n");
        fwrite($outCount, "Bing chunk count: ".$bingChunks."\n");
        fwrite($outCount, "LetsMT chunk count: ".$letsmtChunks."\n");
        fwrite($outCount, "Yandex chunk count: ".$yandexChunks."\n");
        fclose($outCount);
    }
}