\chapter{Hierarchical Phrase-Based Grammar Extraction from Alignment Posterior Probabilities}
\chaptermark{Hiero Extraction from Posteriors}
\label{chap:extractionFromPosteriors}
% TODOFINAL check too much blank space
% TODOFINAL notation for phrase pairs brackets (grep for '(f' for example)
% TODOFINAL grep for {\ (to detect for example {\it )
% TODOFINAL grep for output used as a verb and correct
% TODOFINAL grep \forall and replace comma maybe or find a better way
% TODOFINAL grep for consistent and use it consistently
% TODOFINAL ensure that equations don't go over margin
% TODOFINAL review count assigned to phrase pair in liu et al 2009
% TODOFINAL take look at Jamie's thesis background on HMM word alignment models
% TODOFINAL in grammar pattern section, say which grammars are used for exps
% TODOFINAL read through comments and address left out ones
% TODOFINAL either in this chap or background, add an equation describing the translation model: simply relative frequency src-trg / src
% TODOFINAL check resp. respectively usage
% TODOFINAL throughout refer to background chapter rather than re-citing papers
% TODOFINAL consistent use of hyphens in phrase pairs
% TODOFINAL review grammatical usage of citet citep
% TODOFINAL grep for abbreviations, grep for period and put right spacing
% TODOFINAL uniform notation for phrase pairs: brackets or parenthesis, let's say bracket is better
% TODOFINAL remove all latex compilation warnings
% TODOFINAL (already done ??) somewhere in the introduction, use the convention source and target.
% TODOFINAL try to get rid of htbp in floats and see the outcome
% TODOFINAL get rid of all vertical bars and replace by \mid
% TODOFINAL remove all mbox and replace by text
% TODOFINAL remove all HMM and replace with text{HMM} in equations
% TODOFINAL remove all \bf bu \bm
% TODOFINAL search replace A(i_1, i_2, j_1, j_2) by A(j_1, j_2, i_1, i_2)
% TODOFINAL (already done ??) search replace f_R f_C
In \autoref{chap:hfile}, we have described how to exploit the MapReduce
framework and the HFile format in order to generate very large
translation grammars
and retrieve rules from these grammars efficiently. The main contribution
was at the infrastructure level rather than at the modelling level. In this chapter,
we will attempt to improve models for translation grammar extraction.
Standard practice in SMT systems is to decouple the word alignment phase from
the rule extraction phase. Typically, word alignment models are only used to
obtain a set of alignment links; those alignment links then determine
constraints that are followed in the rule extraction
step (see \autoref{sec:hierruleextract}). In
this chapter, we attempt to leverage the information
contained in alignment models by extracting rules from alignment posterior
probabilities~\citep{degispert-pino-byrne:2010:EMNLP}. These statistics are
computed from the HMM alignment model~\citep{vogel-ney-tillmann} and they
are used
both to generate constraints for rule extraction and for translation model
estimation.
%We extract a grammar and estimate a translation model that leads to translation improvements.
This chapter presents two rule extraction methods. With the first method, rules
are extracted from alignment link posterior probabilities. With the second
method, rules are extracted from alignment posterior probabilities
over phrase pairs. We
demonstrate improvements on a medium-scale Chinese-English task with these
methods. We also investigate how best to exploit source-to-target and
target-to-source alignment models.
%\section{Notation}
%\label{sec:definitionAndNotation}
%In \autoref{sec:StatisticalMachineTranslationWordAlignment}, we
%have introduced the variable $\bm{a}$, which denotes either
%a set of alignment links or the random variable that defines the word
%alignment model. In this chapter, for clarity, we distinguish these
%two cases. Given a sentence pair $(\bm{f} = f_1^J, \bm{e} = e_1^I)$,
%an alignment \emph{link} is simply a pair $(j, i) \in [1, J] \times [1, I]$.
%We denote $\bm{L}$ to be a set of links for the sentence pair $(\bm{f}, \bm{e})$.
%$\bm{L}$ can be obtained by the Viterbi
%algorithm~\citep{brown-dellapietra-dellapietra-mercer-1993} or maximum a posteriori
%decoding~\citep{matusov-zens-ney:2004:COLING,kumar-och-macherey:2007:EMNLP} and by
%symmetrisation techniques (see \autoref{sec:symmetrisationHeuristics}).
%We also remind definitions from \autoref{sec:hieroTypesOfRules}:
%a \emph{phrase-based} rule is a translation
%rule that contains only terminals in its right hand side;
%a \emph{hierarchical} rule is a translation
%rule that contains at least one nonterminal in its right hand side.
\section{Introduction}
\label{sec:extractionFromPosteriorsIntro}
In state-of-the-art SMT systems, rules are extracted from % TODOFINAL (check abbr)
word-aligned parallel text. The alignments are typically generated by
applying symmetrisation
heuristics (see \autoref{sec:symmetrisationHeuristics}) to Viterbi
alignments (see \autoref{eq:viterbiAlignment}) obtained
from source-to-target and target-to-source word alignment models.
Additional information that these models could provide, such as posterior
probabilities over alignment links, is not used. For example, let us consider
the word-aligned German-English sentence pair in
\autoref{fig:wordalignedSentencePairMistake}.
%
\begin{figure}
\begin{center}
\begin{tikzpicture} [node distance = 2cm, text height=1.5ex, text depth=.25ex]
% place nodes
\node (Er) {Er};
\node [right of = Er] (hat) {hat};
\node [right of = hat] (den) {den};
\node [right of = den] (Ball) {Ball};
\node [right of = Ball] (gesehen) {gesehen};
%\node [right of = gesehen] (germanDot) {.};
\node [below of = Er] (He) {He};
\node [right of = He] (has) {has};
\node [right of = has] (seen) {seen};
\node [right of = seen] (the) {the};
\node [right of = the] (ball) {ball};
%\node [right of = ball] (englishDot) {.};
% draw edges
\draw (Er) -- (He);
\draw (hat) -- (has);
\draw (den) -- (the);
\draw (Ball) -- (ball);
\draw (gesehen) -- (seen);
%\draw (germanDot) -- (englishDot);
% spurious alignment link
\draw (hat) -- (seen);
\end{tikzpicture}
\end{center}
\caption{German-English word-aligned sentence pair. The spurious alignment
link between the German word \emph{hat} (\emph{has}) and the English word \emph{seen}
prevents the phrase pair $\langle$\emph{hat}, \emph{has}$\rangle$ from being extracted from this
sentence pair.}
\label{fig:wordalignedSentencePairMistake}
\end{figure}
%
Intuitively, we can tell that
there is a spurious alignment link between the German
word \emph{hat} (\emph{has}) and the
English word \emph{seen}. This link will prevent the extraction of the useful
phrase pair $\langle$\emph{hat}, \emph{has}$\rangle$
from this sentence pair. However, it is
possible that the \emph{posterior probability} of this spurious link according
to the alignment model is relatively low. We hypothesise that posterior
probability information from alignment models is more reliable than the links
obtained from Viterbi alignment.
In this chapter, we use HMM alignment
models (see \autoref{sec:statisticalMachineTranslationHmmAlignmentModel}) to
generate the statistics needed to both extract rules and estimate the
translation models. We hypothesise that this tighter coupling between alignment
and translation models will provide better translation quality.
We will evaluate the grammar we obtain in two ways. First, we will assess the
grammar's ability to generate a reference translation from a source sentence.
This is determined by the type of reordering allowed by the grammar and by
the choice of translations for each source side of a rule. We will then evaluate
the translation quality provided by this grammar.
Conceptually, our extraction method consists in extracting all possible phrase
pairs and hierarchical phrase pairs given a sentence pair and selecting only
those that satisfy certain statistical criteria related to alignment posterior
probabilities. For example, we can select phrase pairs that contain a link with
a high posterior probability; or we can select phrase pairs that contain a link
with a high posterior probability and that have a high phrase pair posterior
probability. The selection process determines which rules the grammar will
contain and will therefore define the ability of the grammar to generate a
reference translation given a source sentence. We can also use statistics from
alignment models to estimate translation models in a novel way. In this work, we
will use phrase pair posterior probability instead of integer counts to estimate
translation models.
\section{Related Work}
\label{sec:extractionFromPosteriorRelated}
The limitations of extracting translation rules from Viterbi alignments, i.e.
that potentially useful information from the alignment models is ignored, have
been addressed previously. \citet{venugopal-zollmann-smith-vogel:2008:AMTA}
extract rules from
$n$-best lists of alignments and $n$-best lists of syntactic parses for a
syntax-augmented hierarchical system~\citep{zollmann-venugopal:2006:WMT}.
In the alignment step, an $n$-best list of alignments $\bm{a_1}, ..., \bm{a_n}$ is
produced with posterior probabilities
$p(\bm{a_1} \mid \bm{f}, \bm{e}), ..., p(\bm{a_n} \mid \bm{f}, \bm{e})$. These
posteriors are normalised to produce probabilities
$\hat{p}(\bm{a_1}), ..., \hat{p}(\bm{a_n})$. Similarly, probabilities
$\hat{p}(\bm{\pi_1}), ..., \hat{p}(\bm{\pi_{n'}})$ are obtained
for an $n'$-best list of parses $\bm{\pi_1}, ..., \bm{\pi_{n'}}$. For each alignment
$\bm{a_i}$ and parse $\bm{\pi_j}$, syntax-augmented hierarchical rules are extracted
with a count $\hat{p}(\bm{a_i}) \, \hat{p}(\bm{\pi_j})$.
Alignment $n$-best lists have also been used to create a
structure called
\emph{weighted alignment matrix}~\citep{liu-xia-xiao-liu:2009:EMNLP}.
Probabilities $\hat{p}(\bm{a_1}), ..., \hat{p}(\bm{a_n})$
for $n$-best alignments $\bm{a_1}, ..., \bm{a_n}$ are computed
as previously~\citep{venugopal-zollmann-smith-vogel:2008:AMTA}.
Then, for each word pair $(f_j, e_i)$, the alignment link posterior
probability $p_m(j, i)$ is computed in \autoref{eq:matrixLinkPosterior}.
%
\begin{equation}
p_m(j, i) = \sum_{k = 1}^n \hat{p}(\bm{a_k}) \delta(\bm{a_k}, i, j)
\label{eq:matrixLinkPosterior}
\end{equation}
%
$\delta(\bm{a_k}, i, j)$ indicates whether there is a link between $i$ and
$j$ in the alignment $\bm{a_k}$. Given a sentence pair, all phrase
pairs up to a maximum source length and a maximum target length that
contain a link with a posterior greater than zero are extracted. The
fractional counts assigned to these phrase pairs are computed in terms
of the link posteriors and then used to estimate the translation models
by relative frequency. The fractional count computation approximates
the posterior probability of all alignments consistent with the phrase
pair. Our method also uses link posterior probabilities
to constrain the extraction but the posteriors are computed
exactly rather than approximated. In addition, posterior probabilities
of consistent alignments are also computed exactly. Finally, our method
is also applied to hierarchical phrase-based translation.
Alignment posterior probabilities without approximation have also been
used.
For a given test set, \citet{deng-and-byrne:2008:ASLP} first extract
phrase pairs in a standard manner. Then, source phrases in the test set
that do not have any corresponding target in the list of extracted
phrase pairs are selected. For each of these source phrases, sentence
pairs where the source phrase occurs are considered. For each such
sentence pair, all target phrases in the target sentence are assigned
phrase pair posterior
probabilities (see \autoref{sec:extractionFromPosteriorsPhrasePair})
according to the
source-to-target and target-to-source alignment models, then ranked
by the geometric average of the two probabilities. The top phrase pair
is retained if its scores are above specific thresholds.
Our definition of phrase pair posterior probabilities and the procedure to compute
them are directly inspired by the work we just described. However, we do not use
the word-to-phrase HMM model but the simpler word-to-word HMM model.
In addition, our method is applied to hierarchical phrase-based grammars
rather than simpler phrase-based grammars. Finally, our grammar
extraction scheme does not consist in
first extracting a standard grammar and then augmenting the grammar with additional rules: we
modify the extraction procedure to directly extract a hierarchical
grammar from alignment link posterior probabilities
or phrase pair posterior probabilities.
\citet{kumar-och-macherey:2007:EMNLP} also use exact computation
of alignment link posteriors in a different application setting.
First, instead of using the Viterbi criterion for word alignment
recalled in \autoref{eq:viterbiCriterion},
%
\begin{equation}
\bm{\hat{a}} = \argmax_{\bm{a}} p(\bm{f}, \bm{a} \mid \bm{e})
\label{eq:viterbiCriterion}
\end{equation}
%
the maximum a posteriori criterion~\citep{matusov-zens-ney:2004:COLING}, shown in \autoref{eq:mapCriterion}, is used:
%
\begin{equation}
\hat{a}_j = \argmax_{i} p(a_j = i \mid \bm{f}, \bm{e})
\label{eq:mapCriterion}
\end{equation}
%
Then, given a parallel corpus for three languages $F$, $G$, $E$, the
link posteriors for the language pair ($F$, $E$) are computed
in terms of the posteriors for the language pairs ($F$, $G$) and ($G$, $E$).
$G$ is called a \emph{bridge} language. The motivation is that
alignments for the $F$-$G$ language pair and the $G$-$E$ language
pair may inform alignment for $F$-$E$. Multiple bridge languages are used
and produce corresponding posterior matrices. The matrices are interpolated
and alignments are extracted for each bridge language and for the
interpolation. Translation gains are obtained in system combination.
We also note approaches to tighter coupling between hierarchical phrase-based
grammars and alignments or even direct modelling of phrase alignment.
\citet{marcu-wong:2002:EMNLP} introduce a joint phrase-based model that
does not make use of word alignments. In this generative model, a sentence
pair is produced by concatenating phrase pairs, or so-called \emph{concepts}.
The authors consider a simpler model with only joint phrase pair translation
probabilities and a more complex model with translation and distortion
probabilities. The parameters are trained with an approximate version of the
expectation-maximisation algorithm~\citep{dempster-laird-rubin:1977:JRSS}.
Experiments demonstrate translation improvements over IBM Model 4.
\citet{birch-callisonburch-osborne-koehn:2006:WMT} constrain this model
in order to be able to apply it to larger parallel corpora. When searching for
a set of phrase pairs to cover a training sentence pair, phrase pairs that
are consistent with the intersection of Viterbi
alignments (see \autoref{sec:symmetrisationHeuristics}) are considered first; other
phrase pairs are considered only when the sentence pair cannot be covered entirely.
Results close to standard phrase-based models are obtained.
\citet{denero-klein:2008:ACL} prove that phrase alignment is an
NP-hard problem. Given a sentence pair $(\bm{f}, \bm{e})$, a bijective phrase
alignment $\bm{a}$ is defined as a bijective mapping between source phrases that
form a partition of $\bm{f}$ and target phrases that form a partition of
$\bm{e}$. A scoring function $\phi$ is also defined that assigns a real-valued
score to any phrase pair $\langle$source phrase, target phrase$\rangle$. The score of a
bijective phrase alignment is simply the product of the scores of its phrase pairs.
Given $(\bm{f}, \bm{e}, \phi)$, the phrase alignment optimisation problem is to find
the best scoring alignment. \citet{denero-klein:2008:ACL} show that this problem
is NP-hard by showing that the corresponding decision problem is NP-complete via
reduction of the SAT problem. We give here an indication of the size of the search
space. The number of possible source partitions is $2^{|\bm{f}| - 1}$.
Given a source partition with $K + 1$ phrases, there are $(K + 1)!$ possible
permutations of the source phrases and ${|\bm{e}| - 1 \choose K}$ possible target
partitions with $K+1$ phrases. In conclusion, there is little hope of solving
the phrase alignment problem exactly.
\citet{saers-wu:2009:SSST} report an
improvement on a phrase-based system where word alignment has been trained with
an inversion transduction grammar rather than IBM or HMM models.
Phrase alignment is directly modelled with an inversion transduction
grammar. The phrase alignment search space is more restricted than
the space considered by \citet{denero-klein:2008:ACL}, and the
expectation maximisation algorithm can be carried out in $O(n^6)$
where $n$ is the number of tokens in a sentence.
\citet{pauls-klein-chiang-knight:2010:NAACL} also use an
inversion transduction grammar to directly align phrases to nodes in a
string-to-tree model. Bayesian methods have also been developed to induce a
grammar directly from an unaligned parallel
corpus~\citep{blunsom-cohn-osborne:2008:NIPS,blunsom-cohn-dyer-osborne:2009:ACL}.
Finally, \citet{Cmejrek2009} extract rules directly from
bilingual chart parses of the parallel corpus without using word alignments.
We take a different approach in that we aim to start with very strong alignment
models and use them to guide grammar extraction.
We also note some work on smoothing, which could be complementary to the approach taken
in this thesis, has been conducted to address the
shortcomings of relative frequency estimation for translation models.
\citet{foster-kuhn-johnson:2006:EMNLP} conduct an extensive series of
experiments that either replace the relative frequency estimated phrase table by
a smoothed phrase table or add the smoothed phrase table as a feature and
observe improvement in translation quality.
\section{Rule Extraction}
\label{sec:extractionFromPosteriorsExtraction}
In \autoref{sec:extractionFromPosteriorRelated}, we have reviewed approaches
that ``widen'' the translation pipeline by using alignment $n$-best lists.
We have also reviewed applications of exact computation of alignment posterior
probabilities and attempts to directly model phrase alignment. We will now
describe our grammar extraction methods, based on exact computation of
alignment posterior probabilities under an alignment model.
As in previous work~\citep{hopkins-langmead-vo:2011:WMT},
we first present a general approach that encompasses both standard methods
based on rule extraction from Viterbi alignments as well as our methods.
For clarity of presentation, we first describe our methods in the simpler case
of phrase-based rule extraction, then extend them to hierarchical phrase-based
rule extraction.
\subsection{General Framework for Rule Extraction}
\label{sec:extractionFromPosteriorsExtractionGeneralApproach}
We first describe a general method for the extraction of phrase-based rules.
An extension of this procedure for
hierarchical rules is described in
\autoref{sec:extractionFromPosteriorsExtractionDisjoint}. The algorithm is
described in \autoref{alg:generalRuleXtract}.
%
\begin{figure}
\begin{algorithmic}[1]
\Function{ExtractRules}{$f_1^J, e_1^I, \bm{a}$}
\For{$1 \leq j_1 \leq j_2 \leq J$} \hypertarget{alg:line:sourcePhrase}{} \label{alg:line:sourcePhrase}
\For{$1 \leq i_1 \leq i_2 \leq I$} \hypertarget{alg:line:targetPhrase}{} \label{alg:line:targetPhrase}
\If{\Call{SourceConstraints}{$f_{j_1}^{j_2}$} \par
\hskip\algorithmicindent \hskip\algorithmicindent $\land$ \Call{AlignConstraints}{$f_{j_1}^{j_2}, e_{i_1}^{i_2}, \bm{a}$} \par
\hskip\algorithmicindent \hskip\algorithmicindent $\land$ \Call{TargetConstraints}{$e_{i_1}^{i_2}$}}
\hypertarget{alg:line:constraints}{} \label{alg:line:constraints}
\State{\Call{Extract}{\RT[$X$][$f_{j_1}^{j_2}$][$e_{i_1}^{i_2}$], \Call{Count}{$f_{j_1}^{j_2}, e_{i_1}^{i_2}$}}} \hypertarget{alg:line:extract}{} \label{alg:line:extract}
\EndIf
\EndFor
\EndFor
\EndFunction
\end{algorithmic}
\caption{General procedure for phrase-based rule extraction: both traditional
rule extraction from Viterbi alignment and our method are instances of this
procedure.}
\label{alg:generalRuleXtract}
\end{figure}
%
Given a
sentence pair $(f_1^J, e_1^I)$, for each source index pair $(j_1, j_2)$
defining a source phrase $f_{j_1}^{j_2}$
(\hyperlink{alg:line:sourcePhrase}{line \ref{alg:line:sourcePhrase}}), for each
target index pair $(i_1, i_2)$ defining a target phrase $e_{i_1}^{i_2}$
(\hyperlink{alg:line:targetPhrase}{line \ref{alg:line:targetPhrase}}), if source
constraints, target constraints and
alignment constraints are satisfied
(\hyperlink{alg:line:constraints}{line \ref{alg:line:constraints}}), then the
phrase pair ($f_{j_1}^{j_2}$, $e_{i_1}^{i_2}$) is extracted with a certain
count (\hyperlink{alg:line:extract}{line \ref{alg:line:extract}}): the phrase
pair is added to the list of phrase pairs used in translation, and the count
will be used subsequently to compute translation models by relative frequency.
The purpose
of the constraints is to obtain a manageable number of rules. If we did not
impose constraints, we would extract $\frac{I (I + 1) J (J + 1)}{4}$ (not necessarily distinct) rules
for the sentence pair $(f_1^J, e_1^I)$. During
translation, the decoder would need to apply more pruning, which would
potentially lead to more search errors and a decrease in translation
quality.
We will now refine this general procedure to make it more practical and
closer to a possible implementation.
Let us call the source constraints $\mathcal{C}_S$, the alignment constraints
$\mathcal{C}_A$ and the target constraints $\mathcal{C}_T$. These are
Boolean functions used to select phrase pairs. In practice, source
constraints are checked on the source phrases before looking
at the target phrases. If source constraints are not met, then we need not
consider target phrases for that source phrase.
In addition, target phrases are only considered if they
satisfy alignment constraints with the source phrase and, if they do, we rank
them according to a certain ranking function $\mathcal{R}$. Target constraints also
depend on the ranking $\mathcal{R}$, for example we can decide to keep only a certain
number of target phrases per source phrase. When a phrase pair is extracted, it
is assigned a count which will be used to estimate the source-to-target and
target-to-source translation models. The counting function is called $\mathcal{C}$.
With this notation, we obtain the revised extraction procedure in
\autoref{alg:generalRuleXtractSpecialized}.
%
\begin{figure}
\begin{algorithmic}[1]
\Function{ExtractRules}{$f_1^J, e_1^I, \bm{a}$}
\For{$1 \leq j_1 \leq j_2 \leq J$}
\If{$\lnot \mathcal{C}_S(f_{j_1}^{j_2})$} \Comment{Source constraints}
\State{\bf{continue}}
\EndIf
\State{$T \gets \emptyset$} \Comment{Sorted target phrases}
\For{$1 \leq i_1 \leq i_2 \leq I$}
\If{$\mathcal{C}_A(f_{j_1}^{j_2}, e_{i_1}^{i_2}, \bm{a})$} \Comment{Alignment constraints}
\State{$T \gets T \cup e_{i_1}^{i_2}$}
\EndIf
\EndFor
\State{\Call{Sort}{$T, \mathcal{R}$}} \Comment{Target phrases ranked according to $\mathcal{R}$}
\For{$e_{i_1}^{i_2} \in T$}
\If{$\mathcal{C}_T(e_{i_1}^{i_2}, T)$} \Comment{Target constraints}
\State{\Call{Extract}{\RT[$X$][$f_{j_1}^{j_2}$][$e_{i_1}^{i_2}$], $\mathcal{C}(f_{j_1}^{j_2}, e_{i_1}^{i_2})$}}
\EndIf
\EndFor
\EndFor
\EndFunction
\end{algorithmic}
\caption{General procedure for phrase-based rule extraction. This version
is more practical and closer to a possible implementation than the algorithm in \autoref{alg:generalRuleXtract}.
Source phrases are considered first. Target phrases are considered only if the
source constraints $\mathcal{C}_S$ are satisfied. Targets that satisfy the alignment constraints $\mathcal{C}_A$ with
their
source are ranked by $\mathcal{R}$. Finally, phrase pairs where the target
satisfies the target constraints can be extracted with a certain count $\mathcal{C}$.
Note that the target constraints implicitly depend on the ranking of the
targets by $\mathcal{R}$.}
\label{alg:generalRuleXtractSpecialized}
\end{figure}
%
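As an illustration, the procedure of \autoref{alg:generalRuleXtractSpecialized}
can be sketched in Python. This is only a schematic view, not the implementation
used in our experiments: the constraints $\mathcal{C}_S$, $\mathcal{C}_A$ and
$\mathcal{C}_T$, the ranking function $\mathcal{R}$ and the counting function
$\mathcal{C}$ are assumed to be supplied as callables, spans are 0-based and
inclusive, and extracted phrase pairs are represented as pairs of index spans.
%
\begin{verbatim}
def extract_rules(f, e, C_S, C_A, C_T, R, C):
    """Schematic phrase-based rule extraction (cf. the algorithm above).
    f, e: source and target sentences (lists of tokens).
    C_S, C_A, C_T: source, alignment and target constraints (Boolean).
    R: ranking function; C: counting function."""
    J, I = len(f), len(e)
    rules = []
    for j1 in range(J):
        for j2 in range(j1, J):
            if not C_S(f, j1, j2):
                continue  # source constraints not met
            # target spans satisfying the alignment constraints
            targets = [(i1, i2) for i1 in range(I)
                                for i2 in range(i1, I)
                                if C_A(f, e, j1, j2, i1, i2)]
            # rank candidate target spans, best first
            targets.sort(key=lambda s: R(f, e, j1, j2, s[0], s[1]),
                         reverse=True)
            for rank, (i1, i2) in enumerate(targets):
                if C_T(e, i1, i2, rank):  # e.g. keep only the top k
                    count = C(f, e, j1, j2, i1, i2)
                    rules.append(((j1, j2), (i1, i2), count))
    return rules
\end{verbatim}
%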
We will now describe different rule extraction strategies in terms of
the constraints $\mathcal{C}_S$, $\mathcal{C}_A$, $\mathcal{C}_T$,
the ranking function $\mathcal{R}$ and the counting function $\mathcal{C}$.
\subsection{Extraction from Viterbi Alignment Links}
\label{sec:extractionFromPosteriorsViterbi}
In this section, we describe the standard extraction procedure within
the framework introduced in
\autoref{sec:extractionFromPosteriorsExtractionGeneralApproach}.
Common practice takes a fixed set of word alignment links $\bm{L}$
and extracts
rules from this set. Alignment links $\bm{L}$ are obtained from
the alignment model $\bm{a}$ either by the Viterbi algorithm or
by maximum a posteriori
estimation~\citep{matusov-zens-ney:2004:COLING,kumar-och-macherey:2007:EMNLP}
and possibly using symmetrisation heuristics to combine links obtained
from source-to-target and target-to-source alignment
models (see \autoref{sec:symmetrisationHeuristics}). We can restate this
common approach in the framework proposed in
\autoref{sec:extractionFromPosteriorsExtractionGeneralApproach} and in
\autoref{alg:generalRuleXtractSpecialized} where constraints, ranking and
counting functions are defined as follows:
%
\begin{itemize}
\item source constraints $\mathcal{C}_S(f_{j_1}^{j_2})$:
%
\begin{equation}
j_2 - j_1 < s_{\text{max}}
\end{equation}
%
where $s_{\text{max}}$ is an integer threshold defined experimentally.
$s_{\text{max}}$ represents the maximum length of a source phrase.
\item alignment constraints $\mathcal{C}_A(f_{j_1}^{j_2}, e_{i_1}^{i_2}, \bm{a})$:
%
\begin{equation}
\Big( \forall (j,i) \in \bm{L}, j \in [j_1, j_2] \Leftrightarrow i \in [i_1,i_2] \Big) \land \Big( \bm{L} \cap [j_1, j_2] \times [i_1, i_2] \neq \emptyset \Big)
\label{eq:consistencyConstraint}
\end{equation}
%
Alignment constraints have already been described in \autoref{sec:phrasextract} as
the conditions required for phrase pair extraction.
The first bracketed constraint requires that there be no alignment link between a
word inside the phrase pair and a word outside of it. The second
bracketed constraint
requires that
there be at least one alignment link in the phrase pair. Sometimes, an
additional constraint specifies that the boundary words in the phrase pair
should be aligned. In this work, this constraint is not present. % TODOFINAL repeat in background ?
A phrase pair that satisfies \autoref{eq:consistencyConstraint} is said
to be \emph{consistent} with the alignment (see \autoref{sec:phrasextract}).
\item target constraints $\mathcal{C}_T(e_{i_1}^{i_2}, T)$: no constraint is
imposed in this work. Target constraints based on length may be imposed
depending on the implementation.
\item ranking and counting functions:
%
\begin{equation}
\mathcal{R}(f_{j_1}^{j_2},e_{i_1}^{i_2}) = \mathcal{C}(f_{j_1}^{j_2},e_{i_1}^{i_2}) = 1
\end{equation}
\end{itemize}
%
The above constraints, ranking and counting functions define the standard
approach to grammar extraction.
In the next sections, we depart from this approach and apply novel functions
to rank and count target-side translations according to their quality in the
context of each parallel sentence, as defined by the word alignment models. We
also depart from common practice in that we do not use a set of links as
alignment constraints. We thus have better control over the number of extracted
rules as well as the relative frequency estimates of the source-to-target and
target-to-source translation models.
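For concreteness, the constraint of \autoref{eq:consistencyConstraint} can be
checked with a few lines of Python. This is a sketch only; it assumes that
$\bm{L}$ is given as a set of 0-based $(j, i)$ link pairs and that spans are
inclusive.
%
\begin{verbatim}
def consistent(L, j1, j2, i1, i2):
    """Consistency constraint: no link crosses the phrase pair
    boundary and at least one link lies inside the phrase pair."""
    inside = False
    for (j, i) in L:
        in_src = j1 <= j <= j2
        in_trg = i1 <= i <= i2
        if in_src != in_trg:   # link crossing the boundary
            return False
        if in_src:             # link inside the phrase pair
            inside = True
    return inside
\end{verbatim}
%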
\subsection[Extraction from Posterior Probabilities over Alignment Links]{Extraction from Posterior Probabilities over \\ Alignment Links}
\label{sec:extractionFromPosteriorsLink}
%We now consider the hidden random variable $\bm{a}$ that models the
%alignment process.
For presentation, we only consider source-to-target
alignment models: the random variable $\bm{a}$ that models the alignment
process
takes values in functions from source
word positions to target word positions. However, it is possible to apply
our method with any directional alignment model.
We will use the link
posterior probability $p(a_j = i \mid f_1^J, e_1^I)$ to guide
rule extraction. This statistic expresses how likely it is that
a word $f_j$ in source position $j$ and a word $e_i$ in
target position $i$ are aligned given the sentence pair $(f_1^J,e_1^I)$.
The link posterior probability can be computed efficiently for
Model 1, Model 2 and the HMM model. In our experiments, we only use the HMM model
to compute link posteriors, but comparisons between link posteriors obtained
from various models may be interesting in the future. We will derive
closed form solutions for the link posterior probability under each of these
models.
Applying the definition of conditional
probability, we obtain the general form of the link posterior probability in
\autoref{eq:linkposdef}.
%
\begin{equation} \label{eq:linkposdef}
p(a_{j_0} = i_0 \mid f_1^J, e_1^I) = \frac{p(a_{j_0}=i_0,f_1^J \mid e_1^I)}{p(f_1^J \mid e_1^I)}
\end{equation}
%
Using \autoref{eq:linkposdef}, we will now derive the link posterior probability
$p_{M_1}(a_{j_0} = i_0 \mid f_1^J, e_1^I)$ for Model 1,
$p_{M_2}(a_{j_0} = i_0 \mid f_1^J, e_1^I)$ for Model 2 and
$p_{\text{HMM}}(a_{j_0} = i_0 \mid f_1^J, e_1^I)$ for the HMM model.
\subsubsection{Link Posterior Probability for Model 1}
Let us derive $p_{M_1}(a_{j_0} = i_0 \mid f_1^J, e_1^I)$,
the link posterior probability under Model 1. We use the notation
from~\citep{brown-dellapietra-dellapietra-mercer-1993}, where $t$ is
the word-to-word translation table and $\varepsilon$ is a constant. We compute the
numerator from
\autoref{eq:linkposdef} by marginalising over all possible alignments and
inverting sum and product signs to obtain \autoref{eq:linkposM1Numerator}:
%
\begin{align}
& p_{M_1}(a_{j_0}=i_0,f_1^J \mid e_1^I) \nonumber \\
&= \sum_{a_1 = 0}^{I} ... \sum_{a_{j_0-1} = 0}^{I} \sum_{a_{j_0+1} = 0}^{I} ... \sum_{a_J = 0}^{I} p_{M_1}(a_1 ... a_{j_0-1} i_0 a_{j_0+1} ... a_J,f_1^J \mid e_1^I) \nonumber \\
&= \sum_{a_1 = 0}^{I} ... \sum_{a_{j_0-1} = 0}^{I} \sum_{a_{j_0+1} = 0}^{I} ... \sum_{a_J = 0}^{I} \frac{\varepsilon}{(1+I)^J} t(f_{j_0} \mid e_{i_0}) \prod_{\substack{j = 1 \\ j \neq j_0}}^J t(f_j \mid e_{a_j}) \nonumber \\
&= \frac{\varepsilon}{(1+I)^J} t(f_{j_0} \mid e_{i_0}) \prod_{\substack{j = 1 \\ j \neq j_0}}^J \sum_{i=0}^I t(f_j \mid e_i) \label{eq:linkposM1Numerator}
\end{align}
%
We compute the denominator from \autoref{eq:linkposdef}
similarly (see Equation (15)
in~\citet{brown-dellapietra-dellapietra-mercer-1993}) and obtain
\autoref{eq:linkposM1Denominator}:
%
\begin{equation} \label{eq:linkposM1Denominator}
p_{M_1}(f_1^J \mid e_1^I) = \frac{\varepsilon}{(1+I)^J} \prod_{j=1}^J \sum_{i=0}^I t(f_j \mid e_i)
\end{equation}
%
After simplification, we obtain \autoref{eq:linkposM1} from
\autoref{eq:linkposM1Numerator} and \autoref{eq:linkposM1Denominator}:
%
\begin{equation} \label{eq:linkposM1}
p_{M_1}(a_{j_0} = i_0 \mid f_1^J, e_1^I) = \frac{t(f_{j_0} \mid e_{i_0})}{\sum_{i=0}^I t(f_{j_0} \mid e_i)}
\end{equation}
%
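The closed form of \autoref{eq:linkposM1} only involves the word-to-word
translation table. A minimal sketch, assuming a dictionary \texttt{t} that maps
a (source word, target word) pair to $t(f \mid e)$ and a target sentence whose
position $0$ holds the null word:
%
\begin{verbatim}
def model1_link_posterior(t, f, e, j0, i0):
    """Model 1 link posterior p(a_{j0} = i0 | f, e).
    t[(fj, ei)] = t(fj | ei); e[0] is the null word;
    source positions are 0-based, target positions run over 0..I."""
    denom = sum(t[(f[j0], e[i])] for i in range(len(e)))
    return t[(f[j0], e[i0])] / denom
\end{verbatim}
%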
\subsubsection{Link Posterior Probability for Model 2}
We apply the same method to compute $p_{M_2}(a_{j_0} = i_0 \mid f_1^J, e_1^I)$, the
link posterior probability for Model 2. We also use
notation from~\citep{brown-dellapietra-dellapietra-mercer-1993} but
we replace the notation for the alignment probability $a(i \mid j, J, I)$
by $p_a(i \mid j, J, I)$ for clarity.
We compute the numerator from \autoref{eq:linkposdef} in
\autoref{eq:linkposM2Numerator}:
%
\begin{align}
& p_{M_2}(a_{j_0}=i_0,f_1^J \mid e_1^I) \nonumber \\
&= \sum_{a_1 = 0}^{I} ... \sum_{a_{j_0-1} = 0}^{I} \sum_{a_{j_0+1} = 0}^{I} ... \sum_{a_J = 0}^{I} p_{M_2}(a_1 ... a_{j_0-1} i_0 a_{j_0+1} ... a_J,f_1^J \mid e_1^I) \nonumber \\
&= \sum_{a_1 = 0}^{I} ... \sum_{a_{j_0-1} = 0}^{I} \sum_{a_{j_0+1} = 0}^{I} ... \sum_{a_J = 0}^{I} \varepsilon \ p_a(i_0 \mid j_0, J, I) \ t(f_{j_0} \mid e_{i_0}) \nonumber \\
& \; \; \qquad \qquad \qquad \qquad \qquad \qquad \prod_{\substack{j = 1 \\ j \neq j_0}}^J p_a(a_j \mid j, J, I) \ t(f_j \mid e_{a_j}) \nonumber \\
&= \varepsilon \ p_a(i_0 \mid j_0, J, I) \ t(f_{j_0} \mid e_{i_0}) \prod_{\substack{j = 1 \\ j \neq j_0}}^J \sum_{i=0}^I p_a(i \mid j, J, I) \ t(f_j \mid e_i) \label{eq:linkposM2Numerator}
\end{align}
%
We compute the denominator from \autoref{eq:linkposdef} similarly and obtain
\autoref{eq:linkposM2Denominator}:
%
\begin{equation} \label{eq:linkposM2Denominator}
p_{M_2}(f_1^J \mid e_1^I) = \varepsilon \ \prod_{j=1}^J \sum_{i=0}^I p_a(i \mid j, J, I) \ t(f_j \mid e_i)
\end{equation}
%
After simplification, we obtain \autoref{eq:linkposM2} from
\autoref{eq:linkposM2Numerator} and \autoref{eq:linkposM2Denominator}.
%
\begin{equation} \label{eq:linkposM2}
p_{M_2}(a_{j_0} = i_0 \mid f_1^J, e_1^I) = \frac{p_a(i_0 \mid j_0, J, I) \ t(f_{j_0} \mid e_{i_0})}{\sum_{i=0}^I p_a(i \mid j_0, J, I) \ t(f_{j_0} \mid e_i)}
\end{equation}
%
\subsubsection{Link Posterior Probability for the HMM Model}
\label{sec:linkPosteriorHMM}
We now derive $p_{\text{HMM}}(a_{j_0} = i_0 \mid f_1^J, e_1^I)$, the link posterior
probability for the HMM
model~\citep{vogel-ney-tillmann,rabiner:1989:IEEE}. These derivations
are standard once we realise that the observed sequence is the source
sentence $f_1^J$, the hidden sequence is $a_1^J$ and that in addition
to standard presentations of HMM, all probabilities are conditioned on
the target sentence $e_1^I$.
We compute the numerator from \autoref{eq:linkposdef} in
\autoref{eq:linkposHMMNumerator}:
%
\begin{align}
p_{\text{HMM}}(a_{j_0} = i_0, f_1^J \mid e_1^I) &= p_{\text{HMM}}(a_{j_0} = i_0, f_1^{j_0}, f_{j_0 + 1}^J \mid e_1^I) \nonumber \\
&= p_{\text{HMM}}(f_{j_0 + 1}^J \mid a_{j_0} = i_0, f_1^{j_0}, e_1^I) \ p_{\text{HMM}}(a_{j_0} = i_0, f_1^{j_0} \mid e_1^I) \nonumber \\
&= p_{\text{HMM}}(f_{j_0 + 1}^J \mid a_{j_0} = i_0, e_1^I) \ p_{\text{HMM}}(a_{j_0} = i_0, f_1^{j_0} \mid e_1^I) \nonumber \\
&= \beta_{j_0}(i_0) \ \alpha_{j_0}(i_0) \label{eq:linkposHMMNumerator}
\end{align}
%
where $\beta_{j_0}(i_0)$ and $\alpha_{j_0}(i_0)$ are respectively
the backward and forward HMM probabilities defined in \autoref{eq:backwardForward}:
%
\begin{equation}
\begin{split}
\beta_{j}(i) &= p_{\text{HMM}}(f_{j + 1}^J \mid a_{j} = i, e_1^I) \\
\alpha_{j}(i) &= p_{\text{HMM}}(a_{j} = i, f_1^{j} \mid e_1^I)
\end{split}
\label{eq:backwardForward}
\end{equation}
%
The forward and backward probabilities can be computed recursively as
shown in \autoref{eq:forwardRecursion} and \autoref{eq:backwardRecursion}:
%
\begin{align}
\alpha_{j}(i) &= p_{\text{HMM}}(a_{j} = i, f_1^{j} \mid e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(a_{j} = i, a_{j - 1} = k, f_1^{j - 1}, f_j \mid e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(f_j \mid a_j = i, a_{j - 1} = k, f_1^{j-1}, e_1^I) \ p_{\text{HMM}}(a_{j} = i, a_{j - 1} = k, f_1^{j - 1} \mid e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(f_j \mid e_i) \ p_{\text{HMM}}(a_{j} = i \mid a_{j - 1} = k, f_1^{j - 1}, e_1^I) \ p_{\text{HMM}}(a_{j - 1} = k, f_1^{j - 1} \mid e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(f_j \mid e_i) \ p_{\text{HMM}}(a_{j} = i \mid a_{j - 1} = k, I) \ \alpha_{j - 1}(k) \label{eq:forwardRecursion} \\
\beta_{j}(i) &= p_{\text{HMM}}(f_{j + 1}^J \mid a_{j} = i, e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(f_{j+2}^J, a_{j + 1} = k, f_{j + 1} \mid a_j = i, e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(f_{j+2}^J \mid a_{j + 1} = k, f_{j + 1}, a_j = i, e_1^I) \ p_{\text{HMM}}(a_{j + 1} = k, f_{j + 1} \mid a_j = i, e_1^I) \nonumber \\
&= \sum_{k = 0}^I p_{\text{HMM}}(f_{j+2}^J \mid a_{j + 1} = k, e_1^I) \ p_{\text{HMM}}(f_{j + 1} \mid a_{j + 1} = k, a_j = i, e_1^I) \nonumber \\
& p_{\text{HMM}}(a_{j + 1} = k \mid a_j = i, e_1^I) \nonumber \\
&= \sum_{k = 0}^I \beta_{j + 1}(k) \ p_{\text{HMM}}(f_{j + 1} \mid e_k) \ p_{\text{HMM}}(a_{j + 1} = k \mid a_j = i, I) \label{eq:backwardRecursion}
\end{align}
%
The denominator from \autoref{eq:linkposdef} is computed in
\autoref{eq:linkposHMMDenominator}:
%
\begin{align}
p_{\text{HMM}}(f_1^J \mid e_1^I) &= \sum_{k = 0}^I p_{\text{HMM}}(a_J = k, f_1^J \mid e_1^I) \nonumber \\
&= \sum_{k = 0}^I \alpha_J(k) \label{eq:linkposHMMDenominator}
\end{align}
%
%
%We now derive the link posterior probability for the WPHMM Model.
%TODONEVER(should refer here to a background section for notation, etc.)
% TODONEVER: link posteriors for WPHMM and phrase pair posteriors
%
We will use the link posterior probabilities under the HMM model
in order to define constraints, ranking and counting functions.
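For reference, the recursions of \autoref{eq:forwardRecursion} and
\autoref{eq:backwardRecursion} and the resulting link posteriors can be sketched
as follows. This is a sketch rather than the implementation used in our
experiments: it assumes an emission matrix \texttt{em[i][j]} $= p(f_j \mid e_i)$
and a transition matrix \texttt{tr[k][i]} $= p(a_j = i \mid a_{j-1} = k, I)$,
both including the null word state, and it omits the scaling usually required to
avoid numerical underflow on long sentences.
%
\begin{verbatim}
def hmm_link_posteriors(em, tr, init):
    """Link posteriors for the word-to-word HMM (no numerical scaling).
    em[i][j] = p(f_j | e_i); tr[k][i] = p(a_j = i | a_{j-1} = k, I);
    init[i] = p(a_1 = i).  Returns post[j][i] = p(a_j = i | f, e)."""
    n, J = len(em), len(em[0])
    # forward pass: alpha[i][j] = p(a_j = i, f_1..f_j | e)
    alpha = [[0.0] * J for _ in range(n)]
    for i in range(n):
        alpha[i][0] = init[i] * em[i][0]
    for j in range(1, J):
        for i in range(n):
            alpha[i][j] = em[i][j] * sum(alpha[k][j - 1] * tr[k][i]
                                         for k in range(n))
    # backward pass: beta[i][j] = p(f_{j+1}..f_J | a_j = i, e)
    beta = [[0.0] * J for _ in range(n)]
    for i in range(n):
        beta[i][J - 1] = 1.0
    for j in range(J - 2, -1, -1):
        for i in range(n):
            beta[i][j] = sum(tr[i][k] * em[k][j + 1] * beta[k][j + 1]
                             for k in range(n))
    Z = sum(alpha[i][J - 1] for i in range(n))   # p(f | e)
    return [[alpha[i][j] * beta[i][j] / Z for i in range(n)]
            for j in range(J)]
\end{verbatim}
%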
\subsubsection{Constraints, Ranking and Counting Functions from HMM Link Posterior Probabilities}
We use HMM link posterior probabilities computed in \autoref{sec:linkPosteriorHMM}
in order to define constraints, ranking and counting functions:
%
\begin{itemize}
\item source constraints $\mathcal{C}_S(f_{j_1}^{j_2})$:
%
\begin{equation}
j_2 - j_1 < s_{\text{max}}
\end{equation}
%
This is the same constraint as defined for standard Viterbi extraction in
\autoref{sec:extractionFromPosteriorsViterbi}.
\item alignment constraints $\mathcal{C}_A(f_{j_1}^{j_2}, e_{i_1}^{i_2}, \bm{a})$:
%
\begin{align}
& \exists (j,i) \in [j_1,j_2] \times [i_1,i_2], p(a_j = i \mid f_1^J,e_1^I) > \lambda \label{eq:firstAlignmentConstraintHmmLinkPosterior} \\
& \forall (j,i) \in [1, J] \times [1, I] \cap \{(j,i): p(a_j = i \mid f_1^J,e_1^I) > \lambda\} \label{eq:secondAlignmentConstraintHmmLinkPosterior} \\
& \hspace{3em} j \in [j_1, j_2] \Leftrightarrow i \in [i_1, i_2] \nonumber
\end{align}
%
where $\lambda$ is a threshold defined experimentally. Intuitively,
$\lambda$ is a high link posterior probability.
The first constraint (\autoref{eq:firstAlignmentConstraintHmmLinkPosterior}) means that we require at least one link with a
high posterior probability in the phrase pair considered. The second
constraint (\autoref{eq:secondAlignmentConstraintHmmLinkPosterior}) means that there should be no link with a high posterior probability
that is inconsistent with the phrase pair. Note that these constraints are
identical to the Viterbi alignment constraints defined
in \autoref{sec:extractionFromPosteriorsViterbi} if we choose $\bm{L}$ to be the
set of all links with high posterior defined in \autoref{eq:linksWithHighPosterior}:
%
\begin{equation}
\bm{L} = \{(j, i) \in [1, J] \times [1, I]: p(a_j = i \mid f_1^J,e_1^I) > \lambda\}
\label{eq:linksWithHighPosterior}
\end{equation}
%
Also note that the second
constraint does not take links to the null
word (see \autoref{sec:StatisticalMachineTranslationWordAlignment}) into account.
This is because the null word never needs to be included
in a translation rule.
\item target constraints $\mathcal{C}_T(e_{i_1}^{i_2}, T)$: we pick the
first $k$ translation candidates according to the ranking
function $\mathcal{R}$.
\item ranking function:
%
\begin{equation} \label{eq:linkPosRanking}
\mathcal{R}(f_{j_1}^{j_2},e_{i_1}^{i_2}) = \prod_{j=j_1}^{j_2} \sum_{i=i_1}^{i_2} \frac{p(a_j = i \mid f_1^J,e_1^I)}{i_2-i_1+1}
\end{equation}
%
This ranking function is very similar to the score used for the lexical features
described in \autoref{sec:features}. Here,
we use link posteriors instead of Model 1 translation probabilities; a small
sketch of this computation is given after this list. Because this
function favours short target phrases, we do not use it as a counting
function: preliminary experiments found that it is not appropriate for
counting rules and gives poor results. We therefore use the same counting
function as in the standard practice described in
\autoref{sec:extractionFromPosteriorsViterbi}.
\item counting function:
%
\begin{equation}
\mathcal{C}(f_{j_1}^{j_2},e_{i_1}^{i_2}) = 1
\end{equation}
%
\end{itemize}
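The ranking function of \autoref{eq:linkPosRanking} can be computed directly
from a matrix of link posteriors. Again this is only a sketch;
\texttt{post[j][i]} is assumed to hold $p(a_j = i \mid f_1^J, e_1^I)$ with
0-based source positions and inclusive spans.
%
\begin{verbatim}
def link_posterior_rank(post, j1, j2, i1, i2):
    """Ranking function R over a phrase pair (cf. Eq. linkPosRanking)."""
    score = 1.0
    for j in range(j1, j2 + 1):
        score *= sum(post[j][i]
                     for i in range(i1, i2 + 1)) / (i2 - i1 + 1)
    return score
\end{verbatim}
%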
We have described rule extraction from alignment link posterior
probabilities. Next, we will describe rule extraction from alignment
posterior probabilities over phrase pairs. This method will use the
same source constraints, alignment constraints and target constraints
but different ranking and counting functions.
\subsection{Extraction from Posteriors over Phrase Pairs}
\label{sec:extractionFromPosteriorsPhrasePair}
In the previous section, we defined and gave closed form solutions to
alignment link posterior probabilities for Model 1, Model 2 and the HMM model.
We can also define alignment posterior probabilities over phrase pairs. Let us
consider the phrase pair $\langle f_{j_1}^{j_2}, e_{i_1}^{i_2} \rangle$ in the
sentence pair $(f_1^J, e_1^I)$. In
\autoref{eq:alignmentConsistentDefinition}, we define
$A(j_1, j_2; i_1, i_2)$, the set of alignments that have no links between
inside the phrase pair and outside the phrase pair:
%
\begin{equation}
A(j_1, j_2;i_1, i_2) = \{a_1^J : a_j \in [i_1, i_2] \Leftrightarrow j \in [j_1,j_2] \}
\label{eq:alignmentConsistentDefinition}
\end{equation}
%
Alignments in $A(j_1, j_2;i_1, i_2)$ satisfy the consistency constraint
defined in \autoref{eq:consistencyConstraint}
but do not require a link in $[j_1, j_2] \times [i_1, i_2]$.
The posterior probability of these alignments given the sentence
pair is defined in \autoref{eq:phrasePairPosteriorDefinition}:
%
\begin{equation}
\begin{split}
p(A(j_1, j_2; i_1, i_2) \mid e_1^I, f_1^J) &= \frac{p(f_1^J, A(j_1, j_2; i_1, i_2) \mid e_1^I)}{p(f_1^J \mid e_1^I)} \\
&= \frac{\sum_{a_1^J \in A(j_1, j_2; i_1, i_2)} p(f_1^J,a_1^J \mid e_1^I)}{\sum_{a_1^J} p(f_1^J,a_1^J \mid e_1^I)}
\end{split}
\label{eq:phrasePairPosteriorDefinition}
\end{equation}
%
We call this quantity the \emph{phrase pair posterior probability}.
We will now derive formulae for the phrase pair posterior probability in the
case of Model 1, Model 2 and the HMM model. Again, experiments only use phrase pair
posteriors computed from the HMM model, but comparing those with the posteriors
obtained from Model 1 and Model 2 may be interesting for future research.
\subsubsection{Phrase Pair Posterior Probability for Model 1}
Let us first define $J_{\text{in}}$, $J_{\text{out}}$, $I_{\text{in}}$ and
$I_{\text{out}}$ in \autoref{eq:iInsideOutside} for a set of indices
$i_1$, $i_2$, $j_1$, $j_2$:
%
\begin{equation}
\begin{split}
J_{\text{in}} &= [j_1, j_2] \\
J_{\text{out}} &= [1,J] \setminus J_{\text{in}} \\
I_{\text{in}} &= [i_1, i_2] \\
I_{\text{out}} &= [0, I] \setminus I_{\text{in}}
\end{split}
\label{eq:iInsideOutside}
\end{equation}
%
For Model 1, the numerator from \autoref{eq:phrasePairPosteriorDefinition} is
obtained in \autoref{eq:phrasePairPosteriorModel1Numerator}:
%
\begin{align}
& \sum_{a_1^J \in A(j_1, j_2; i_1, i_2)} p_{M_1}(f_1^J, a_1^J \mid e_1^I) \nonumber \\
&= \sum_{a_1 \in I_{\text{out}}} ... \sum_{a_{j_1-1} \in I_{\text{out}}} \sum_{a_{j_1} \in I_{\text{in}}} ... \sum_{a_{j_2} \in I_{\text{in}}} \sum_{a_{j_2 + 1} \in I_{\text{out}}} ... \sum_{a_J \in I_{\text{out}}} p_{M_1}(a_1^J, f_1^J \mid e_1^I) \nonumber \\
&= \sum_{a_1 \in I_{\text{out}}} ... \sum_{a_{j_1-1} \in I_{\text{out}}} \sum_{a_{j_1} \in I_{\text{in}}} ... \sum_{a_{j_2} \in I_{\text{in}}} \sum_{a_{j_2 + 1} \in I_{\text{out}}} ... \sum_{a_J \in I_{\text{out}}} \frac{\varepsilon}{(1+I)^J} \prod_{j = 1}^J t(f_j \mid e_{a_j}) \nonumber \\
&= \frac{\varepsilon}{(1+I)^J} \ \left( \prod_{j \in J_{\text{out}}} \sum_{i \in I_{\text{out}}} t(f_j \mid e_i) \right) \ \left( \prod_{j \in J_{\text{in}}} \sum_{i \in I_{\text{in}}} t(f_j \mid e_i) \right)
\label{eq:phrasePairPosteriorModel1Numerator}
\end{align}
%
The denominator from \autoref{eq:phrasePairPosteriorDefinition} has already been
computed in \autoref{eq:linkposM1Denominator}.
Simplifying \autoref{eq:phrasePairPosteriorModel1Numerator} and
\autoref{eq:linkposM1Denominator}, we obtain
\autoref{eq:phrasePairPosteriorModel1}:
%
\begin{align}
& p_{M_1}(A(j_1, j_2; i_1, i_2) \mid e_1^I, f_1^J) \nonumber \\
&= \left( \prod_{j \in J_{\text{out}}} \sum_{i \in I_{\text{out}}} \frac{t(f_j \mid e_i)}{\sum_{i' = 0}^I t(f_{j} \mid e_{i'})} \right) \left( \prod_{j \in J_{\text{in}}} \sum_{i \in I_{\text{in}}} \frac{t(f_j \mid e_i)}{\sum_{i' = 0}^I t(f_{j} \mid e_{i'})} \right) \nonumber \\
&= \left( \prod_{j \in J_{\text{out}}} \sum_{i \in I_{\text{out}}} p_{M_1}(a_j = i \mid f_1^J, e_1^I) \right) \left( \prod_{j \in J_{\text{in}}} \sum_{i \in I_{\text{in}}} p_{M_1}(a_j = i \mid f_1^J, e_1^I) \right)
\label{eq:phrasePairPosteriorModel1}
\end{align}
%
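Since \autoref{eq:phrasePairPosteriorModel1} factorises over source positions,
it can be computed directly from the Model 1 link posteriors. A minimal sketch,
assuming a posterior matrix \texttt{post[j][i]} with 0-based source positions,
target position $0$ reserved for the null word, and inclusive spans (so that
$i_1 \geq 1$):
%
\begin{verbatim}
def model1_phrase_pair_posterior(post, j1, j2, i1, i2):
    """Phrase pair posterior of Eq. (phrasePairPosteriorModel1).
    post[j][i] = Model 1 link posterior p(a_j = i | f, e);
    target index 0 is the null word and belongs to I_out."""
    J, n = len(post), len(post[0])     # n = I + 1 (with the null word)
    I_in = range(i1, i2 + 1)
    I_out = [i for i in range(n) if not (i1 <= i <= i2)]
    score = 1.0
    for j in range(J):
        idx = I_in if j1 <= j <= j2 else I_out
        score *= sum(post[j][i] for i in idx)
    return score
\end{verbatim}
%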
\subsubsection{Phrase Pair Posterior Probability for Model 2}
To avoid repetition, we skip the derivation which is analogous to
the derivation for Model 1. We obtain the phrase pair posterior in
\autoref{eq:phrasePairPosteriorModel2}:
%
\begin{align}
& p_{M_2}(A(j_1, j_2; i_1, i_2) \mid e_1^I, f_1^J) \nonumber \\
&= \left( \prod_{j \in J_{\text{out}}} \sum_{i \in I_{\text{out}}} p_{M_2}(a_j = i \mid f_1^J, e_1^I) \right) \left( \prod_{j \in J_{\text{in}}} \sum_{i \in I_{\text{in}}} p_{M_2}(a_j = i \mid f_1^J, e_1^I) \right)
\label{eq:phrasePairPosteriorModel2}
\end{align}
%
\subsubsection{Phrase Pair Posterior Probability for HMM}
Let us now compute the phrase pair posterior probability for the HMM model. The
denominator from \autoref{eq:phrasePairPosteriorDefinition} can be computed using
the forward algorithm while the numerator can be computed using a modified
forward algorithm~\citep{deng:2005:PHD}. Let us define $\tilde{\alpha}_j(i)$, the
modified forward probability in \autoref{eq:modifiedForwardProbabilityDefinition}:
%
\begin{equation}
\tilde{\alpha}_j(i) = p_{\text{HMM}}(A(j_1,j_2;i_1,i_2), f_1^j, a_j=i \mid e_1^I)
\label{eq:modifiedForwardProbabilityDefinition}
\end{equation}
%
The numerator from \autoref{eq:phrasePairPosteriorDefinition} can be computed
in \autoref{eq:phrasePairPosteriorHMMNumerator}:
%
\begin{equation}
p(A(j_1, j_2; i_1, i_2), f_1^J \mid e_1^I) = \sum_{i=0}^I \tilde{\alpha}_J(i)
\label{eq:phrasePairPosteriorHMMNumerator}
\end{equation}
%
The denominator from \autoref{eq:phrasePairPosteriorDefinition} can be computed
using the regular forward probability in
\autoref{eq:phrasePairPosteriorHMMDenominator}:
%
\begin{equation}
p(f_1^J \mid e_1^I) = \sum_{i=0}^I \alpha_J(i)
\label{eq:phrasePairPosteriorHMMDenominator}
\end{equation}
%
Like the regular forward probability, the modified forward probability can also
be computed recursively. We can also write the modified forward probability as
in \autoref{eq:rewriteModifiedForwardProbability}:
%
\begin{equation}
\begin{split}
\tilde{\alpha}_j(i) &= p_{\text{HMM}}(A(j_1,j_2;i_1,i_2), f_1^j, a_j=i \mid e_1^I) \\
&= \sum_{a_1^{j} \in A(j_1,j_2;i_1,i_2)} p_{\text{HMM}}(a_1^{j-1}, f_1^j, a_j=i \mid e_1^I) \\
&= \sum_{\substack{a_1^{j} \in A(j_1,j_2;i_1,i_2) \\ a_j=i}} p_{\text{HMM}}(a_1^{j-1}, f_1^j, a_j=i \mid e_1^I)
\end{split}
\label{eq:rewriteModifiedForwardProbability}
\end{equation}
%
The computation of $\tilde{\alpha}_j(i)$ is carried out by a constrained forward algorithm where
the constraint is given in \autoref{eq:modifiedForwardConstraints}. This is because
an alignment in $A(j_1,j_2;i_1,i_2)$ cannot have a link from inside the phrase
pair to outside the phrase pair (see \autoref{eq:alignmentConsistentDefinition}):
%
\begin{equation}
\forall (j, i) \in J_{\text{out}} \times I_{\text{in}} \cup J_{\text{in}} \times I_{\text{out}}, \tilde \alpha_j(i) = 0
\label{eq:modifiedForwardConstraints}
\end{equation}
%
For a link $(j, i) \in J_{\text{out}} \times I_{\text{out}} \cup J_{\text{in}} \times I_{\text{in}}$ that satisfies the constraint from
\autoref{eq:modifiedForwardConstraints}, we can derive the modified
forward probability in \autoref{eq:modifiedForwardRecursion}:
%
\begin{align}
\tilde{\alpha}_j(i) &= p_{\text{HMM}}(A(j_1, j_2; i_1, i_2), f_1^j, a_j=i \mid e_1^I) \nonumber \\
&= \sum_{a_1^{j-1} \in A(j_1, j_2; i_1, i_2)}
p_{\text{HMM}}(f_j, f_1^{j-1}, a_j=i, a_1^{j-1} \mid e_1^I) \nonumber \\
&= \sum_{a_1^{j-1} \in A(j_1, j_2; i_1, i_2)}
p_{\text{HMM}}(f_j \mid f_1^{j-1}, a_j=i, a_1^{j-1}, e_1^I) \times \nonumber \\
& \hspace{7.7em} p_{\text{HMM}}(a_j=i \mid f_1^{j-1}, a_1^{j-1}, e_1^I) \times \nonumber \\
& \hspace{7.7em} p_{\text{HMM}}(f_1^{j-1}, a_1^{j-1} \mid e_1^I) \nonumber \\
&= p_{\text{HMM}}( f_j \mid e_i )
\sum_{a_1^{j-1} \in A(j_1, j_2; i_1, i_2)}
p_{\text{HMM}}(a_j=i \mid a_{j-1}, I) \
p_{\text{HMM}}(f_1^{j-1}, a_1^{j-1} \mid e_1^I) \nonumber \\
&= p_{\text{HMM}}(f_j \mid e_i) \
\sum_{k=0}^I \sum_{\substack{a_1^{j-1} \in A(j_1, j_2; i_1, i_2) \\ a_{j-1} = k}}
p_{\text{HMM}}(a_j=i \mid a_{j-1} = k, I) \nonumber \\
& \hspace{15.2em} p_{\text{HMM}}(f_1^{j-1}, a_{j-1} = k, a_1^{j-2} \mid e_1^I) \nonumber \\
&= p_{\text{HMM}}(f_j \mid e_i) \
\sum_{k = 0}^I
p_{\text{HMM}}(a_j = i \mid a_{j-1} = k, I) \nonumber \\
& \hspace{4.7em} \sum_{\substack{a_1^{j-1} \in A(j_1, j_2; i_1, i_2) \\ a_{j-1} = k}}
p_{\text{HMM}}(f_1^{j-1}, a_{j-1} = k, a_1^{j-2} \mid e_1^I) \nonumber \\
&= p_{\text{HMM}}(f_j \mid e_i) \
\sum_{k = 0}^I
p_{\text{HMM}}(a_j=i \mid a_{j-1} = k, I) \; \tilde \alpha_{j-1}(k)
\label{eq:modifiedForwardRecursion}
\end{align}
%
We will use the phrase pair posterior probabilities under the HMM model
in order to define ranking and counting functions.
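A sketch of the constrained forward computation of
\autoref{eq:modifiedForwardRecursion} is given below, under the same assumptions
as the earlier forward-backward sketch (\texttt{em[i][j]} $= p(f_j \mid e_i)$,
\texttt{tr[k][i]} $= p(a_j = i \mid a_{j-1} = k, I)$, state $0$ is the null
word, no numerical scaling). The phrase pair posterior is then the ratio of
\autoref{eq:phrasePairPosteriorHMMNumerator} to
\autoref{eq:phrasePairPosteriorHMMDenominator}.
%
\begin{verbatim}
def hmm_phrase_pair_posterior(em, tr, init, j1, j2, i1, i2):
    """Phrase pair posterior p(A(j1,j2;i1,i2) | f, e) for the HMM.
    Spans are inclusive; source positions are 0-based; state 0
    (the null word) belongs to I_out."""
    n, J = len(em), len(em[0])

    def allowed(j, i):
        # Eq. (modifiedForwardConstraints): a link may not cross
        # the phrase pair boundary.
        return (j1 <= j <= j2) == (i1 <= i <= i2)

    def forward(constrained):
        alpha = [[0.0] * J for _ in range(n)]
        for i in range(n):
            if not constrained or allowed(0, i):
                alpha[i][0] = init[i] * em[i][0]
        for j in range(1, J):
            for i in range(n):
                if constrained and not allowed(j, i):
                    continue   # modified forward: cell forced to zero
                alpha[i][j] = em[i][j] * sum(alpha[k][j - 1] * tr[k][i]
                                             for k in range(n))
        return sum(alpha[i][J - 1] for i in range(n))

    return forward(True) / forward(False)  # numerator / denominator
\end{verbatim}
%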
\subsubsection{Constraints, Ranking and Counting Functions from HMM Link and Phrase Posterior Probabilities}
In order to keep the size of the rule set manageable, we use
the same source constraints, alignment constraints and target constraints
as for link posterior extraction defined in
\autoref{sec:extractionFromPosteriorsLink}.
We use the phrase pair posterior probabilities under the HMM model both for ranking
and scoring
extracted rules. This approach assigns a fractional count to each extracted
rule, which allows finer estimation of the source-to-target and target-to-source
translation models. The ranking and counting functions
are defined in \autoref{eq:rankingCountingPhrasePairExtraction}:
%
\begin{equation}
\mathcal{R}(f_{j_1}^{j_2},e_{i_1}^{i_2}) = \mathcal{C}(f_{j_1}^{j_2},e_{i_1}^{i_2}) = p_{\text{HMM}}(A(j_1, j_2; i_1, i_2) \mid e_1^I, f_1^J)
\label{eq:rankingCountingPhrasePairExtraction}
\end{equation}
Within the framework introduced in
\autoref{sec:extractionFromPosteriorsExtractionGeneralApproach}, we have
described standard rule extraction from Viterbi alignments and two novel
approaches to rule extraction based on link posterior probabilities and
phrase pair posterior probabilities. In order to expose the concepts more
clearly, we have restricted the presentation
to the extraction of phrase-based rules as opposed to hierarchical rules.
We will now show how to generalise the techniques presented so far to
the extraction of hierarchical rules.
\subsection{Hierarchical Rule Extraction}
\label{sec:extractionFromPosteriorsExtractionDisjoint}
In this section, we extend the techniques presented so far to hierarchical
rule extraction. In order to avoid repetition, we describe these
techniques for the rule pattern
$\langle w X w, w X w \rangle$
only (see \autoref{sec:constraintsOnHierarhicalGrammars} for the
definition of patterns). We first rewrite the algorithm in
\autoref{alg:generalRuleXtractSpecialized} into the algorithm in
\autoref{alg:generalRuleXtractSpecializedHierarchical} for this pattern.
%
\begin{figure}
\begin{algorithmic}[1]
\Function{ExtractRules}{$f_1^J, e_1^I, \bm{a}$}
\For{$1 \leq j_1 \leq j_2 < j_3 \leq j_4 \leq J$}
\If{$\lnot \mathcal{C}_S(f_{j_1}^{j_2} X f_{j_3}^{j_4})$} \Comment{Source constraints}
\State{\bf{continue}}
\EndIf
\State{$T \gets \emptyset$} \Comment{Sorted hierarchical target phrases}
\For{$1 \leq i_1 \leq i_2 < i_3 \leq i_4 \leq I$}
\If{$\mathcal{C}_A(f_{j_1}^{j_2} X f_{j_3}^{j_4}, e_{i_1}^{i_2} X e_{i_3}^{i_4}, \bm{a})$} \Comment{Alignment constraints}
\State{$T \gets T \cup e_{i_1}^{i_2} X e_{i_3}^{i_4}$}
\EndIf
\EndFor
\State{\Call{Sort}{$T, \mathcal{R}$}} \Comment{Hierarchical target phrases ranked according to $\mathcal{R}$}
\For{$e_{i_1}^{i_2} X e_{i_3}^{i_4} \in T$}
\If{$\mathcal{C}_T(e_{i_1}^{i_2} X e_{i_3}^{i_4}, T)$} \Comment{Target constraints}
\State{\Call{Extract}{\RT[$X$][$f_{j_1}^{j_2} X f_{j_3}^{j_4}$][$e_{i_1}^{i_2} X e_{i_3}^{i_4}$], $\mathcal{C}(f_{j_1}^{j_2} X f_{j_3}^{j_4}, e_{i_1}^{i_2} X e_{i_3}^{i_4})$}}
\EndIf
\EndFor
\EndFor
\EndFunction
\end{algorithmic}
\caption{General procedure for hierarchical phrase-based rule extraction. The procedure is presented for the pattern $\langle w X w, w X w \rangle$ only. This algorithm is
analogous to the algorithm used to extract phrase-based rules and presented
in \autoref{alg:generalRuleXtractSpecialized}.}
\label{alg:generalRuleXtractSpecializedHierarchical}
\end{figure}
%