# opus1m+bt-2021-05-01.yml
# Release metadata: archive path, release date, dataset name and model type.
release: sem-eng/opus1m+bt-2021-05-01.zip
# NOTE(review): unquoted ISO date; YAML 1.1 loaders may resolve this to a
# date object instead of a string — confirm what the consumer expects.
release-date: 2021-05-01
dataset-name: opus1m+bt
modeltype: transformer-align
# Vocabulary files for the source and target side (the same file is listed
# for both). The entries are nested under `vocabulary` so they do not become
# top-level `source`/`target` keys.
vocabulary:
  source: opus1m+bt.spm32k-spm32k.vocab.yml
  target: opus1m+bt.spm32k-spm32k.vocab.yml
# Pre-processing description and the subword scheme named for each side.
pre-processing: normalization + SentencePiece (spm32k,spm32k)
# Nested under `subwords` so `source`/`target` do not collide with the
# same-named keys of other stanzas.
subwords:
  source: spm32k
  target: spm32k
# Subword model file names for the source and target side, nested under
# `subword-models` so they are not read as top-level keys.
subword-models:
  source: source.spm
  target: target.spm
# Source-side languages, listed as three-letter codes.
source-languages:
  - acm
  - afb
  - amh
  - apc
  - ara
  - arq
  - ary
  - arz
  - heb
  - jpa
  - mlt
  - oar
  - phn
  - tir
  - tmr
# Target-side languages, listed as three-letter codes.
target-languages:
  - eng
# Training corpora per language pair. Each value lists corpus names, each
# followed by a number in parentheses (presumably the corpus size — confirm
# the unit against the generating tool).
# Entries are nested under `training-data`; at top level they would leave
# `training-data` itself empty.
training-data:
  amh-eng: Tatoeba-train (991645)
  ara-eng: Tatoeba-train (1000000) wikibooks.aa.eng-ara (991794) wikinews.aa.eng-ara (457195) wikipedia.aa.eng-ara (982536) wikipedia.ab.eng-ara (982647) wikipedia.ac.eng-ara (982746) wikipedia.ad.eng-ara (982432) wikiquote.aa.eng-ara (996916)
  arq-eng: Tatoeba-train (115)
  arz-eng: Tatoeba-train (3)
  heb-eng: Tatoeba-train (1000000) wikibooks.aa (991072) wikinews.aa (457147) wikipedia.aa (981820) wikipedia.ab (981988) wikipedia.ac (981995) wikipedia.ad (981835) wikiquote.aa (997096)
  mlt-eng: Tatoeba-train (1000000) wikibooks.aa (991872) wikinews.aa (456863) wikipedia.aa (981034) wikipedia.ab (981120) wikipedia.ac (981201) wikipedia.ad (980973) wikiquote.aa (996263)
  tir-eng: Tatoeba-train (359451)
# Validation data per language pair: data set name followed by a count.
# Entries are nested under `validation-data` instead of sitting at top level.
# NOTE(review): `total-size-shuffled` and `devset-selected` are placed under
# `validation-data` because they describe the dev set; the original nesting
# is not recoverable from this copy — confirm against the upstream file.
validation-data:
  acm-eng: Tatoeba-dev, 8
  afb-eng: Tatoeba-dev, 45
  amh-eng: Tatoeba-dev, 994
  apc-eng: Tatoeba-dev, 10
  ara-eng: Tatoeba-dev, 18138
  arq-eng: Tatoeba-dev, 734
  ary-eng: Tatoeba-dev, 36
  arz-eng: Tatoeba-dev, 377
  eng-heb: Tatoeba-dev, 153364
  eng-mlt: Tatoeba-dev, 1000
  eng-tir: Tatoeba-dev, 1000
  total-size-shuffled: 6190
  devset-selected: top 5000 lines of Tatoeba-dev.src.shuffled
# Test sets and their sizes, written as two slash-separated counts
# (presumably sentences/words — confirm against the generating tool).
# Entries are nested under `test-data`: the same test-set names are reused
# as keys in the score stanzas, so at top level they would be duplicates.
test-data:
  Tatoeba-test.acm-eng: 3/24
  Tatoeba-test.afb-eng: 36/175
  Tatoeba-test.amh-eng: 190/1001
  Tatoeba-test.apc-eng: 5/29
  Tatoeba-test.ara-eng: 10000/73964
  Tatoeba-test.arq-eng: 403/3058
  Tatoeba-test.ary-eng: 18/98
  Tatoeba-test.arz-eng: 181/1178
  Tatoeba-test.heb-eng: 10000/73559
  Tatoeba-test.jpa-eng: 4/38
  Tatoeba-test.mlt-eng: 203/1165
  Tatoeba-test.oar-eng: 6/71
  Tatoeba-test.oar_Hebr-eng: 3/39
  Tatoeba-test.oar_Syrc-eng: 3/32
  Tatoeba-test.multi-eng: 10000/73229
  Tatoeba-test.phn-eng: 5/51
  Tatoeba-test.tir-eng: 69/503
  Tatoeba-test.tmr-eng: 19/147
  tico19-test.amh-eng: 2100/56848
  tico19-test.ara-eng: 2100/56347
  tico19-test.en-ti_ER.tir-eng: 2100/56848
  tico19-test.en-ti_ET.tir-eng: 2100/56848
  tico19-test.tir-eng: 2100/56848
# BLEU score per test set.
# Entries are nested under `BLEU-scores`: the same test-set names are reused
# as keys in other stanzas, so at top level they would be duplicates.
BLEU-scores:
  Tatoeba-test.acm-eng: 16.0
  Tatoeba-test.afb-eng: 34.0
  Tatoeba-test.amh-eng: 0.2
  Tatoeba-test.apc-eng: 14.3
  Tatoeba-test.ara-eng: 37.5
  Tatoeba-test.arq-eng: 7.2
  Tatoeba-test.ary-eng: 34.8
  Tatoeba-test.arz-eng: 11.1
  Tatoeba-test.heb-eng: 43.0
  Tatoeba-test.jpa-eng: 3.0
  Tatoeba-test.mlt-eng: 30.9
  Tatoeba-test.oar-eng: 0.8
  Tatoeba-test.oar_Hebr-eng: 1.0
  Tatoeba-test.oar_Syrc-eng: 1.5
  Tatoeba-test.multi-eng: 39.7
  Tatoeba-test.phn-eng: 1.1
  Tatoeba-test.tir-eng: 0.3
  Tatoeba-test.tmr-eng: 2.1
  tico19-test.amh-eng: 1.7
  tico19-test.ara-eng: 26.7
  tico19-test.en-ti_ER.tir-eng: 2.1
  tico19-test.en-ti_ET.tir-eng: 2.4
  tico19-test.tir-eng: 2.3
# chr-F score per test set.
# Entries are nested under `chr-F-scores`: the same test-set names are reused
# as keys in other stanzas, so at top level they would be duplicates.
chr-F-scores:
  Tatoeba-test.acm-eng: 0.507
  Tatoeba-test.afb-eng: 0.517
  Tatoeba-test.amh-eng: 0.103
  Tatoeba-test.apc-eng: 0.346
  Tatoeba-test.ara-eng: 0.554
  Tatoeba-test.arq-eng: 0.231
  Tatoeba-test.ary-eng: 0.429
  Tatoeba-test.arz-eng: 0.304
  Tatoeba-test.heb-eng: 0.597
  Tatoeba-test.jpa-eng: 0.185
  Tatoeba-test.mlt-eng: 0.475
  Tatoeba-test.oar-eng: 0.089
  Tatoeba-test.oar_Hebr-eng: 0.085
  Tatoeba-test.oar_Syrc-eng: 0.094
  Tatoeba-test.multi-eng: 0.570
  Tatoeba-test.phn-eng: 0.069
  Tatoeba-test.tir-eng: 0.127
  Tatoeba-test.tmr-eng: 0.139
  tico19-test.amh-eng: 0.180
  tico19-test.ara-eng: 0.548
  tico19-test.en-ti_ER.tir-eng: 0.191
  tico19-test.en-ti_ET.tir-eng: 0.194
  tico19-test.tir-eng: 0.188