-
Notifications
You must be signed in to change notification settings - Fork 20
/
onlyvalidlatin.go
80 lines (68 loc) · 1.52 KB
/
onlyvalidlatin.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package main
/*
This was used to strip non-Latin and invalid characters from the datasets used
for benchmarking. There's no point testing the performance of the pretrained
vocabs on those characters because they were specifically excluded from the
vocabularies during training with -only-latin & -only-valid.
Note that this doesn't mean the pretrained vocabs can't tokenize these characters.
They can still tokenize them with single byte tokens. The point is that it'd be silly
to benchmark a vocabulary that was specifically trained for English, against a
dataset containing Chinese.
*/
import (
"os"
"unicode/utf8"
"unicode"
"io/ioutil"
"fmt"
)
const (
// runeError is the Unicode replacement character U+FFFD. It has the same
// value as utf8.RuneError, which utf8.DecodeRune returns when the bytes it
// is given are not valid UTF-8.
runeError = '\uFFFD'
)
// stripNonLatin removes from b every byte sequence that is not valid UTF-8,
// every U+FFFD replacement character, and every letter that does not belong
// to the Latin script. Non-letter runes (digits, punctuation, whitespace,
// symbols, emoji, ...) are kept regardless of script.
//
// The filtering is done in place: the returned slice shares b's backing
// array, so the contents of b must not be relied upon afterwards.
func stripNonLatin(b []byte) []byte {
	// The write position never overtakes the read position, so appending to
	// a zero-length view of b is safe and never reallocates.
	out := b[:0]
	for len(b) > 0 {
		r, n := utf8.DecodeRune(b)
		// DecodeRune yields (RuneError, 1) for an invalid byte and
		// (RuneError, 3) for a correctly encoded U+FFFD; both are dropped.
		if r != utf8.RuneError && (!unicode.IsLetter(r) || unicode.Is(unicode.Latin, r)) {
			out = append(out, b[:n]...)
		}
		b = b[n:]
	}
	return out
}

// main reads the file named by the first argument, strips invalid UTF-8 and
// non-Latin letters from it, and writes the result to the file named by the
// second argument. Any failure is reported on stderr and the process exits
// with status 1 so that calling scripts can detect it.
func main() {
	if len(os.Args) < 3 {
		fmt.Fprintln(os.Stderr, "Usage: ./onlyvalidlatin input.txt output.txt")
		os.Exit(1)
	}

	data, err := ioutil.ReadFile(os.Args[1])
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error reading file: %s\n", err)
		os.Exit(1)
	}

	if err := ioutil.WriteFile(os.Args[2], stripNonLatin(data), 0644); err != nil {
		fmt.Fprintf(os.Stderr, "Error writing file: %s\n", err)
		os.Exit(1)
	}

	fmt.Println(`Done`, os.Args[2])
}