-
Notifications
You must be signed in to change notification settings - Fork 12
/
typo.go
470 lines (417 loc) · 11.9 KB
/
typo.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Typo is a modern version of the original Unix typo command, a scan of whose man page
// is at
//
// https://github.com/robpike/typo/blob/master/typo.png
//
// and whose early C source code is in the unix subdirectory of this repo.
//
// This Go version ignores nroff but handles Unicode and can strip simple HTML tags.
// It provides location information for each typo, including the byte number on the line.
// It also identifies repeated words, a a typographical error that occurs often.
//
// The -r flag suppresses reporting repeated words.
// The -n and -t flags control how many "typos" to print.'
// The -html flag enables simple filtering of HTML from the input.
//
// See the comments in the source for a description of the algorithm, extracted
// from Bell Labs CSTR 18 by Robert Morris and Lorinda L. Cherry.
package main // import "robpike.io/cmd/typo"
// A test case, with With punctuation. zxyz. Make sure the oddball on this line scores highly. (It does.)
// Also make sure that "with" shows up doubled despite the case change.
import (
"bufio"
"bytes"
_ "embed"
"flag"
"fmt"
"io"
"math"
"os"
"sort"
"strings"
"unicode"
"unicode/utf8"
)
//go:embed w2006.txt
var wordsFile []byte
var (
nTypos = flag.Int("n", 50, "maximum number of words to print")
noRepeats = flag.Bool("r", false, "don't show repeated words words")
threshold = flag.Int("t", 10, "cutoff threshold; smaller means more words")
filterHTML = flag.Bool("html", false, "filter HTML tags from input")
)
func init() {
if unicode.IsPunct('<') {
// This must be true for HTML filtering to work.
fmt.Fprintf(os.Stderr, "typo: unicode says < is punctuation")
*filterHTML = false
}
}
func main() {
flag.Parse()
for _, w := range read("<wordsFile>", bytes.NewReader(wordsFile), bufio.ScanWords) {
known[w] = true
}
if len(flag.Args()) == 0 {
add("<stdin>", os.Stdin)
}
for _, f := range flag.Args() {
add(f, nil)
}
repeats()
stats()
spell()
}
type Word struct {
text string // The original.
lower *string // The word in lower case; may point to original.
file string
lineNum int
byteNum int
score int
}
func (w Word) String() string {
if w.score == 0 {
return fmt.Sprintf("%s:%d:%d %s", w.file, w.lineNum, w.byteNum, w.text)
} else {
return fmt.Sprintf("%s:%d:%d [%d] %s", w.file, w.lineNum, w.byteNum, w.score, w.text)
}
}
// Sort interfaces for []*Word
type ByScore []*Word
func (t ByScore) Len() int {
return len(t)
}
func (t ByScore) Less(i, j int) bool {
w1 := t[i]
w2 := t[j]
return w1.score > w2.score // Sort down
}
func (t ByScore) Swap(i, j int) {
t[i], t[j] = t[j], t[i]
}
type ByWord []*Word
func (t ByWord) Len() int {
return len(t)
}
func (t ByWord) Less(i, j int) bool {
w1 := t[i]
w2 := t[j]
return w1.text < w2.text
}
func (t ByWord) Swap(i, j int) {
t[i], t[j] = t[j], t[i]
}
var words = make([]*Word, 0, 1000)
var known = make(map[string]bool) // loaded from knownWordsFiles
func read(file string, r io.Reader, split bufio.SplitFunc) []string {
if r == nil {
f, err := os.Open(file)
if err != nil {
fmt.Fprintf(os.Stdout, "typo: %s\n", err)
os.Exit(2)
}
defer f.Close()
r = f
}
scanner := bufio.NewScanner(r)
scanner.Split(split)
words := make([]string, 0, 1000)
for scanner.Scan() {
words = append(words, scanner.Text())
}
if err := scanner.Err(); err != nil {
fmt.Fprintf(os.Stdout, "typo: reading %s: %s\n", file, err)
os.Exit(2)
}
return words
}
func add(file string, r io.Reader) {
lines := read(file, r, bufio.ScanLines)
for lineNum, line := range lines {
inWord := false
wordStart := 1
for byteNum, c := range line {
switch {
case inWord && unicode.IsSpace(c):
addWord(line[wordStart:byteNum], file, lineNum+1, wordStart+1)
inWord = false
case !inWord && !unicode.IsSpace(c):
inWord = true
wordStart = byteNum
}
}
if inWord {
addWord(line[wordStart:], file, lineNum+1, wordStart+1)
}
}
}
// leadingHTMLLen returns the length of all HTML tags at the start of the text.
// It assumes a tag goes from an initial "<" to the first closing ">".
func leadingHTMLLen(text string) int {
if len(text) == 0 || text[0] != '<' {
return 0
}
n := strings.IndexRune(text, '>')
if n < 0 {
return 0
}
n++ // Absorb closing '>'.
return n + leadingHTMLLen(text[n:])
}
// trailingHTMLLen mirrors leadingHTMLLen at the end of the text.
func trailingHTMLLen(text string) int {
if len(text) == 0 || text[len(text)-1] != '>' {
return 0
}
// TODO: There should be a LastIndexRune.
n := strings.LastIndex(text, "<")
if n < 0 {
return 0
}
return len(text) - n + trailingHTMLLen(text[:len(text)-n])
}
func addWord(text, file string, lineNum, byteNum int) {
// Note: '<' is not punctuation according to Unicode.
n := len(text)
text = strings.TrimLeftFunc(text, unicode.IsPunct)
byteNum += n - len(text)
text = strings.TrimRightFunc(text, unicode.IsPunct)
if *filterHTML {
// Easily defeated by spaces and newlines, but gets things like <code><em>foo</em></code>.
n := leadingHTMLLen(text)
text = text[n:]
byteNum += n
n = trailingHTMLLen(text)
text = text[:len(text)-n]
if len(text) == 0 {
return
}
n = len(text)
text = strings.TrimLeftFunc(text, unicode.IsPunct)
byteNum += n - len(text)
text = strings.TrimRightFunc(text, unicode.IsPunct)
}
// There must be a letter.
hasLetter := false
for _, c := range text {
if unicode.IsLetter(c) {
hasLetter = true
break
}
}
if !hasLetter {
return
}
word := &Word{
text: text,
file: file,
lineNum: lineNum,
byteNum: byteNum,
}
if onlyLower(text) {
word.lower = &word.text
} else {
x := strings.ToLower(text)
word.lower = &x
}
words = append(words, word)
}
func onlyLower(s string) bool {
for _, c := range s {
if !unicode.IsLower(c) {
return false
}
}
return true
}
func repeats() {
if *noRepeats {
return
}
prev := ""
for _, word := range words {
w := *word.lower
if w == prev {
fmt.Printf("%s repeats\n", word)
}
prev = w
}
}
type digram [2]rune
type trigram [3]rune
var diCounts = make(map[digram]int)
var triCounts = make(map[trigram]int)
func stats() {
// Compute global digram and trigram counts.
for _, word := range words {
addDigrams(word.text)
scanTrigrams(word.text, incTrigrams)
}
// Compute the score for the word.
for _, word := range words {
if known[*word.lower] {
continue
}
word.score = int(score(word.text))
}
}
func scanTrigrams(word string, fn func(t trigram)) {
// For "once", we have ".on", "onc", "nce", "ce."
// Do the first one by hand to prime the pump.
rune, wid := utf8.DecodeRuneInString(word)
t := trigram{'.', '.', rune}
for _, r := range word[wid:] {
t[0] = t[1]
t[1] = t[2]
t[2] = r
fn(t)
}
// At this point, we have "nce"; make "ce.".
// If there was only one letter, "a", we have "..a" and this tail will give us ".a.", which is what we want.
// Final marker
t[0] = t[1]
t[1] = t[2]
t[2] = '.'
fn(t)
}
func addDigrams(word string) {
d := digram{'.', '.'}
// For "once", we have ".o", "on", "nc", "ce", "e."
for _, r := range word {
d[0] = d[1]
d[1] = r
diCounts[d]++
}
// Final marker
d[0] = d[1]
d[1] = '.'
diCounts[d]++
}
func incTrigrams(t trigram) {
triCounts[t]++
}
func triScore(t trigram) float64 {
nxy := float64(diCounts[digram{t[0], t[1]}] - 1)
nyz := float64(diCounts[digram{t[1], t[2]}] - 1)
nxyz := float64(triCounts[t] - 1)
// The paper says to use -10 for log(0), but its square is 100, so that can't be right.
if nxy == 0 || nyz == 0 || nxyz == 0 {
return 0
}
logNxy := math.Log(nxy)
logNyz := math.Log(nyz)
logNxyz := math.Log(nxyz)
return 0.5*(logNxy+logNyz) - logNxyz
}
func score(word string) float64 {
sumOfSquares := 0.0
n := 0
fn := func(t trigram) {
i := triScore(t)
sumOfSquares += i * i
n++
}
scanTrigrams(word, fn)
s := 10 / math.Sqrt(sumOfSquares/float64(n))
if math.IsInf(s, 0) {
s = 100.
}
return s
}
func spell() {
// Uniq the list: show each word only once; also drop known words.
sort.Sort(ByWord(words))
out := words[0:0]
prev := " "
for _, word := range words {
if word.text == prev {
continue
}
if known[*word.lower] {
continue
}
out = append(out, word)
prev = word.text
}
words = out
// Sort the words by unlikelihood and print the top few.
sort.Sort(ByScore(words))
for i, w := range words {
if i >= *nTypos {
break
}
if w.score < *threshold {
break
}
fmt.Println(words[i])
}
}
/*
Thanks to Doug McIlroy for digging this out for me:
From CSTR 18, Robert Morris and Lorinda L. Cherry, "Computer
detection of typographical errors", July 1974.
... a count is kept of each kind of digram and trigram in the
document. For example, if the word `once' appears in the text,
the counts corresponding to each of these five digrams and four
trigrams are incremented.
.o on nc ce e.
.on onc nce ce.
where the `.' signifies the beginning or ending of a word. These
statistics are retained for later processing.
... 2726 common technical English words as used at Murray Hill
... are removed. [The list was created] by processing about one
million words of technical text appearing in documents produced on
the UNIX time-sharing system at Murray Hill. The words selected were
those which have the highest probability of appearing in a given
document. This is not the same as a list of the most frequently
occurring words and, in fact, some very rare words occur in the
table. For example, `murray' and `abstract' are very uncommon words
in the documents sampled, yet they both appear once in virtually
every document.
... a number is attached to each word by consulting the table of
digrams and trigrams previously produced.
The number is an index of peculiarity and reflects the likelihood
of the hypothesis that the trigrams in the given word were produced
from the same source that produced the trigram table.
Each trigram in the current word is treated separately. An initial
and terminal trigram is treated for each word so that the number
of trigrams in a word becomes equal to the number of characters
in the word. For each trigram T = (xyz) in the current word,
the digram and trigram counts from the prepared table are accessed.
Each such count is reduced by one to remove the effect of the
current word on the statistics. The resulting counts are n(xy),
n(yz), and n(xyz). The index i(T) for the trigram T is set equal to
i(T) = (1/2)[log n(xy) + log n(yz)] - log n(xyz)
This index is invariant with the size of the document, i.e. it needs
no normalization. It measures the evidence against the hypothesis
that the trigram was produced by the same source that produced the
digram and trigram tables in the sense of References 1 and 2. In the
case that one of the digram or trigram counts is zero, the log of
that count is taken to be -10.
A variety of methods were tried to combine the trigram indices to
obtain an index for the whole word, including
- the average of trigram indices,
- the square root of the average of the squares of the trigram
indices, and
- the largest of the trigram indices.
All were moderately effective; the first tended to swamp out the
effect of a really strange trigram in a long word and the latter
was insensitive to a sequence of strange trigrams whose cumulative
evidence should have made a word suspcious. The second on trial
appeared to be a satisfactory compromise and was adopted.
The words with their attached indices are sorted on the index in
descending order and printed in a pleasing three-column format. [The
printed integer index values are] normalized in such a way that
a word with an index greater than 10 contains trigrams that are not
representative of the remainder of the document. Such a word almost
certainly appears just once in a document.
...
[1] Good, I. j. Probability and the Weighing of Evidence. Charles
Griffin & Co., London, 1950.
[2] Kullback, S. and Leibler, R. A. On Information and Sufficiency.
Annals of Math. Stat., 22 (1951), pp. 79-86
*/