-
Notifications
You must be signed in to change notification settings - Fork 5
/
transforms.go
64 lines (53 loc) · 1.15 KB
/
transforms.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package topics
import (
"bufio"
"fmt"
"os"
"regexp"
"strings"
)
type (
	// Transformation rewrites a single word. It returns the replacement
	// word and a keep flag; the filters in this file return ("", false)
	// to signal that the word should be dropped.
	Transformation func(word string) (new string, keep bool)

	// Transformations is a list of Transformation values, e.g. stemming,
	// lemmatization, etc.
	Transformations []Transformation

	// Processor bundles a set of transformations.
	// NOTE(review): how the transformations are applied is not visible in
	// this part of the file.
	Processor struct {
		transformations Transformations
	}
)
// GetStopwordFilter builds a Transformation that drops every word listed in
// the file at path. The file is read line by line and each line is used
// verbatim as one stop word (no trimming or case folding is applied).
//
// If the file cannot be opened, or cannot be read to the end, the error is
// printed to standard output and nil is returned, so callers must check the
// result for nil before using it.
func GetStopwordFilter(path string) Transformation {
	f, err := os.Open(path)
	if err != nil {
		fmt.Printf("%+v", err)
		return nil
	}
	// Register Close only after the error check so it is never invoked on a
	// file that failed to open.
	defer f.Close()

	stopWords := make(map[string]bool)
	s := bufio.NewScanner(f)
	for s.Scan() {
		stopWords[s.Text()] = true
	}
	// Scan returns false on read errors as well as at EOF; refuse to build a
	// filter from a partially read list.
	if err := s.Err(); err != nil {
		fmt.Printf("%+v", err)
		return nil
	}

	// The closure captures stopWords directly; the map is never modified
	// after this point.
	return func(w string) (string, bool) {
		if stopWords[w] {
			return "", false
		}
		return w, true
	}
}
// ToLower is a Transformation that lower-cases the word via strings.ToLower.
// It never drops a word: keep is always true.
func ToLower(w string) (string, bool) {
	lowered := strings.ToLower(w)
	return lowered, true
}
// wordReg matches one run of ASCII letters, apostrophes and the letters
// å, ä, ö in either case.
var wordReg = regexp.MustCompile("[a-zA-Z'åäöÅÄÖ]+")

// Sanitize is a Transformation that reduces w to the first (leftmost) run of
// characters accepted by wordReg, discarding everything around it. The
// result is the empty string when w contains no such run; keep is true in
// every case, so this transformation never drops a word on its own.
func Sanitize(w string) (string, bool) {
	match := wordReg.FindString(w)
	match = strings.TrimSpace(match)
	return match, true
}
// MinLen is a Transformation that drops words shorter than two characters.
// It returns ("", false) for such words and (w, true) otherwise.
func MinLen(w string) (string, bool) {
	// Count runes, not bytes: letters such as å, ä and ö occupy two bytes
	// in UTF-8, so a byte length would let single-letter words through.
	if len([]rune(w)) < 2 {
		return "", false
	}
	return w, true
}
// RemoveTwitterUsernames is a Transformation that drops any word containing
// an "@" anywhere in it, returning ("", false). Words without an "@" are
// passed through unchanged with keep set to true.
func RemoveTwitterUsernames(w string) (string, bool) {
	if !strings.Contains(w, "@") {
		return w, true
	}
	return "", false
}