-
Notifications
You must be signed in to change notification settings - Fork 2
/
senateParse.go
120 lines (100 loc) · 2.77 KB
/
senateParse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
package main
//http://play.golang.org/p/kaZrQ2HJas
import (
"code.google.com/p/go-charset/charset"
_ "code.google.com/p/go-charset/data"
"encoding/xml"
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
)
// SenateRegistrant holds the values decoded from the <Registrant> element of a
// Senate filing.
type SenateRegistrant struct {
	// Each field is populated from the XML attribute that shares its name.
	RegistrantName, RegistrantID, RegistrantCountry string `xml:",attr"`
}
// SenateClient holds the values decoded from the <Client> element of a Senate
// filing. Every field is populated from the XML attribute that shares its name.
type SenateClient struct {
	ClientName, ClientID, ContactFullname string `xml:",attr"`
	// IsStateOrLocalGov is parsed from the attribute text as a boolean.
	IsStateOrLocalGov bool   `xml:",attr"`
	ClientCountry     string `xml:",attr"`
}
// SenateLobbyist holds the values decoded from one <Lobbyist> element of a
// Senate filing.
type SenateLobbyist struct {
	// LobbyistName is read from the XML attribute of the same name.
	LobbyistName string `xml:",attr"`
	// FirstName and LastName carry no tag, so encoding/xml fills them from
	// child elements named after the fields rather than from attributes.
	FirstName, LastName string
}
// SenateFiling is the decoded form of a single <Filing> element: its
// identifying attributes plus the nested client, registrant and lobbyists.
type SenateFiling struct {
	// Read from attributes of the <Filing> element itself.
	ID     string `xml:",attr"`
	Year   string `xml:",attr"`
	Type   string `xml:",attr"`
	Period string `xml:",attr"`

	// Read from child elements. The Client tag previously lacked its opening
	// quote, which made the whole tag unparseable by reflect.StructTag; decoding
	// only worked because encoding/xml falls back to the field name.
	Client     SenateClient     `xml:"Client"`
	Registrant SenateRegistrant `xml:"Registrant"`
	// One entry per <Lobbyist> nested inside the <Lobbyists> wrapper element.
	Lobbyists []SenateLobbyist `xml:"Lobbyists>Lobbyist"`
}
// SenateFile is the top-level decode target for one Senate XML document. It
// collects every <Filing> element found directly under the document root.
type SenateFile struct {
	Filings []SenateFiling `xml:"Filing"`
}
// convertEncoding runs input through a charset reader created for the "utf16"
// character set and returns everything that reader yields.
// NOTE(review): presumably the output is UTF-8, since the caller rewrites the
// XML declaration from UTF-16 to UTF-8 afterwards; confirm against go-charset.
//
// Any failure, either creating the reader or draining it, terminates the
// process via log.Fatal, so the function never returns on error.
func convertEncoding(input []byte) []byte {
	src := strings.NewReader(string(input))

	decoder, err := charset.NewReader("utf16", src)
	if err != nil {
		log.Fatal(err)
	}

	decoded, err := ioutil.ReadAll(decoder)
	if err != nil {
		log.Fatal(err)
	}
	return decoded
}
// parseSenateFilings decodes the XML files found directly in savePath, hands
// every filing they contain to combineSingleFiling together with
// combinedFilings, and finally deletes savePath.
//
// A directory entry is processed when its extension contains "xml". Each such
// file is passed through convertEncoding, has the text "UTF-16" replaced with
// "UTF-8", and is then unmarshalled into a SenateFile. Files that cannot be
// read or parsed are reported on stdout and skipped; a failure to list
// savePath panics.
//
// mutex is held around each combineSingleFiling call. wg.Done is called
// exactly once, when the function returns.
func parseSenateFilings(savePath string, combinedFilings *[]GenericFiling, mutex *sync.Mutex, wg *sync.WaitGroup) {
	// Deferred so the WaitGroup is released on every exit path.
	defer wg.Done()

	beginParseTime := time.Now()
	files, err := ioutil.ReadDir(savePath)
	if err != nil {
		panic(err)
	}
	fmt.Println("Reading " + strconv.Itoa(len(files)) + " files from " + savePath + "...")

	a := 0 // number of filings handed to combineSingleFiling so far
	for _, f := range files {
		if !strings.Contains(filepath.Ext(f.Name()), "xml") {
			continue
		}

		// filepath.Join inserts the separator when needed, so savePath works
		// with or without a trailing slash.
		data, err := ioutil.ReadFile(filepath.Join(savePath, f.Name()))
		if err != nil {
			fmt.Println("error reading", f.Name(), err)
			continue
		}

		data = convertEncoding(data)
		// Rewrite the declared encoding so it agrees with the converted bytes;
		// encoding/xml rejects non-UTF-8 declarations without a CharsetReader.
		data = []byte(strings.Replace(string(data), "UTF-16", "UTF-8", -1))

		oneFile := SenateFile{}
		if err := xml.Unmarshal(data, &oneFile); err != nil {
			fmt.Println(f.Name(), err)
			continue
		}

		for _, t := range oneFile.Filings {
			mutex.Lock()
			combineSingleFiling(t, combinedFilings)
			mutex.Unlock()
			a++
			if a%10000 == 0 {
				fmt.Println(strconv.Itoa(a), "Senate filings read")
			}
		}
	}
	fmt.Println("Successfully read ", a, "Senate filings from", len(files), " files in", time.Since(beginParseTime).String())

	fmt.Println("Removing record directory " + savePath + "...")
	if err := os.RemoveAll(savePath); err != nil {
		// Report the failure and do not claim the directory was removed.
		fmt.Println(err)
		return
	}
	fmt.Println("Removed record directory " + savePath)
}