forked from teamdandelion/RoboBuffett
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
181 lines (132 loc) · 4.95 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from ftplib import FTP
import os
import sys
import zipfile
import re
import argparse
import threading
import Queue
def connect_to_SEC(index):
    """Open an FTP connection to the SEC EDGAR server, retrying on failure.

    index: number of attempts already made; callers start with 0.
    Returns a connected ftplib.FTP instance, or None once more than 50
    attempts have failed -- the caller must check for None before using it.
    """
    if index > 50:
        print("Maximum number of attempts exceeded. Try again later.")
        return None  # explicit: previously fell through, returning None silently
    try:
        return FTP('ftp.sec.gov')
    except EOFError:
        # NOTE(review): only EOFError is retried; a socket-level refusal
        # would still propagate -- confirm that is the intended behavior.
        print("Connection refused on attempt {0}. Trying again...".format(index))
        return connect_to_SEC(index + 1)
def download_file(serverpath, local_path):
    """Fetch `serverpath` from the shared global `ftp` connection and save
    it to `local_path`.

    The `ftp` handle is the module-level connection created in __main__.
    """
    global ftp
    # 'wb', not 'w': retrbinary delivers raw bytes, and text mode would
    # corrupt the zip payload on platforms that translate line endings.
    with open(local_path, 'wb') as out_file:
        command = 'RETR ' + serverpath.strip()
        ftp.retrbinary(command, out_file.write)
def ensure(dir):
    """Create directory `dir` (including parents) when it does not exist."""
    if os.path.exists(dir):
        return
    os.makedirs(dir)
def extract_and_remove(zip_path, out_dir):
    """Unpack every entry of the archive at `zip_path` into `out_dir`,
    then delete the archive itself."""
    archive = zipfile.ZipFile(zip_path, 'r')
    try:
        archive.extractall(out_dir)
    finally:
        archive.close()
    os.remove(zip_path)
def download_index_files(out_dir):
    """Download and unpack the quarterly EDGAR form.zip indices into `out_dir`.

    Mirrors the server's year/quarter layout (1993-2012, QTR1-QTR4) under
    `out_dir`. Relies on the global `ftp` connection already being cwd'd to
    /edgar/full-index (done in __main__).
    """
    years = [str(y) for y in range(1993, 2013)]
    quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
    # Remember the caller's working directory so it can be restored even
    # when a download or extraction fails partway through.
    old_cwd = os.getcwd()
    ensure(out_dir)
    os.chdir(out_dir)
    try:
        for year in years:
            for quarter in quarters:
                # Literal '/' (not os.path.join): the same string doubles
                # as the FTP server path, which always uses '/'.
                subdir = year + '/' + quarter
                ensure(subdir)
                path = subdir + '/form.zip'
                # Server path and local path coincide because the remote
                # directory layout is mirrored locally.
                download_file(path, path)
                extract_and_remove(path, subdir)
    finally:
        # Previously skipped on error, stranding the process in out_dir.
        os.chdir(old_cwd)
def split_list(xs, y, eq_func=lambda a, b: a == b):
    """Split `xs` around the first element matching `y` under `eq_func`.

    Returns [prefix, suffix] with the matching element removed, or the
    one-element list [xs] when nothing matches.
    """
    index = 0
    for candidate in xs:
        if eq_func(candidate, y):
            return [xs[:index], xs[index + 1:]]
        index += 1
    return [xs]
def paths_for_10ks(index_file):
    """Extract (server_path, local_filename) pairs for 10-K/10-Q filings.

    `index_file` is an open EDGAR form index (.idx) file: a header block,
    a dashed separator line, then one row per filing with columns
    (form type, company, CIK, date filed, file name) separated by 2+
    whitespace characters. Returns [] when no separator line is present.
    """
    separator = re.compile(r'-+$')   # dashed line that ends the header block
    columns = re.compile(r'\s\s+')   # 2+ whitespace between index columns
    paths = []
    lines = index_file.read().splitlines()
    # Skip everything up to and including the separator. Previously a file
    # without one raised IndexError; now it simply yields no paths.
    body = None
    for i, line in enumerate(lines):
        if separator.match(line):
            body = lines[i + 1:]
            break
    if body is None:
        return paths
    for line in body:
        # '10-K' also catches variants such as 10-K405 via the prefix test.
        if line[:4] == '10-K' or line[:4] == '10-Q':
            fields = columns.split(line)
            company, date, server_path = fields[1], fields[3], fields[4]
            # '/' is a path separator, so strip it from the name components.
            local = '{0}_{1}_{2}'.format(company.replace('/', '-'), date,
                                         fields[0].replace('/', '-'))
            paths.append((server_path, local))
    return paths
def download_forms_serially(paths):
    """Fetch each (server_path, local_path) pair one at a time over the
    shared global `ftp` connection, logging failures without aborting
    the rest of the batch."""
    global ftp
    for server_path, local_path in paths:
        try:
            out_file = open(local_path, 'w')
            try:
                ftp.retrlines('RETR ' + server_path, out_file.write)
            finally:
                out_file.close()
            print("Saved: {0}".format(local_path))
        except Exception as e:
            # Best-effort: report and move on to the next filing.
            print(e)
            print('Download failed on file at: {0}'.format(server_path))
def download_10ks(data_directory):
    """Walk `data_directory`, parse every EDGAR .idx index file found, and
    download the 10-K/10-Q filings each one lists into that file's
    directory."""
    for root, dirs, files in os.walk(data_directory):
        for name in files:
            path = os.path.join(root, name)
            if path.split('.')[-1] != 'idx':
                continue  # only the quarterly .idx index files matter
            index_file = open(path, 'r')
            try:
                pairs = paths_for_10ks(index_file)
            finally:
                index_file.close()
            form_paths = [(remote, os.path.join(root, local))
                          for remote, local in pairs]
            download_forms_serially(form_paths)
# A class to facilitate multithreaded downloading of data over FTP
class FTPThread(threading.Thread):
    """A class to download data over FTP in parallel threads.

    Each instance downloads exactly one file (server_path -> local_path)
    over the shared global `ftp` connection when started."""

    def __init__(self, server_path, local_path):
        threading.Thread.__init__(self)
        self.server_path = server_path
        self.local_path = local_path

    def run(self):
        global ftp
        try:
            with open(self.local_path, 'w') as out_file:
                ftp.retrlines('RETR ' + self.server_path, out_file.write)
            print("Saved: {0}".format(self.local_path))
        except Exception as e:
            # Best-effort: report the failure; other threads keep going.
            print(e)
            print('Download failed on file at: {0}'.format(self.server_path))
def download_forms(paths, max_threads):
    # Download every (server_path, local_path) pair using FTPThread workers,
    # throttled by a bounded Queue: once roughly `max_threads` started
    # threads are in flight, q.put blocks the producer until the consumer
    # joins one. The consumer joins threads in the order they were queued.
    finished = []
    def producer(q, paths):
        # Start one FTPThread per file; q.put(thread, True) blocks while
        # the queue is full, which is what caps concurrency.
        for server_path, local_path in paths:
            thread = FTPThread(server_path, local_path)
            thread.start()
            q.put(thread, True)
    def consumer(q, total_files):
        # Join queued threads until every file has been accounted for.
        # `finished` is read and appended only from this one thread.
        while len(finished) < total_files:
            thread = q.get(True)
            thread.join()
            finished.append(thread)
    q = Queue.Queue(max_threads)
    prod_thread = threading.Thread(target=producer, args=(q, paths))
    cons_thread = threading.Thread(target=consumer, args=(q, len(paths)))
    prod_thread.start()
    cons_thread.start()
    # Wait for both coordinator threads (and therefore all downloads).
    prod_thread.join()
    cons_thread.join()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download either index files (i) or form files (f) to a given directory.')
    parser.add_argument('mode', type=str, choices=['i', 'f'])
    parser.add_argument('directory', type=str)
    args = parser.parse_args()

    # One shared, anonymous-login connection is used by every helper above.
    ftp = connect_to_SEC(0)
    ftp.login()

    if args.mode == 'i':
        # Index mode: mirror the quarterly full-index archives locally.
        index_path = '/edgar/full-index'
        ftp.cwd(index_path)
        download_index_files(args.directory)
    else:
        # Form mode: walk previously downloaded .idx files and fetch filings.
        download_10ks(args.directory)