import webdev
import math
import json

# JSON encoder that serializes urlData objects via their attribute dictionary
class UrlDataEncoder(json.JSONEncoder):
    def default(self, o):
        return o.__dict__
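
# For example, encoding a fresh urlData instance (defined below) yields a JSON
# object of its attributes:
#   UrlDataEncoder().encode(urlData())
#   -> '{"name": "", "content": {}, "outgoingLinks": [], "incomingLinks": [],
#       "numWords": 0, "pagerank": -1, "tf": {}, "tfidf": {}, "pos": -1}'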

# class that stores all necessary information for a URL
class urlData:
    def __init__(self):
        self.name = ""            # page title
        self.content = dict()     # word -> raw count within <p> tags
        self.outgoingLinks = []   # URLs this page links to
        self.incomingLinks = []   # URLs that link to this page
        self.numWords = 0         # total number of words on the page
        self.pagerank = -1        # filled in by calculatePageRank()
        self.tf = dict()          # word -> term frequency
        self.tfidf = dict()       # word -> tf-idf weight
        self.pos = -1             # index of this URL in urlList

# module-level crawl state
inverseDF = dict()      # word -> document frequency, later word -> idf
urlQueue = []           # BFS queue of URLs still to visit
urlUsed = set()         # URLs already discovered
allUrlData = dict()     # url -> urlData object
urlList = []            # URLs in the order they were crawled
pageRankResult = []     # PageRank score per position in urlList

def crawl(seed):
    # clear the data files left over from a previous crawl
    open("urlData.txt", "w").close()
    open("inverseDf.txt", "w").close()
    # add the seed as the first URL
    urlQueue.append(seed)
    urlUsed.add(seed)
    pos = 0
    while len(urlQueue) > 0:
        currUrl = urlQueue.pop(0)
        contents = webdev.read_url(currUrl)
        # check whether the current URL has already been initialized as an
        # object (a stub may exist from recording incoming links below)
        if currUrl in allUrlData:
            currUrlData = allUrlData[currUrl]
        else:
            currUrlData = urlData()
        currUrlData.pos = pos
        urlList.append(currUrl)
        pos += 1
        # extract the page title of the current URL
        titleIndex = contents.index("<title>")
        titleEnd = contents.index("</title>")
        currUrlData.name = contents[titleIndex + 7 : titleEnd]
        # collect every word between <p> and </p> tags into a count dictionary
        words = dict()
        startP = [i for i in range(len(contents)) if contents.startswith("<p>", i)]
        endP = [i for i in range(len(contents)) if contents.startswith("</p>", i)]
        for start, end in zip(startP, endP):
            wordList = contents[start + 3 : end].split()  # skip the 3-character "<p>" tag
            for word in wordList:
                if word not in words:
                    words[word] = 1
                else:
                    words[word] += 1
            # record the number of words in the document for later use
            currUrlData.numWords += len(wordList)
        currUrlData.content = words
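        # e.g. "<p>apple pear apple</p>" yields {"apple": 2, "pear": 1}
        # and adds 3 to numWords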
        # extract all links from the current URL, queue unseen ones, and
        # record the outgoing and incoming edges of the link graph
        outgoingLinks = []
        delUrlIndex = currUrl.rindex("/")
        blankUrl = currUrl[:delUrlIndex]
        instances = [i for i in range(len(contents)) if contents.startswith("href=", i)]
        for index in instances:
            url = ""
            i = index + 6  # skip past 'href="'
            while contents[i] != '"':
                url += contents[i]
                i += 1
            # resolve relative links (e.g. "./N-1.html") against the current directory
            if url.startswith("."):
                url = blankUrl + url[1:]
            if url not in urlUsed:
                urlQueue.append(url)
                urlUsed.add(url)
            # record the incoming edge on every occurrence (not only when the
            # target is first discovered) so the link graph stays complete
            if currUrl != url:
                if url in allUrlData:
                    incomingUrl = allUrlData[url]
                else:
                    incomingUrl = urlData()
                incomingUrl.incomingLinks.append(currUrl)
                allUrlData[url] = incomingUrl
            # record the outgoing edge
            outgoingLinks.append(url)
        currUrlData.outgoingLinks = outgoingLinks
        allUrlData[currUrl] = currUrlData
    # after crawling through all documents, calculate tf, idf, tf-idf, and PageRank
    calculateTf()
    calculateIdf()
    calculateTfidf()
    calculatePageRank()
    # store allUrlData and inverseDF as JSON files
    with open("urlData.txt", "w") as f:
        f.write(UrlDataEncoder().encode(allUrlData))
    with open("inverseDf.txt", "w") as f:
        f.write(UrlDataEncoder().encode(inverseDF))
    return len(urlUsed)

# calculates the term frequency of each word in each document and, along the
# way, counts how many documents each word appears in (needed for the idf);
# takes no arguments and stores its results in the module-level state
def calculateTf():
    for url in urlUsed:
        currUrl = allUrlData[url]
        content = currUrl.content
        for word in content:
            currUrl.tf[word] = content[word] / currUrl.numWords
            # count how many documents the word appears in
            if word not in inverseDF:
                inverseDF[word] = 1
            else:
                inverseDF[word] += 1
        allUrlData[url] = currUrl
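# e.g. a word occurring 5 times in a 100-word document has tf = 5/100 = 0.05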

# calculates the inverse document frequency for each word:
# idf(w) = log2(N / (1 + df(w))), where N is the number of documents
def calculateIdf():
    for word in inverseDF:
        inverseDF[word] = math.log(len(urlUsed) / (1 + inverseDF[word]), 2)
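# e.g. with 16 documents and a word appearing in 3 of them:
# idf = log2(16 / (1 + 3)) = log2(4) = 2.0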

# calculates the tf-idf weight of each word in each document:
# tfidf(w, d) = log2(1 + tf(w, d)) * idf(w)
def calculateTfidf():
    for url in urlUsed:
        currUrl = allUrlData[url]
        for word in currUrl.tf:
            currUrl.tfidf[word] = math.log(1 + currUrl.tf[word], 2) * inverseDF[word]
        allUrlData[url] = currUrl
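
# Illustrative sketch (a hypothetical helper, not part of the crawler itself):
# rank the crawled documents for a one-word query by their stored tf-idf
# weight, highest first.
def scoreQuery(word):
    scores = []
    for url in urlList:
        scores.append((allUrlData[url].tfidf.get(word, 0), url))
    scores.sort(reverse=True)
    return scores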

# calculates PageRank with teleportation probability alpha = 0.1 and stores
# the converged score in each urlData object
def calculatePageRank():
    # build the probability matrix:
    # matrix[i][j] = (1 - alpha) * link(i, j) / outdegree(i) + alpha / n
    n = len(urlList)
    matrix = [[0 for _ in range(n)] for _ in range(n)]
    for url in urlList:
        currUrl = allUrlData[url]
        for outgoing in currUrl.outgoingLinks:
            outUrl = allUrlData[outgoing]
            matrix[currUrl.pos][outUrl.pos] = 1
        # count distinct targets, since duplicate links set the same entry
        numOnes = len(set(currUrl.outgoingLinks))
        for j in range(n):
            if numOnes > 0:
                matrix[currUrl.pos][j] = matrix[currUrl.pos][j] / numOnes
            else:
                # dangling page: treat it as linking to every page uniformly
                matrix[currUrl.pos][j] = 1 / n
            matrix[currUrl.pos][j] = matrix[currUrl.pos][j] * (1 - 0.1)
            matrix[currUrl.pos][j] = matrix[currUrl.pos][j] + 0.1 / n
    # power iteration: start from (1, 0, ..., 0) and repeatedly multiply by
    # the matrix until consecutive vectors are within Euclidean distance 0.0001
    t0 = [[0 for _ in range(n)]]
    t0[0][0] = 1
    dis = 1
    while dis > 0.0001:
        result = [[0 for _ in range(n)]]
        for j in range(n):
            for k in range(n):
                result[0][j] += t0[0][k] * matrix[k][j]
        total = 0
        for i in range(n):
            total += pow(result[0][i] - t0[0][i], 2)
        dis = math.sqrt(total)
        t0 = result
    # record the converged scores in crawl order
    for i in range(n):
        pageRankResult.append(result[0][i])
    for url in urlList:
        currUrl = allUrlData[url]
        currUrl.pagerank = pageRankResult[currUrl.pos]
        allUrlData[url] = currUrl
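
# Sanity check (illustrative): the probability matrix is row-stochastic, so
# the converged scores form a distribution and sum(pageRankResult) should be
# approximately 1.0.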

if __name__ == "__main__":
    crawl("http://people.scs.carleton.ca/~davidmckenney/tinyfruits/N-0.html")
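    # Illustrative sketch (assumes the files written by crawl() above): the
    # stored JSON can be reloaded as plain dictionaries for later queries.
    with open("urlData.txt") as f:
        data = json.load(f)  # maps each URL to a dict of its urlData attributes
    seed = "http://people.scs.carleton.ca/~davidmckenney/tinyfruits/N-0.html"
    print(data[seed]["name"], data[seed]["pagerank"])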