amazon_interview.py
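# amazon_interview.py -- a small Python 2 crawler that walks
# www.geeksforgeeks.org, collects the crawled URLs that mention 'amazon'
# (by the file name, presumably Amazon interview pages), and saves each
# matching page as a PDF through the pdfcrowd API.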
import httplib2
import pdfcrowd
from bs4 import BeautifulSoup, SoupStrainer

http = httplib2.Http()
s = 'http://www.geeksforgeeks.org/'   # seed URL
to_crawl = []   # frontier of URLs still to visit
crawled = []    # URLs already visited
i = 0           # counter used to number the generated PDF files

# Fetch the seed page and remember that it has been visited.
to_crawl.append(s)
status, response = http.request(s)
crawled.append(s)

# Queue every link on the seed page that stays on geeksforgeeks.org,
# skipping the forums and anything already crawled.
for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        li = link['href']
        # print li
        if li.find('http://www.geeksforgeeks.org/') == 0 and li not in crawled and li.find('forums') < 0:
            to_crawl.append(li)
# print to_crawl
print len(to_crawl)
count = 0

def get_page(page):
    # Download a page and return its raw HTML.
    import urllib2
    source = urllib2.urlopen(page)
    return source.read()

def save_as_pdf(s):
    # Convert the page at URL s into a PDF named amazon<i>.pdf via pdfcrowd.
    global i
    try:
        client = pdfcrowd.Client("mkap1234", "fc5ada9fbd1c55f46822d6e9e985a9bb")
        output_file = open('amazon' + str(i) + '.pdf', 'wb')
        i = i + 1
        html = get_page(s)
        client.convertHtml(html, output_file)
        output_file.close()
    except pdfcrowd.Error, why:
        print 'Failed:', why

# Main crawl loop: pop URLs from the frontier, record each visited page,
# and queue any new on-site links it contains.
while len(to_crawl):
    b = to_crawl.pop()
    if b.find('http://www.geeksforgeeks.org/') == 0 and b not in crawled and b.find('forums') < 0:
        count = count + 1
        print count
        crawled.append(b)
        status, response = http.request(b)
        for link in BeautifulSoup(response, parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                li = link['href']
                # Only follow links that stay on geeksforgeeks.org and are new.
                if li.find('http://www.geeksforgeeks.org/') == 0 and li not in crawled:
                    to_crawl.append(li)

# Keep the crawled URLs that mention 'amazon', dropping anchors, tag pages
# and forum threads, then save each remaining page as a PDF.
amazon = []
for st in crawled:
    if st.find('amazon') >= 0 and st.find('#') < 0 and st.find('tag') < 0 and st.find('forum') < 0:
        print st
        amazon.append(st)
print "Finished"
print len(amazon)

for page in amazon:
    save_as_pdf(page)