-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
105 lines (71 loc) · 2.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import concurrent.futures
import os
import threading
import time
from urllib.parse import quote_plus, urljoin

import docx
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.dml import MSO_THEME_COLOR_INDEX
from docx.shared import Pt
def add_hyperlink(paragraph, text, url):
    """Append an external hyperlink displaying *text* to *paragraph*.

    The ``w:hyperlink`` element is assembled by hand and attached through a
    run that is newly added to the paragraph.

    Args:
        paragraph: Object exposing ``.part`` and ``.add_run()`` (the script
            passes the result of ``document.add_heading``).
        text: The visible link text.
        url: The link target; it is registered as an external relationship.

    Returns:
        The newly created ``w:hyperlink`` XML element.
    """
    oxml = docx.oxml.shared

    # Register the URL with the owning part (document.xml.rels); the returned
    # relationship id is what the hyperlink element points at.
    relationship_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    # <w:hyperlink r:id="...">
    link_element = oxml.OxmlElement('w:hyperlink')
    link_element.set(oxml.qn('r:id'), relationship_id)

    # Inner <w:r>: an empty <w:rPr> child plus the text to display.
    inner_run = oxml.OxmlElement('w:r')
    inner_run.append(oxml.OxmlElement('w:rPr'))
    inner_run.text = text
    link_element.append(inner_run)

    # Host the hyperlink element inside a fresh run of the paragraph.
    host_run = paragraph.add_run()
    host_run._r.append(link_element)

    # Workaround for the lack of a hyperlink style (the link does not turn
    # purple after being visited). Remove these two lines when using a
    # template that already defines the hyperlink style.
    host_run.font.color.theme_color = MSO_THEME_COLOR_INDEX.HYPERLINK
    host_run.font.underline = True

    return link_element
# ---------------------------------------------------------------------------
# Script body: search hbr.org for a user-supplied phrase and save every free
# article (title as a hyperlink heading, followed by its summary) to a .docx
# file named after the search phrase.
# ---------------------------------------------------------------------------
input_string = input('Search?.... ')

# quote_plus turns spaces into '+' exactly like a manual '+'.join(...) would,
# and additionally escapes characters such as '&', '#' or '%' that would
# otherwise corrupt the query string.
search_query = 'https://hbr.org/search?N=0&Ntt=' + quote_plus(input_string)

document = Document()
document.add_heading(input_string, 0)

# First request: only used to read how many results the search produced.
# A timeout keeps the script from hanging forever on a stalled connection.
page = requests.get(search_query, timeout=60)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# The result count is read from the first text node of the second <h3> on the
# page. NOTE(review): this depends on the current HBR page layout - confirm
# if the site markup changes.
headings = soup.find_all('h3')
if len(headings) < 2:
    raise SystemExit('Could not find the result count on the search page.')
count_text = next(headings[1].strings, '')
try:
    # The count may contain thousands separators and spaces, e.g. "1,234".
    number = int(count_text.replace(',', '').replace(' ', ''))
except ValueError:
    raise SystemExit(
        'Could not read the result count from: ' + repr(count_text)
    ) from None

# Convert the result count into a page count (presumably 10 results per
# page) and cap it at 100 pages for now.
number = number // 10
print(str(number) + ' Pages has found.')
if number > 100:
    number = 100
    print('Results of 100 pages will be saved.')

# Second request: ask for all pages at once through the 'loaded' parameter.
search_query = search_query + '&loaded=' + str(number)
page = requests.get(search_query, timeout=60)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Result items are the <stream-item> tags of the second <stream-list> inside
# the first <search-stream>.
temp = soup.find_all('search-stream')[0].find_all('stream-list')[1].find_all('stream-item')

counter = 0
for item in temp:
    # Only items with a list price of '0' are kept; items lacking the
    # attribute are skipped instead of raising KeyError.
    if item.get('data-list-price') != '0':
        continue

    anchor = item.find('a')
    if anchor is None or not anchor.get('href'):
        # Without a link there is nothing useful to save for this item.
        continue

    # urljoin copes with relative, root-relative and absolute hrefs, avoiding
    # the doubled slash that plain concatenation yields for '/path' hrefs.
    link = urljoin('https://hbr.org/', anchor['href'])
    title = item.get('data-title', '')
    summary = item.get('data-summary', '')

    counter += 1
    h = document.add_heading('', level=1)
    add_hyperlink(h, title, link)
    document.add_paragraph(summary)

print('\n\n' + str(counter) + ' Free articles has been saved.' + '\n')

# Save to the current user's Desktop; fall back to the working directory
# when no Desktop folder exists.
output_dir = os.path.join(os.path.expanduser('~'), 'Desktop')
if not os.path.isdir(output_dir):
    output_dir = os.getcwd()

# Path separators in the search phrase would otherwise be interpreted as
# directories when building the file name.
safe_name = input_string.replace('/', '_').replace('\\', '_')
document.save(os.path.join(output_dir, safe_name + '.docx'))