-
Notifications
You must be signed in to change notification settings - Fork 0
/
wikipedia_origins.py
40 lines (31 loc) · 1.04 KB
/
wikipedia_origins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import urllib
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
def _infobox_value(soup, header, cell_tag):
    """Return the text of the first ``cell_tag`` in the row headed ``header``.

    Finds the ``<th>`` element whose text is exactly ``header``, then the
    first ``cell_tag`` element inside that header's parent element.

    Returns the string "Not Available" when either lookup finds nothing.
    """
    try:
        return soup.find("th", text=header).parent.find(cell_tag).get_text()
    except AttributeError:
        # find() returns None on a miss, so the chained attribute access
        # raises AttributeError; that is the only "not present" signal here.
        return "Not Available"


def get_location(article):
    """Look up the "Born" and "Origin" entries of an English Wikipedia article.

    Args:
        article: Article title; it is URL-quoted before being appended to
            the ``/wiki/`` URL.

    Returns:
        ``"<born> | <origin>"``, where each part is "Not Available" when the
        corresponding table header is missing from the page, or the single
        string "Artist not Found" when the page could not be fetched or
        parsed.
    """
    article = urllib.quote(article)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]  # wikipedia needs this
    try:
        resource = opener.open("http://en.wikipedia.org/wiki/" + article)
        try:
            data = resource.read()
        finally:
            # Release the connection even when read() fails part-way.
            resource.close()
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(data, "html.parser")
    except Exception:
        # Best-effort lookup: any fetch/parse failure is reported as a miss.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be interrupted.
        return "Artist not Found"

    # "Born" takes the first link in the row; "Origin" takes the whole cell.
    born = _infobox_value(soup, "Born", "a")
    origin = _infobox_value(soup, "Origin", "td")
    return born + ' | ' + origin
if __name__ == "__main__":
    # Print "<name> | <born> | <origin>" for every row of artists.csv, using
    # the value in the first column as the Wikipedia article title.
    df = pd.read_csv('artists.csv')
    # Read the first column directly instead of transposing the whole frame
    # and iterating its columns; DataFrame.iteritems() no longer exists in
    # pandas >= 2.0.
    for artist in df.iloc[:, 0]:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(artist + ' | ' + get_location(artist))