-
Notifications
You must be signed in to change notification settings - Fork 3
/
scraping_wikidata.py
135 lines (113 loc) · 3.98 KB
/
scraping_wikidata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import requests
import pandas as pd
import sqlite3
header = {
'yukun tan': 'wikidata scraping'
}
# query from wikidata, followed the tutorial
# REMEMBER TO FIX THE "COUNTRY" ISSUE (MISSING REACTORS)
# WAIT IT SEEMS THAT IT'S FIXED....? CHECK LATER
query_reactors = '''
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX v: <http://www.wikidata.org/prop/statement/>
PREFIX q: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?reactors ?reactorsLabel ?coord ?countryLabel
WHERE {
?reactors wdt:P31 wd:Q80877 .
?reactors wdt:P625 ?coord .
?reactors wdt:P17 ?country .
?reactors rdfs:label ?reactorsLabel filter (lang(?reactorsLabel) = "en")
?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en")
}
'''
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data_reactors = requests.get(url, headers=header,
params={'query': query_reactors,
'format': 'json'}).json()
query_plants = '''
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX v: <http://www.wikidata.org/prop/statement/>
PREFIX q: <http://www.wikidata.org/prop/qualifier/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?reactors ?reactorsLabel ?coord ?countryLabel
WHERE {
?reactors wdt:P31 wd:Q134447 .
?reactors wdt:P625 ?coord .
?reactors wdt:P17 ?country .
?reactors rdfs:label ?reactorsLabel filter (lang(?reactorsLabel) = "en")
?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en")
}
'''
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data_plants = requests.get(url, headers=header,
params={'query': query_plants,
'format': 'json'}).json()
def set_up_df(data):
re = []
for item in data['results']['bindings']:
re.append({
'Country': item['countryLabel']['value'],
'Name': item['reactorsLabel']['value'],
'Coord': item['coord']['value']
})
df = pd.DataFrame(re)
return df
def extractCoord(df):
"""this function takes in a pandas df col
it substrings every element of the col 'Coord'
from the 6th to 2nd last character
"""
dfcol = df['Coord']
for i in range(len(dfcol)):
dfcol[i] = dfcol[i][6:-1]
return df
def seperateCoord(dfcol):
"""this function takes in a pandas df col with type string obj
it seperates thte string with delimiter whitespace
hold the 2 seperated strings using lists Long and Lat
"""
lon = []
lat = []
for i in range(len(dfcol)):
dfcol[i] = dfcol[i].split()
lon.append(dfcol[i][0])
lat.append(dfcol[i][1])
return lon, lat
def into_sql(df):
sqlite_file = 'coordinates.sqlite'
conn = sqlite3.connect(sqlite_file)
c = conn.cursor()
c.execute('DROP TABLE IF EXISTS reactors_coordinates;')
# THIS IS NOT WORKING "probably unsupported type"
sql = '''
CREATE TABLE reactors_coordinates(
'Country' TEXT, 'Name' TEXT, 'Long' REAL, 'Lat' REAL
)
'''
c.execute(sql)
df.to_sql(name='reactors_coordinates',
con=conn,
if_exists='append',
index=False)
conn.close()
return
def set_up_and_modify_df(data):
df = set_up_df(data)
df = extractCoord(df)
lon, lat = seperateCoord(df['Coord'])
df.drop('Coord', axis=1, inplace=True)
# typecast from string to float
df['Long'] = [float(i) for i in lon]
df['Lat'] = [float(i) for i in lat]
return df
df_reactors = set_up_and_modify_df(data_reactors)
df_plants = set_up_and_modify_df(data_plants)
df_whole = df_reactors.append(df_plants, ignore_index=True)
into_sql(df_whole)