-
Notifications
You must be signed in to change notification settings - Fork 1
/
wikitestparse.py
36 lines (31 loc) · 1016 Bytes
/
wikitestparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import urllib2
import re
from bs4 import BeautifulSoup
# Request headers passed to urllib2.Request in grab_links; the User-Agent
# value is a Firefox 2 on Linux browser string.
hdrs = { 'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11" }
def grab_links(url):
    """Fetch *url* and print the '[[...]]' references found in its text.

    The page is requested with the module-level ``hdrs`` headers, parsed
    with BeautifulSoup, reduced to plain text and handed to ``find_ref``.

    Args:
        url: Address of the page to download.

    Returns:
        The list of reference strings that was printed.

    Errors raised by ``urllib2.urlopen`` or by reading the response are not
    caught here and propagate to the caller.
    """
    req = urllib2.Request(url, headers=hdrs)
    openedpage = urllib2.urlopen(req)
    try:
        html = openedpage.read()
    finally:
        # Release the connection even when the read fails.
        openedpage.close()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed alongside bs4.
    soup = BeautifulSoup(html, "html.parser")
    refs = find_ref(soup.get_text())

    # The parenthesised form prints the same thing under Python 2 and is
    # also valid syntax under Python 3.
    print(refs)
    return refs
# Matches one '[[...]]' reference whose body contains no square brackets, so
# for nested markup only the innermost reference is captured.
_REF_PATTERN = re.compile(r'\[\[([^\[\]]*)\]\]')


def find_ref(text):
    """Return every reference written as '[[refexample]]' in *text*.

    Args:
        text: Body of text to scan.

    Returns:
        A list with the contents of each '[[...]]' pair (brackets removed),
        in order of appearance. Text without any complete reference yields
        an empty list. When references are nested, only the innermost one is
        returned; an opening '[[' with no matching ']]' is ignored.
    """
    # A single pattern keeps each opening marker paired with its own closing
    # marker, instead of pairing independent lists of start and end offsets.
    return _REF_PATTERN.findall(text)
def form_link(page):
    """Build the en.wikipedia.org API URL that returns the content of *page*.

    Args:
        page: Title of the page to query.

    Returns:
        The query URL as a string. The title is percent-encoded so that
        characters such as spaces, '&' or '#' cannot break the query string;
        titles made only of URL-safe characters are inserted unchanged.
    """
    # Local import keeps the block self-contained and works on both the
    # Python 2 and Python 3 standard-library layouts.
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    baselinka = 'http://en.wikipedia.org/w/api.php?format=xml&action=query&titles='
    baselinkb = '&prop=revisions&rvprop=content'
    return baselinka + quote(page) + baselinkb
# Script entry: build the API URL for the 'Mug' page, fetch it and print the
# references found. NOTE(review): this call is not guarded by
# `if __name__ == "__main__":`, so it also runs when the module is imported.
grab_links(form_link('Mug'))