-
Notifications
You must be signed in to change notification settings - Fork 115
/
birdcodes-usgs.py
executable file
·98 lines (80 loc) · 3.1 KB
/
birdcodes-usgs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Get bird codes from http://www.pwrc.usgs.gov/BBL/MANUAL/speclist.cfm
# which also points to
# http://www.pwrc.usgs.gov/BBL/xml/Mapped_XML_files/speciestable.xml
# which is smaller, but has less data: no scientific names,
# no capitalization info because it's all caps.
#
# Note: these codes are NOT the codes typically used by US birders.
# I'm leaving the script in the repository in case it's useful as
# an example of live parsing a web page, but it's not useful for
# interpreting actual rare bird reports (some codes may be the
# same, but others are not).
import sys
import re
import urllib2
from bs4 import BeautifulSoup
class BirdCodes:
    """Look up USGS Bird Banding Lab species codes scraped from the web.

    ``bird_dict`` maps each code to ``{'name': ..., 'sci_name': ...}``.
    It starts out empty and is filled in by ``parse_html_table()`` or
    ``parse_xml()``.
    """

    def __init__(self):
        self.bird_dict = {}

    def print_bird(self, code):
        """Print one ``code :: name :: scientific name`` line.

        Raises KeyError if ``code`` is not in ``bird_dict``.
        """
        bird = self.bird_dict[code]
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        print("%s :: %s :: %s" % (code, bird['name'], bird['sci_name']))

    def print_all(self):
        """Print every known bird, one per line, sorted by code."""
        for code in sorted(self.bird_dict):
            self.print_bird(code)

    @staticmethod
    def _fetch(url):
        """Download ``url`` and return the raw response body."""
        page = urllib2.urlopen(url)
        # Close the response even when read() raises.
        try:
            return page.read()
        finally:
            page.close()

    @staticmethod
    def _first_text(cell):
        """Return the first text node inside ``cell``, stripped.

        Raises IndexError if the cell contains no text at all.
        """
        return cell.findAll(text=True)[0].strip()

    @staticmethod
    def _initial_cap(name):
        """Keep the first character of ``name`` and lowercase the rest.

        Slicing (rather than indexing) keeps an empty name from raising.
        """
        return name[:1] + name[1:].lower()

    def parse_html_table(self):
        """Fill ``bird_dict`` from the table rows of the HTML species list.

        Each entry has the form ``code: {'name': ..., 'sci_name': ...}``.
        The parsed document is kept in ``self.soup``. Rows that lack the
        expected cells are reported on stdout and skipped.
        """
        birdpage = self._fetch(
            'http://www.pwrc.usgs.gov/BBL/MANUAL/speclist.cfm')
        # NOTE(review): no parser is named here, so bs4 chooses one itself;
        # confirm whether a specific parser should be pinned.
        self.soup = BeautifulSoup(birdpage)

        for tr in self.soup.findAll('tr'):
            tds = tr.findAll('td')
            if not tds:
                # Probably just the header row. Skip it.
                continue

            # The code, common name and scientific name are read from
            # cells 1, 2 and 5 of the row.
            try:
                code = self._first_text(tds[1])
                name = self._first_text(tds[2])
                sci = self._first_text(tds[5])
            except IndexError:
                # Too few cells, or a cell with no text: report and go on.
                print("Eek! %s" % tr)
                continue

            self.bird_dict[code] = {'name': name, 'sci_name': sci}

    def parse_xml(self):
        """Fill ``bird_dict`` from the XML species table.

        Each entry has the form ``code: {'name': ..., 'sci_name': ''}``.
        The parsed document is kept in ``self.soup``.
        """
        birdxml = self._fetch(
            'http://www.pwrc.usgs.gov/BBL/xml/Mapped_XML_files/speciestable.xml')
        self.soup = BeautifulSoup(birdxml)

        for data in self.soup.findAll('speciesdata'):
            code = data.find('alphacode').get_text()
            name = data.find('commonname').get_text()

            # In the XML file names are all uppercase, so lowercase
            # everything after the first character.
            # The XML file doesn't have scientific names.
            self.bird_dict[code] = {'name': self._initial_cap(name),
                                    'sci_name': ''}
if __name__ == '__main__':
    # Usage: birdcodes-usgs.py [-l] [CODE ...]
    #   -l    parse the HTML species list instead of the XML table
    #   CODE  codes to look up; with none given, every bird is printed
    codes = sys.argv[1:]
    bc = BirdCodes()

    longform = bool(codes) and codes[0] == '-l'
    if longform:
        codes = codes[1:]

    if longform:
        bc.parse_html_table()
    else:
        bc.parse_xml()

    if not codes:
        bc.print_all()

    # print_bird raises KeyError for a code that was not parsed. Report it
    # and keep going so one bad code doesn't abort the remaining lookups.
    unknown = 0
    for code in codes:
        try:
            bc.print_bird(code)
        except KeyError:
            sys.stderr.write("Unknown bird code: %s\n" % code)
            unknown += 1

    if unknown:
        sys.exit(1)