# tag_counter.py (forked from Smerity/cc-mrjob)
import re

from collections import Counter

from mrcc import CCJob


def get_tag_count(data, ctr=None):
    """Extract the names and total usage count of all the opening HTML tags in the document"""
    if ctr is None:
        ctr = Counter()
    # Convert the document to lower case as HTML tags are case insensitive
    ctr.update(HTML_TAG_PATTERN.findall(data.lower()))
    return ctr


# Optimization: compile the regular expression once so it's not recompiled on each call
# The regular expression looks for (1) a tag name made of letters (assumes lowercased input) and digits,
# and (2) optionally a space followed by extra attributes, eventually ended by a closing >
HTML_TAG_PATTERN = re.compile('<([a-z0-9]+)[^>]*>')

# Let's check to make sure the tag counter works as expected
assert get_tag_count('<html><a href="..."></a><h1 /><br/><p><p></p></p>') == {'html': 1, 'a': 1, 'p': 2, 'h1': 1, 'br': 1}
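
# A second, illustrative self-check (not in the original file) for the optional `ctr`
# argument: passing an existing Counter accumulates tag counts across several documents
# instead of creating a fresh Counter per call. The variable names and HTML snippets
# below are made up for illustration.
running_total = Counter()
for _page in ('<p>one</p>', '<p>two</p><br/>'):
    get_tag_count(_page, ctr=running_total)
assert running_total == {'p': 2, 'br': 1}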


class TagCounter(CCJob):
    def process_record(self, record):
        # WARC records have three different types:
        #  ["application/warc-fields", "application/http; msgtype=request", "application/http; msgtype=response"]
        # We're only interested in the HTTP responses
        if record['Content-Type'] == 'application/http; msgtype=response':
            payload = record.payload.read()
            # The HTTP response is defined by a specification: the first part is the headers (metadata),
            # and the body of the response follows after two CRLFs (newlines)
            headers, body = payload.split('\r\n\r\n', 1)
            if 'Content-Type: text/html' in headers:
                # We avoid creating a new Counter for each page as that's actually quite slow
                tag_count = get_tag_count(body)
                for tag, count in tag_count.items():
                    yield tag, count
                self.increment_counter('commoncrawl', 'processed_pages', 1)
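

# A minimal, illustrative self-check (not part of the original job) of the header/body
# split used in process_record above, run on a hand-written HTTP response payload; the
# headers and HTML content here are made up.
_example_payload = ('HTTP/1.1 200 OK\r\n'
                    'Content-Type: text/html; charset=UTF-8\r\n'
                    '\r\n'
                    '<html><body><p>Hello</p></body></html>')
_example_headers, _example_body = _example_payload.split('\r\n\r\n', 1)
assert 'Content-Type: text/html' in _example_headers
assert get_tag_count(_example_body) == {'html': 1, 'body': 1, 'p': 1}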


if __name__ == '__main__':
    TagCounter.run()
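
# To try the job locally one would typically invoke it through mrjob, e.g.
# (standard mrjob flags; the config file name and sample WARC path are assumptions):
#   python tag_counter.py -r local --conf-path mrjob.conf --no-output \
#       --output-dir out input/test-1.warc.gz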