Skip to content

Commit

Permalink
First commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
sepastian committed Jun 5, 2020
0 parents commit 5e53fe4
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
**NOTE: this is a preview.**

`warc2corpus` extracts text corpora from WARCs, according to a user-defined specification.
6 changes: 6 additions & 0 deletions bin/pyspark
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Wrapper around the Spark pyspark shell that puts warc2corpus' lib/
# directory on PYTHONPATH and loads the aut (Archives Unleashed Toolkit)
# python bindings and fat jar.

set -euo pipefail

# Quote "$0": an unquoted $0 breaks word-splitting if this script lives
# under a path containing spaces. ${PYTHONPATH:-} keeps `set -u` happy
# when PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:-}:$(dirname "$0")/../lib/"

# exec replaces this wrapper process with pyspark instead of forking.
exec /opt/spark/bin/pyspark --py-files /opt/aut/target/aut.zip --jars /opt/aut/target/aut-0.70.1-SNAPSHOT-fatjar.jar
2 changes: 2 additions & 0 deletions lib/warc2corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
import warc2corpus
import warc2corpus.extractors
118 changes: 118 additions & 0 deletions lib/warc2corpus/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
import sys
import re
import pkgutil
import importlib
import traceback
from jsonschema import validate, ValidationError
import bs4
import datetime
import json

# Create a list of extractors from module in `pwd`.
# Each extractor is a dictionary, containing the keys:
#
# * netloc_regex
# * path_regex
# * extracts
#
# Both `netloc_regex` and `path_regex` are regular expressions,
# used for selecting an extractor based on host and path.
# After selecting an extractor, all elements of the list `extracts`
# are applied to the HTML of that page; each elements contains the keys
#
# * `name`
# * `css_path`
# * `f`
#
# The result of applying an extract is stored under the key `name`;
# elements are selected using the CSS path found at `css_path`;
# the actual value is obtained applying `f` to the elements extracted by
# CSS path.
# JSON schema that every extractor definition must satisfy; it is
# enforced by validate() in the module-discovery loop below. Only the
# 'meta' sub-object is constrained here — 'extracts', 'netloc_regex'
# and 'path_regex' are not schema-checked.
schema = {
    "type": "object",
    "properties": {
        "meta": {
            "type": "object",
            "properties": {
                # Human-readable extractor name; optional (not in 'required').
                "name": { "type": "string" },
                # Who publishes the pages this extractor targets.
                "issuer": {
                    "type": "string",
                    "enum": ["political_party", "politician", "media_site"]
                },
                "platform": {
                    "type": "string",
                    "enum": ["website","social_media"]
                },
                # Free-form lowercase layout tag (e.g. 'a' in extractors/test.py).
                "layout": {
                    "type": "string",
                    "pattern": "[a-z]+"
                },
                "type": {
                    "type": "string",
                    "enum": ["article","page"]
                }
            },
            "required": [
                "issuer", "platform", "layout", "type"
            ]
        },
    },
}
# Discover every extractor module that sits next to this __init__.py,
# validate each of its extractor definitions against `schema`, and
# merge them all into one module-level `extractors` list.
extractors = []
for (_, name, _) in pkgutil.iter_modules([os.path.dirname(__file__)]):
    # Relative import of the sibling module (e.g. warc2corpus.extractors.test).
    m = importlib.import_module('.' + name, package='warc2corpus.extractors')
    # Augment meta info with name of extractor module.
    for e in m.extractors:
        try:
            validate(instance=e,schema=schema)
        except ValidationError as err:
            # Re-raise with the offending module's name, chaining the
            # original validation error for the details.
            raise ValidationError("Error validating JSON schema for extractor {}.".format(name)) from err
        e['meta']['extractor'] = 'extractors.{}'.format(name)
        # Let 'f' default to lambda s: True'.
        # NOTE(review): this sets 'f' on the extractor dict itself, but
        # apply() only reads 'f' from the elements of 'extracts' — confirm
        # whether this extractor-level default is ever consumed.
        e['f'] = e.get('f',lambda s: True)
        # Let 'query_regex' default to '.*' (match any query string).
        e['query_regex'] = e.get('query_regex',re.compile('.*'))
    extractors += m.extractors

def apply(content, extractors):
    """
    Apply each extractor in `extractors` to `content` (an HTML string).

    For every extractor, each element of its `extracts` list is run
    against the parsed document: elements are selected via `css_path`,
    then either the extract's callable `f` is applied to the selection,
    or the stripped text of all matched elements is joined with spaces.

    Returns a JSON string encoding a list of {'meta': ..., 'data': ...}
    dicts, one per extractor. `meta.valid` holds the result of the
    extractor's `validator` callable (default: all values truthy),
    `meta.created_at` the extraction timestamp.
    """
    soup = bs4.BeautifulSoup(content, 'lxml')
    result = []
    for extractor in extractors:
        # Copy the meta dict: the original code mutated the shared
        # extractor definition, so 'created_at' (and the injected default
        # validator) leaked into the module-level extractors across calls.
        meta = dict(extractor.get('meta', {}))
        meta['created_at'] = datetime.datetime.now().isoformat()
        data = {}
        for e in extractor['extracts']:
            n = e.get('name', 'n/a')
            c = e.get('css_path')
            f = e.get('f')
            if not c:
                # An extract without a CSS path cannot select anything.
                continue
            text = soup.select(c)
            if text and f:
                text = f(text)
            elif text:
                # If no lambda has been given, join all text rows.
                text = ' '.join(m.get_text().strip() for m in text)
            data[n] = {
                'css_path': c,
                # soup.select returns [] on no match; store None instead.
                'value': (text or None)
            }
        # Check whether the extracted data is valid and store the result
        # under 'meta.valid'. The extractor may supply a callable under
        # 'validator'; the default accepts only all-truthy values
        # (empty strings and None fail). Kept as a local so the
        # extractor dict itself is never modified.
        validator = extractor.get(
            'validator',
            lambda pairs: all(bool(v) for _, v in pairs)
        )
        pairs = [(k, v['value']) for k, v in data.items()]
        meta['valid'] = validator(pairs)
        result.append({'meta': meta, 'data': data})
    return json.dumps(result)
33 changes: 33 additions & 0 deletions lib/warc2corpus/extractors/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import re
import dateparser as dp
import datetime

# Extractor specification for zeit.de news articles.
# Selected when a page's netloc matches `netloc_regex` and its path
# matches `path_regex`; each entry in `extracts` pulls one field.
extractors = [
    {
        'meta': {
            'name': 'Zeit, Mitteilung',
            'issuer': 'media_site',
            'platform': 'website',
            'type': 'article',
            'layout': 'a'
        },
        # Raw strings: '\.' in a non-raw string is an invalid escape
        # sequence (DeprecationWarning since Python 3.6).
        'netloc_regex': re.compile(r'www\.zeit\.de'),
        'path_regex': re.compile(r'^/news/.+$'),
        'extracts': [
            {
                # Page title; drop everything after the first '|'.
                'name': 'title',
                'css_path': 'title',
                'f': lambda m: m[0].get_text().strip().split('|')[0]
            },
            {
                # Article body paragraphs and list items; no 'f', so the
                # caller joins the matched elements' text with spaces.
                'name': 'body',
                'css_path': 'div[class~="article-body"] p, div[class~="article-body"] li',
            },
            {
                # Publication date, parsed to an ISO 8601 timestamp
                # (dateparser handles the German month names).
                'name': 'released_at',
                'css_path': 'time[class~="metadata__date"]',
                'f': lambda m: dp.parse(m[0].get_text(), date_formats=['%d. %B %Y']).isoformat()
            }
        ]
    }
]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
dateparser
13 changes: 13 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# docker run --rm -it --name aut -v $(pwd):/w2c --workdir=/w2c -e "PYTHONPATH=/w2c/lib" sepastian/aut:latest /spark/bin/pyspark --py-files /aut/target/aut.zip --jars /aut/target/aut-0.70.1-SNAPSHOT-fatjar.jar

# Smoke-test script: meant to be pasted into the pyspark shell started
# by the docker command above; `sc` and `sqlContext` are provided by
# that shell, not defined here.
from aut import *
from pyspark.sql.functions import col, udf
# NOTE(review): neither a `warc2corpus.text` module nor an extractor
# module named `zeit_de` exists in this commit (only extractors/test.py)
# — these two imports will fail; verify against the repository.
from warc2corpus.text import extract
from warc2corpus.extractors import zeit_de, test, apply

# All zeit.de news pages from the sample WARC.
df = WebArchive(sc, sqlContext, './data/sample.warc.gz').webpages().filter(col("url").like("%zeit.de/news%"))
# Wrap apply() as a Spark UDF running the test extractors over each page's HTML.
extractor_udf = udf(lambda html: apply(html,test.extractors))
df2 = df.select(extractor_udf('content').alias('extract'))
# Materialize a single row to inspect the extracted JSON.
df2.limit(1).collect()[0]['extract']

#df.select(extract('content',zeit_de))

0 comments on commit 5e53fe4

Please sign in to comment.