Commit 5e53fe4 (0 parents): showing 7 changed files with 176 additions and 0 deletions.
@@ -0,0 +1,3 @@
**NOTE: this is a preview.**

`warc2corpus` extracts text corpora from WARCs, according to a user-defined specification.
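A specification is a list of extractor dictionaries, each naming the pages it applies to and the pieces of text to pull out. A minimal sketch, with a hypothetical site and CSS paths (the `zeit_de` extractor below is a real example):

    import re

    extractors = [{
        'meta': {'issuer': 'media_site', 'platform': 'website',
                 'layout': 'a', 'type': 'article'},
        'netloc_regex': re.compile(r'news\.example\.com'),
        'path_regex': re.compile(r'^/articles/.+$'),
        'extracts': [
            {'name': 'title', 'css_path': 'h1',
             'f': lambda m: m[0].get_text().strip()},
            {'name': 'body', 'css_path': 'article p'},
        ],
    }]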
@@ -0,0 +1,6 @@
#!/bin/bash

set -euo pipefail

export PYTHONPATH="${PYTHONPATH:-}:$(dirname "$0")/../lib/"
/opt/spark/bin/pyspark --py-files /opt/aut/target/aut.zip --jars /opt/aut/target/aut-0.70.1-SNAPSHOT-fatjar.jar
@@ -0,0 +1,2 @@
import warc2corpus
import warc2corpus.extractors
@@ -0,0 +1,118 @@
import os
import sys
import re
import pkgutil
import importlib
import traceback
from jsonschema import validate, ValidationError
import bs4
import datetime
import json

# Create a list of extractors from the modules in this package's directory.
# Each extractor is a dictionary containing the keys
#
# * netloc_regex
# * path_regex
# * extracts
#
# Both `netloc_regex` and `path_regex` are regular expressions,
# used for selecting an extractor based on host and path.
# After selecting an extractor, all elements of the list `extracts`
# are applied to the HTML of that page; each element contains the keys
#
# * `name`
# * `css_path`
# * `f`
#
# The result of applying an extract is stored under the key `name`;
# elements are selected using the CSS path found at `css_path`;
# the actual value is obtained by applying `f` to the elements selected by the
# CSS path.
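# For example (hypothetical CSS path), the extract
#
#     { 'name': 'title', 'css_path': 'h1',
#       'f': lambda m: m[0].get_text().strip() }
#
# applied to '<h1> Hello </h1>' appears in the result of `apply` (below) as
#
#     'title': { 'css_path': 'h1', 'value': 'Hello' }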
schema = {
    "type": "object",
    "properties": {
        "meta": {
            "type": "object",
            "properties": {
                "name": { "type": "string" },
                "issuer": {
                    "type": "string",
                    "enum": ["political_party", "politician", "media_site"]
                },
                "platform": {
                    "type": "string",
                    "enum": ["website", "social_media"]
                },
                "layout": {
                    "type": "string",
                    "pattern": "[a-z]+"
                },
                "type": {
                    "type": "string",
                    "enum": ["article", "page"]
                }
            },
            "required": [
                "issuer", "platform", "layout", "type"
            ]
        },
    },
}
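# A `meta` object satisfying the schema above might look like this
# (hypothetical values):
#
#     { 'name': 'Example', 'issuer': 'media_site',
#       'platform': 'website', 'layout': 'a', 'type': 'article' }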
extractors = []
for (_, name, _) in pkgutil.iter_modules([os.path.dirname(__file__)]):
    # Relative import.
    m = importlib.import_module('.' + name, package='warc2corpus.extractors')
    # Augment meta info with the name of the extractor module.
    for e in m.extractors:
        try:
            validate(instance=e, schema=schema)
        except ValidationError as err:
            raise ValidationError("Error validating JSON schema for extractor {}.".format(name)) from err
        e['meta']['extractor'] = 'extractors.{}'.format(name)
        # Let 'f' default to 'lambda s: True'.
        e['f'] = e.get('f', lambda s: True)
        # Let 'query_regex' default to '.*'.
        e['query_regex'] = e.get('query_regex', re.compile('.*'))
    extractors += m.extractors

def apply(content, extractors):
    """
    Apply extractors to content.
    """
    soup = bs4.BeautifulSoup(content, 'lxml')
    result = []
    for extractor in extractors:
        # Meta information about the extraction.
        meta = extractor.get('meta', {})
        meta['created_at'] = datetime.datetime.now().isoformat()
        data = {}
        for e in extractor['extracts']:
            n = e.get('name', 'n/a')
            c = e.get('css_path', None)
            f = e.get('f', None)
            if not c:
                continue
            text = soup.select(c)
            if text and f:
                #log.debug(text)
                text = f(text)
            elif text:
                # If no lambda has been given,
                # join all text rows.
                text = ' '.join([m.get_text().strip() for m in text])
            data[n] = {
                'css_path': c,
                'value': (text or None)
            }
        # Check whether the extracted data is valid and store the result under
        # 'meta.valid'. The extractor may specify a callable under 'validator';
        # if 'validator' exists, its return value is stored under 'meta.valid';
        # otherwise, check that all extracted values are non-empty.
        if 'validator' not in extractor:
            # ('not not v' is False for empty values.)
            extractor['validator'] = lambda pairs: all([not not v for k, v in pairs])
        pairs = [(k, v['value']) for k, v in data.items()]
        meta['valid'] = extractor['validator'](pairs)
        result.append({'meta': meta, 'data': data})
    return json.dumps(result)
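`apply` can also be exercised outside Spark. A minimal sketch, assuming the package above is on `PYTHONPATH` and `lxml`, `jsonschema` and `dateparser` are installed; the extractor and HTML below are hypothetical:

    import re
    from warc2corpus.extractors import apply

    html = '<html><head><title>Hello | Site</title></head><body><p>Some text.</p></body></html>'
    demo = [{
        'meta': {'name': 'demo', 'issuer': 'media_site', 'platform': 'website',
                 'layout': 'a', 'type': 'article'},
        'netloc_regex': re.compile(r'example\.com'),
        'path_regex': re.compile(r'^/'),
        'extracts': [
            {'name': 'title', 'css_path': 'title',
             'f': lambda m: m[0].get_text().split('|')[0].strip()},
            {'name': 'body', 'css_path': 'p'},
        ],
    }]

    # Returns a JSON string; each result carries 'meta' (with 'created_at'
    # and 'valid') and 'data' with one entry per extract.
    print(apply(html, demo))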
@@ -0,0 +1,33 @@
import re
import dateparser as dp
import datetime

extractors = [
    {
        'meta': {
            'name': 'Zeit, Mitteilung',
            'issuer': 'media_site',
            'platform': 'website',
            'type': 'article',
            'layout': 'a'
        },
        'netloc_regex': re.compile(r'www\.zeit\.de'),
        'path_regex': re.compile(r'^/news/.+$'),
        'extracts': [
            {
                'name': 'title',
                'css_path': 'title',
                'f': lambda m: m[0].get_text().strip().split('|')[0]
            },
            {
                'name': 'body',
                'css_path': 'div[class~="article-body"] p, div[class~="article-body"] li',
            },
            {
                'name': 'released_at',
                'css_path': 'time[class~="metadata__date"]',
                'f': lambda m: dp.parse(m[0].get_text(), date_formats=['%d. %B %Y']).isoformat()
            }
        ]
    }
]
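Nothing in this commit consumes `netloc_regex` and `path_regex` yet; a hedged sketch of how they might drive extractor selection for a given URL (the helper name `select` is hypothetical):

    from urllib.parse import urlparse

    def select(url, extractors):
        # Keep the extractors whose host and path patterns both match.
        parts = urlparse(url)
        return [e for e in extractors
                if e['netloc_regex'].search(parts.netloc)
                and e['path_regex'].search(parts.path)]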
@@ -0,0 +1 @@
dateparser
@@ -0,0 +1,13 @@
# docker run --rm -it --name aut -v $(pwd):/w2c --workdir=/w2c -e "PYTHONPATH=/w2c/lib" sepastian/aut:latest /spark/bin/pyspark --py-files /aut/target/aut.zip --jars /aut/target/aut-0.70.1-SNAPSHOT-fatjar.jar

from aut import *
from pyspark.sql.functions import col, udf
from warc2corpus.text import extract
from warc2corpus.extractors import zeit_de, test, apply

df = WebArchive(sc, sqlContext, './data/sample.warc.gz').webpages().filter(col("url").like("%zeit.de/news%"))
extractor_udf = udf(lambda html: apply(html, test.extractors))
df2 = df.select(extractor_udf('content').alias('extract'))
df2.limit(1).collect()[0]['extract']

# df.select(extract('content', zeit_de))