def extract_domain_func(url):
    """Return the host portion of *url* without scheme or leading ``www.``.

    Args:
        url: A URL string such as ``'http://www.example.com/a/b'``, or
            ``None`` (which is what a null column value looks like when
            this function is wrapped as a Spark UDF).

    Returns:
        Everything between the optional ``http://`` / ``https://`` prefix
        and the first ``/``, with one leading ``www.`` removed. The case of
        the remaining host is preserved. ``None`` is returned for ``None``
        input so null rows stay null instead of failing the task.
    """
    if url is None:
        return None

    # Strip the scheme only when it is an actual prefix; a blanket
    # str.replace would also rewrite URLs embedded later in the path.
    # Schemes are case-insensitive, so compare against a lowered copy.
    lowered = url.lower()
    for scheme in ('http://', 'https://'):
        if lowered.startswith(scheme):
            url = url[len(scheme):]
            break

    # Everything up to the first slash is the host (port, if any, is kept).
    host = url.split('/', 1)[0]

    # Drop "www." only as a leading label, so hosts that merely contain
    # the text (e.g. "awww.example.com") are left intact.
    if host.lower().startswith('www.'):
        host = host[len('www.'):]
    return host
/** Loads web-archive records from a path and exposes them as DataFrames.
  *
  * Each method delegates to `RecordLoader.loadArchives` with the
  * `SparkContext` supplied at construction time and then converts the
  * result with the corresponding `...DF()` extractor.
  *
  * @param sc the SparkContext used for every load
  */
class DataFrameLoader(sc: SparkContext) {

  /** DataFrame produced by `extractValidPagesDF()` for the archives at `path`. */
  def extractValidPages(path: String): DataFrame =
    RecordLoader.loadArchives(path, sc).extractValidPagesDF()

  /** DataFrame produced by `extractHyperlinksDF()` for the archives at `path`. */
  def extractHyperlinks(path: String): DataFrame =
    RecordLoader.loadArchives(path, sc).extractHyperlinksDF()
}