diff --git a/pom.xml b/pom.xml
index c4d09052..bbb9761a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -344,6 +344,29 @@
build-helper-maven-plugin
${build-helper.plugin.version}
+
+
+
+ maven-assembly-plugin
+ 2.6
+
+
+ src/main/assembly/python.xml
+
+ aut
+ false
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
diff --git a/src/main/assembly/python.xml b/src/main/assembly/python.xml
new file mode 100644
index 00000000..57b09aef
--- /dev/null
+++ b/src/main/assembly/python.xml
@@ -0,0 +1,13 @@
+
+ python
+
+ zip
+
+
+
+ src/main/python/aut/
+ /
+
+
+
diff --git a/src/main/python/aut/__init__.py b/src/main/python/aut/__init__.py
new file mode 100644
index 00000000..18028104
--- /dev/null
+++ b/src/main/python/aut/__init__.py
@@ -0,0 +1,5 @@
+from aut.common import WebArchive
+from aut.udfs import extract_domain
+
+__all__ = ['WebArchive', 'extract_domain']
+
diff --git a/src/main/python/aut/common.py b/src/main/python/aut/common.py
new file mode 100644
index 00000000..1c053a4c
--- /dev/null
+++ b/src/main/python/aut/common.py
@@ -0,0 +1,38 @@
+from pyspark.sql import DataFrame
+
+
+class WebArchive:
+    """PySpark-side entry point to a web archive, backed by the JVM loader.
+
+    Keeps a py4j handle to ``io.archivesunleashed.DataFrameLoader`` and wraps
+    the JVM objects it returns in PySpark ``DataFrame`` instances.
+    """
+
+    def __init__(self, sc, sqlContext, path):
+        """Store the Spark handles and ``path``, then create the JVM loader.
+
+        Args:
+            sc: Spark context; its ``_jvm`` and ``_jsc`` attributes are used.
+            sqlContext: passed as second argument to each ``DataFrame``.
+            path: forwarded unchanged to the loader's extract methods.
+        """
+        self.sc = sc
+        self.sqlContext = sqlContext
+        self.path = path
+        # The Scala constructor takes a SparkContext, hence ``_jsc.sc()``
+        # is passed rather than the Python ``sc`` object itself.
+        loader_cls = sc._jvm.io.archivesunleashed.DataFrameLoader
+        self.loader = loader_cls(sc._jsc.sc())
+
+    def _wrap(self, jdf):
+        """Wrap a JVM DataFrame handle in a PySpark ``DataFrame``."""
+        return DataFrame(jdf, self.sqlContext)
+
+    def pages(self):
+        """Return the loader's ``extractValidPages`` result as a DataFrame."""
+        return self._wrap(self.loader.extractValidPages(self.path))
+
+    def links(self):
+        """Return the loader's ``extractHyperlinks`` result as a DataFrame."""
+        return self._wrap(self.loader.extractHyperlinks(self.path))
+
diff --git a/src/main/python/aut/udfs.py b/src/main/python/aut/udfs.py
new file mode 100644
index 00000000..eab46e55
--- /dev/null
+++ b/src/main/python/aut/udfs.py
@@ -0,0 +1,32 @@
+from pyspark.sql.functions import udf
+from pyspark.sql.types import StringType
+
+
+def extract_domain_func(url):
+    """Return the host part of ``url`` without scheme or leading ``www.``.
+
+    Args:
+        url: URL string, or None.
+
+    Returns:
+        The text before the first ``/`` once a leading ``http://`` or
+        ``https://`` is removed, minus a leading ``www.``; None if ``url``
+        is None.
+    """
+    # Null column values reach a Python UDF as None; return None instead of
+    # raising AttributeError.
+    if url is None:
+        return None
+    # Strip prefixes only: a blanket replace() would also rewrite hosts that
+    # merely contain the text, e.g. 'awww.example.com'.
+    for scheme in ('http://', 'https://'):
+        if url.startswith(scheme):
+            url = url[len(scheme):]
+            break
+    host = url.split('/', 1)[0]
+    if host.startswith('www.'):
+        host = host[len('www.'):]
+    return host
+
+
+extract_domain = udf(extract_domain_func, StringType())
diff --git a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala
new file mode 100644
index 00000000..95f2cc2e
--- /dev/null
+++ b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala
@@ -0,0 +1,29 @@
+package io.archivesunleashed
+
+import org.apache.spark.SparkContext
+import org.apache.spark.sql._
+
+/** Path-based wrapper around `RecordLoader` whose methods return DataFrames.
+  *
+  * @param sc SparkContext passed to `RecordLoader.loadArchives` on every call
+  */
+class DataFrameLoader(sc: SparkContext) {
+
+  /** Loads the archives at `path` and returns `extractValidPagesDF()` of them.
+    *
+    * @param path forwarded unchanged to `RecordLoader.loadArchives`
+    */
+  def extractValidPages(path: String): DataFrame = {
+    val records = RecordLoader.loadArchives(path, sc)
+    records.extractValidPagesDF()
+  }
+
+  /** Loads the archives at `path` and returns `extractHyperlinksDF()` of them.
+    *
+    * @param path forwarded unchanged to `RecordLoader.loadArchives`
+    */
+  def extractHyperlinks(path: String): DataFrame = {
+    val records = RecordLoader.loadArchives(path, sc)
+    records.extractHyperlinksDF()
+  }
+}