Skip to content

Commit

Permalink
adding Makefile, entrypoint, requirements etc
Browse files Browse the repository at this point in the history
  • Loading branch information
sepastian committed Feb 9, 2022
1 parent 027a525 commit d99a41e
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Keep version-control metadata, build tooling, and local data
# out of the Docker build context (smaller context, no accidental leaks).
.git
.gitignore
Dockerfile
Makefile
data/*
25 changes: 25 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Base image: Archives Unleashed Toolkit (provides OpenJDK, Maven, Spark).
# NOTE(review): pin a specific tag or digest instead of :latest for
# reproducible builds (hadolint DL3007).
FROM archivesunleashed/docker-aut:latest

# Metadata
LABEL maintainer="Sebastian Gassner <[email protected]>"
LABEL description="Docker image for warc2corpus, based on the Archives Unleashed Toolkit."
LABEL website="https://github.com/sepastian/warc2corpus/"

WORKDIR /w2c

# Copy the aut-spark-shell startup script (under files/usr/local/bin/) into
# the image root, plus the warc2corpus library and the Python requirements.
# COPY is preferred over ADD for plain local files (hadolint DL3020).
COPY files /
COPY lib /w2c/lib
COPY requirements.txt .

# Install pip3 and the packages from requirements.txt.
# apt lists are removed and pip's cache is skipped in the same layer
# so the intermediate files never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && pip3 install --no-cache-dir -r requirements.txt

# Add /w2c/lib to the Python search path so modules copied there are importable.
ENV PYTHONPATH=/w2c/lib

# Wrapper script installed by `COPY files /` above (exec form: runs as PID 1).
ENTRYPOINT ["/usr/local/bin/aut-spark-shell"]
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Build the warc2corpus Docker image.

# Default goal; declared .PHONY so a stray file named `all`
# cannot turn it into a no-op.
.PHONY: all
all: build

# Build and tag the image from the current directory.
.PHONY: build
build:
	docker build -t sepastian/warc2corpus:latest .
34 changes: 34 additions & 0 deletions files/usr/local/bin/aut-spark-shell
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Wrapper around the Spark shells bundled with the AUT image.
#
# Default: run spark-shell (Scala):
#
#   /spark/bin/spark-shell \
#     --jars /aut/target/aut-<VERSION>-SNAPSHOT-fatjar.jar
#
# If the first argument is "pyspark", run pyspark instead:
#
#   /spark/bin/pyspark \
#     --py-files /aut/target/aut.zip \
#     --jars /aut/target/aut-<VERSION>-SNAPSHOT-fatjar.jar
#
# Any additional arguments will be passed to the selected shell as-is.

set -euo pipefail

# Select the latest JAR file automatically from /aut/target.
# Use a glob instead of parsing `ls` output (ShellCheck SC2012);
# the last match of the lexically-sorted glob is the newest version.
jarfiles=(/aut/target/aut-*-SNAPSHOT-fatjar.jar)
jarfile="${jarfiles[${#jarfiles[@]}-1]}"

# Compose command to run;
# run spark-shell by default;
# if first argument is "pyspark", run pyspark instead.
# Options are kept in an array so paths with spaces stay intact.
cmd="/spark/bin/spark-shell"
opts=(--jars "${jarfile}")
if [[ "${1:-}" == "pyspark" ]]; then
  echo "Starting pyspark..."
  shift
  cmd="/spark/bin/pyspark"
  opts=(--py-files /aut/target/aut.zip "${opts[@]}")
fi

# Run the selected shell, passing any remaining arguments through.
# exec replaces this script so Spark becomes PID 1 and receives
# signals (e.g. from `docker stop`) directly.
echo "Command: ${cmd} ${opts[*]} $*"
exec "${cmd}" "${opts[@]}" "$@"
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
# Python runtime dependencies for warc2corpus.
# NOTE(review): versions are unpinned — consider pinning (pkg==x.y.z)
# for reproducible image builds.
dateparser
beautifulsoup4
jsonschema
3 changes: 3 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@
# Take the first row and read its 'extract' field.
# NOTE(review): df2 is defined earlier in this file (not visible in this
# chunk) — presumably a Spark DataFrame, so collect() triggers execution.
df2.limit(1).collect()[0]['extract']

#df.select(extract('content',zeit_de))

# UDF closure:
# https://stackoverflow.com/a/37428126/92049

0 comments on commit d99a41e

Please sign in to comment.