diff --git a/README.md b/README.md index 07e6a44..d202b93 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,11 @@ We accept pull requests to improve the config files, all contributions will be m ``` git clone https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config cd marvin-config -# delete previous versions of the DIEF -rm -rf marvin-config/extraction-framework +# (optional) delete previous versions of the DIEF +rm -rf marvin-extraction/extraction-framework +# install the DIEF into marvin-extraction/extraction-framework ./setup-dief.sh -# test Romanian extraction, very small +# test run Romanian extraction, very small ./marvin_extraction_run.sh --group=test ``` @@ -32,7 +33,11 @@ To run the other extractions, use either ## Cronjobs -Below is a list of cronjobs we use on the different servers +Below is a list of cronjobs we use on the different servers: + +``` +TODO +``` ## Acknowledgements @@ -44,7 +49,9 @@ We thank Sören Auer and the Technische Informationsbibliothek (TIB) for providi This contribution by TIB to DBpedia & its community is a great push towards incentivizing Open Data and establishing a global and national research and innovation data infrastructure. 
-# Workflow +# Workflow Description + +## ## Downloading the wikimedia dumps TODO diff --git a/extractionConfiguration/universal.properties.template b/extractionConfiguration/universal.properties.template index d0fee58..9b4c593 100644 --- a/extractionConfiguration/universal.properties.template +++ b/extractionConfiguration/universal.properties.template @@ -5,11 +5,13 @@ dbpedia-version=2018-10 # Replace with your Wikipedia dump download directory (should not change over the course of a release) # base-dir=/data/extraction/wikidumps/ -base-dir=$BASEDIR +# AUTOMATICALLY SET BY setup-dief.sh +# base-dir=$BASEDIR # The log file directory - used to store all log files created in the course of all extractions # log-dir=/data/extraction/logs/extraction/ -log-dir=$LOGDIR/extraction/ +# AUTOMATICALLY SET BY setup-dief.sh +# log-dir=$LOGDIR/extraction/ # to forward extraction summaries and warnings via the slack API, use this option -slack-webhook=https://hooks.slack.com/services/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 diff --git a/functions.sh b/functions.sh index 6d71322..185a836 100755 --- a/functions.sh +++ b/functions.sh @@ -1,5 +1,28 @@ #!/bin/bash + +############## +# setup paths +############## + +ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +CONFIGDIR="$ROOT/extractionConfiguration" +DIEFDIR="$ROOT/marvin-extraction/extraction-framework" +# LOGDIR has to be assigned before mkdir can create it +LOGDIR="$ROOT/marvin-extraction/logs/$(date +%Y-%m-%d)" && mkdir -p "$LOGDIR" +EXTRACTIONBASEDIR="$ROOT/marvin-extraction/wikidumps" && mkdir -p "$EXTRACTIONBASEDIR" + +# TODO +RELEASEDIR="$ROOT/marvin-extraction/release" +DATAPUSMAVENPLUGINPOMDIR="$ROOT/databus-maven-plugin" +DATAPUSMAVENPLUGINPOMGIT="https://github.com/dbpedia/databus-maven-plugin.git" + +mkdir -p "$RELEASEDIR" + +############## +# functions +############## + # downlaod and extract data extractDumps() { cd $DIEFDIR/dump; diff --git a/marvin_extraction_run.sh b/marvin_extraction_run.sh index 9b7d080..8099e84 100755 --- 
a/marvin_extraction_run.sh +++ b/marvin_extraction_run.sh @@ -1,38 +1,19 @@ #!/bin/bash HELP="usage: ---group={test|generic|mappings|wikidata} [--databus-deploy|--skip-dief-install] +--group={test|generic|mappings|wikidata} [--databus-deploy] description: --group={test|generic|mappings|wikidata} : required selects download.\$GROUP.properties and extraction.\$GROUP.properties from extractionConfig dir Some exceptions are hard coded like 'extraction.generic.en.properties' - -[--skip-dief-install] : optional - 'false' -> each run does a fresh checkout install of the DIEF (DBpedia Information Extraction Framework) - 'true' -> skipped - " +####################### +# include all functions and path variables +####################### +source "$(dirname "${BASH_SOURCE[0]}")/functions.sh" -############## -# setup paths -############## -ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/marvin-extraction" -CONFIGDIR="$ROOT/extractionConfiguration" - -# set and create -LOGDIR="$ROOT/logs/$(date +%Y-%m-%d)" && mkdir -p $LOGDIR -DIEFDIR="$ROOT/extraction-framework" - -# TODO -EXTRACTIONBASEDIR="$ROOT/wikidumps" -DATAPUSMAVENPLUGINPOMDIR="$ROOT/databus-maven-plugin" -RELEASEDIR="$ROOT/release" -DATAPUSMAVENPLUGINPOMGIT="https://github.com/dbpedia/databus-maven-plugin.git" - -mkdir -p $EXTRACTIONBASEDIR -mkdir -p $RELEASEDIR ################# #check arguments @@ -76,23 +57,20 @@ then fi -####################### -# include all functions -####################### -source functions.sh + + ####################### # RUN (requires setup-dief.sh) ####################### # DOWNLOAD ONTOLOGY and MAPPINGS -cd $DIEFDIR/core; -../run download-ontology &> $LOGDIR/downloadOntology.log; -../run download-mappings &> $LOGDIR/downloadMappings.log; +cd $DIEFDIR/core +../run download-ontology &> $LOGDIR/downloadOntology.log +../run download-mappings &> $LOGDIR/downloadMappings.log # DOWNLOAD WIKIDUMPS cd $DIEFDIR/dump -../run download $CONFIGDIR/download.$GROUP.properties &> $LOGDIR/downloadWikidumps.log; +../run 
download $CONFIGDIR/download.$GROUP.properties &> $LOGDIR/downloadWikidumps.log # EXTRACT #extractDumps &> $LOGDIR/extraction.log; diff --git a/setup-dief.sh b/setup-dief.sh index f321a13..a4b438b 100755 --- a/setup-dief.sh +++ b/setup-dief.sh @@ -1,15 +1,16 @@ #!/bin/bash -ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/marvin-extraction" -CONFIGDIR="$ROOT/extractionConfiguration" -DIEFDIR="$ROOT/extraction-framework" +# get all variables and functions +source "$(dirname "${BASH_SOURCE[0]}")/functions.sh" +cd "$ROOT/marvin-extraction" git clone "https://github.com/dbpedia/extraction-framework.git" $DIEFDIR + cd $DIEFDIR -# todo add config -#cd $ROOT && cp $ROOT/config.d/universal.properties.template $EXTRACTIONFRAMEWORKDIR/core/src/main/resources/universal.properties; -#sed -i -e 's,$BASEDIR,'$EXTRACTIONBASEDIR',g' $EXTRACTIONFRAMEWORKDIR/core/src/main/resources/universal.properties; -#sed -i -e 's,$LOGDIR,'$LOGDIR',g' $EXTRACTIONFRAMEWORKDIR/core/src/main/resources/universal.properties; +# build universal.properties: generated base-dir/log-dir entries first, then the template +echo "base-dir=$EXTRACTIONBASEDIR" > $DIEFDIR/core/src/main/resources/universal.properties +echo "log-dir=$LOGDIR/extraction/" >> $DIEFDIR/core/src/main/resources/universal.properties +cat $CONFIGDIR/universal.properties.template >> $DIEFDIR/core/src/main/resources/universal.properties -mvn clean install +mvn clean install &> $LOGDIR/installDIEF.log