From d7859971dcd85358da5af898b865da39602b2ffc Mon Sep 17 00:00:00 2001 From: Sebastian Hellmann Date: Fri, 11 Oct 2019 13:31:12 +0200 Subject: [PATCH] more cleanup --- README.md | 32 +++++++++++++++++++++++++++----- functions.sh | 18 +++++++++--------- marvin_extraction_run.sh | 6 +++--- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index eb890ee..35a4a16 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,37 @@ # MARVIN-config -MARVIN is the release bot that does automated DBpedia releases each month on three different servers for generic, mappings, wikidata extraction. - +MARVIN is the release bot that does automated DBpedia releases each month on three different servers for generic, mappings, wikidata, abstract extraction. The repository at https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config can be used to fork the architecture for creating extensions, developing new extractors or debugging old ones. +Fixes and patches will be manually deployed via a fresh `git clone` from the `master` branch of the [DBpedia Extraction Framework](https://github.com/dbpedia/extraction-framework/). + +## Contributions & License +All scripts and config files in this repo are CC-0 (Public Domain). +We accept pull requests to improve the config files; all contributions will be merged as CC-0. + +## Run a MARVIN extraction +``` +git clone https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config +cd marvin-config +# Romanian extraction, very small +./marvin_extraction_run.sh --group=test +``` + +To run the other extractions, use either +``` +# around 4-7 days +./marvin_extraction_run.sh --group=generic +# around 4-7 days +./marvin_extraction_run.sh --group=mappings +# around 7-14 days +./marvin_extraction_run.sh --group=wikidata +``` -Fixes and patches will be manually deployed via `git pull` from the `master` branch of the [DBpedia Extraction Framework](https://github.com/dbpedia/extraction-framework/). 
+## Cronjobs -The architecture and workflow can also be forked and adapted to completely different extractions and derive operations outside of the DBpedia framework. +Below is a list -# Acknowledgements +## Acknowledgements We thank Sören Auer and the Technische Informationsbibliothek (TIB) for providing three servers to run: * the main DBpedia extraction on a monthly basis diff --git a/functions.sh b/functions.sh index de9673d..16186ac 100755 --- a/functions.sh +++ b/functions.sh @@ -7,6 +7,7 @@ prepareExtractionFramework(){ if [ "$SKIPDIEFINSTALL" = "false" ] then # TODO make sure this contains marvin-config/marvin-extraction and replace with -rf + echo "deleting $DIEFDIR" rm -rI $DIEFDIR git clone "https://github.com/dbpedia/extraction-framework.git" $DIEFDIR cd $DIEFDIR @@ -25,18 +26,17 @@ prepareExtractionFramework(){ # downlaod and extract data extractDumps() { cd $DIEFDIR/dump; - - # run for all - >&2 ../run extraction $ROOT/config.d/extraction.$GROUP.properties; - - # exceptions - - ## for generic, as English is big and has to be run separately + + # exception for generic, 1. spark, 2. 
as English is big and has to be run separately if [ "$GROUP" = "generic" ] then - >&2 ../run sparkextraction $ROOT/config.d/extraction.generic.en.properties; - fi + >&2 ../run sparkextraction $CONFIGDIR/extraction.generic.properties; + >&2 ../run sparkextraction $CONFIGDIR/extraction.generic.en.properties; + else + # run for all + >&2 ../run extraction $CONFIGDIR/extraction.$GROUP.properties; + fi } diff --git a/marvin_extraction_run.sh b/marvin_extraction_run.sh index 6c8e2f2..a55d328 100755 --- a/marvin_extraction_run.sh +++ b/marvin_extraction_run.sh @@ -18,7 +18,7 @@ description: ############## # setup paths ############## -ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/marvin-extraction/" +ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/marvin-extraction" CONFIGDIR="$ROOT/extractionConfiguration" # set and create @@ -101,10 +101,10 @@ cd $DIEFDIR/dump extractDumps &> $LOGDIR/extraction.log; # POST-PROCESSING -postProcessing 2> $LOGDIR/postProcessing.log; +#postProcessing 2> $LOGDIR/postProcessing.log; # RELEASE -databusRelease 2> $LOGDIR/databusDeploy.log +#databusRelease 2> $LOGDIR/databusDeploy.log # CLEANUP archiveLogFiles;