diff --git a/README.md b/README.md index d202b93..c444a26 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,35 @@ # MARVIN-config MARVIN is the release bot that does automated DBpedia releases each month on three different servers for generic, mappings, wikidata, abstract extraction. -The repository at https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config can be used to fork the architecture for creating extensions, developing new extractors or debugging old ones. -Fixes and patches will be manually deployed via a fresh `git clone` from the `master` branch of the [DIEF (DBpedia Information Extraction Framework)](https://github.com/dbpedia/extraction-framework/). +[This repository](https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config) can be used to fork the architecture for creating extensions, developing new extractors or debugging old ones. +Fixes and patches will be deployed on the DBpedia servers each month via a fresh `git clone` from the `master` branch of the [DIEF (DBpedia Information Extraction Framework)](https://github.com/dbpedia/extraction-framework/). ## Contributions & License All scripts and config files in this repo are CC-0 (Public Domain). We accept pull requests to improve the config files, all contributions will be merged as CC-0. +Marvin-config is intended to bootstrap developing fixes for the DIEF. ## Run a MARVIN extraction +Implementation note: the scripts create a folder `marvin-extraction` that holds the code, results and logs. 
+ ``` +# check out this repo with all config files git clone https://git.informatik.uni-leipzig.de/dbpedia-assoc/marvin-config cd marvin-config + + # (optional) delete previous versions of the DIEF -rm -rf marvin-config/extraction-framework -# install dief in marvin-extraction/extraction-framework +# (~10 minutes) install dief in marvin-extraction/extraction-framework +# if you installed it already you can run `git pull && mvn clean install` to update +rm -rf marvin-extraction/extraction-framework ./setup-dief.sh + # test run Romanian extraction, very small ./marvin_extraction_run.sh --group=test ``` -To run the other extractions, use either +To run the other extractions, use one of the following: ``` # around 4-7 days ./marvin_extraction_run.sh --group=generic @@ -39,6 +47,7 @@ Below is a list of cronjobs we use on the different servers: TODO ``` +## ## Acknowledgements We thank Sören Auer and the Technische Informationsbibliothek (TIB) for providing three servers to run: diff --git a/functions.sh b/functions.sh index 66a92f0..d577099 100755 --- a/functions.sh +++ b/functions.sh @@ -52,22 +52,20 @@ postProcessing() { cd $DIEFDIR/scripts; echo "post-processing of $GROUP" - # todo check BASEDIR - if [ "$GROUP" = "mappings" ] then - >&2 ../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded; - >&2 ../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded; + >&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded; + >&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 mappingbased-objects-uncleaned _redirected .ttl.bz2 @downloaded; >&2 ../run TypeConsistencyCheck type.consistency.check.properties; elif [ "$GROUP" = "wikidata" ] then - >&2 ../run ResolveTransitiveLinks $BASEDIR redirects transitive-redirects .ttl.bz2 wikidata - >&2 ../run MapObjectUris $BASEDIR transitive-redirects .ttl.bz2 
mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata + >&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects transitive-redirects .ttl.bz2 wikidata + >&2 ../run MapObjectUris $EXTRACTIONBASEDIR transitive-redirects .ttl.bz2 mappingbased-objects-uncleaned,raw -redirected .ttl.bz2 wikidata >&2 ../run TypeConsistencyCheck type.consistency.check.properties; elif [ "$GROUP" = "generic" ] then - >&2 ../run ResolveTransitiveLinks $BASEDIR redirects redirects_transitive .ttl.bz2 @downloaded; - >&2 ../run MapObjectUris $BASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded; + >&2 ../run ResolveTransitiveLinks $EXTRACTIONBASEDIR redirects redirects_transitive .ttl.bz2 @downloaded; + >&2 ../run MapObjectUris $EXTRACTIONBASEDIR redirects_transitive .ttl.bz2 disambiguations,infobox-properties,page-links,persondata,topical-concepts _redirected .ttl.bz2 @downloaded; elif [ "$GROUP" = "abstract" ] then echo "TODO" @@ -83,7 +81,7 @@ postProcessing() { # compress log files archiveLogFiles() { # todo copy to some archive - for f in $(find $LOGDIR -type f ); do lbzip2 $f; done; + for f in $(find $LOGDIR -type f ); do lbzip2 -f $f; done; } diff --git a/marvin_extraction_run.sh b/marvin_extraction_run.sh index 8099e84..25f36ae 100755 --- a/marvin_extraction_run.sh +++ b/marvin_extraction_run.sh @@ -73,10 +73,10 @@ cd $DIEFDIR/dump ../run download $CONFIGDIR/download.$GROUP.properties &> $LOGDIR/downloadWikidumps.log # EXTRACT -#extractDumps &> $LOGDIR/extraction.log; +extractDumps &> $LOGDIR/extraction.log; # POST-PROCESSING -#postProcessing 2> $LOGDIR/postProcessing.log; +postProcessing 2> $LOGDIR/postProcessing.log; # RELEASE #databusRelease 2> $LOGDIR/databusDeploy.log