diff --git a/bin/install_nltk_data b/bin/install_nltk_data
new file mode 100755
index 0000000..e32ad73
--- /dev/null
+++ b/bin/install_nltk_data
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# bin/install_nltk_data
+#
+# Runs the NLTK downloader for each package this app needs, then deletes
+# every *.zip file found under the NLTK data directory.
+#
+# Required environment:
+#   BIN_DIR    directory containing the `utils` script sourced below
+#   NLTK_DATA  NLTK data directory, assumed to be set already, e.g.
+#              $ heroku config:set NLTK_DATA='/app/nltk_data'
+
+# Quoted so a BIN_DIR containing whitespace is not word-split.
+source "$BIN_DIR/utils"
+
+echo "-----> Starting nltk data installation"
+
+# NLTK packages to install. Edit this list for your specific needs.
+# See http://www.nltk.org/data.html
+nltk_packages=(
+  wordnet
+  punkt
+  hmm_treebank_pos_tagger
+  maxent_treebank_pos_tagger
+  maxent_ne_chunker
+  stopwords
+)
+
+# One downloader run per package, in the listed order.
+for nltk_package in "${nltk_packages[@]}"; do
+  python -m nltk.downloader "$nltk_package"
+done
+
+# If using Textblob, use this instead:
+# python -m textblob.download_corpora lite
+
+# Delete all of the zip files under NLTK_DATA. The directory is handed to
+# find explicitly instead of cd-ing into it: with NLTK_DATA unset or wrong, a
+# bare/failed cd would leave find deleting zip files from an unrelated tree.
+if [[ -n "${NLTK_DATA:-}" && -d "${NLTK_DATA}" ]]; then
+  find "${NLTK_DATA}" -name "*.zip" -type f -delete
+else
+  echo "-----> NLTK_DATA is unset or not a directory; skipping zip cleanup" >&2
+fi
+
+echo "-----> Finished nltk data installation"
diff --git a/bin/post_compile b/bin/post_compile
new file mode 100755
index 0000000..cf55356
--- /dev/null
+++ b/bin/post_compile
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# bin/post_compile
+#
+# Post-compile hook: runs bin/install_nltk_data when that file exists.
+
+if [ -f bin/install_nltk_data ]; then
+  echo "-----> Running install_nltk_data"
+  chmod +x bin/install_nltk_data
+  bin/install_nltk_data
+fi
+
+echo "-----> Post-compile done"