From c1758c81f87da851dc938f21f4015a76de62f339 Mon Sep 17 00:00:00 2001
From: fwhigh
Date: Wed, 9 Jan 2019 00:06:10 -0800
Subject: [PATCH] First local training and s3 copy

---
 .gitignore                     |   2 +
 Dockerfile                     |   6 +-
 Dockerfile.train               |   2 -
 README.md                      |  20 +-
 notebooks/model-training.ipynb | 335 +++------------------------------
 requirements.txt               |   3 +-
 scripts/get_glove.sh           |   2 +-
 scripts/get_latest_model.sh    |  15 ++
 scripts/get_training_data.sh   |  10 +-
 scripts/install.sh             |  11 +-
 scripts/push_image.sh          |  10 +
 scripts/train.sh               |  24 +--
 12 files changed, 90 insertions(+), 350 deletions(-)
 create mode 100755 scripts/get_latest_model.sh
 create mode 100755 scripts/push_image.sh

diff --git a/.gitignore b/.gitignore
index 1a015d9..69a0c50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 .idea
 data
+batch_create_compute_env.json
+*~
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/Dockerfile b/Dockerfile
index 343ef25..742c29d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
-
+FROM 015216235264.dkr.ecr.us-west-1.amazonaws.com/pmip:latest
 EXPOSE 8000
-ENTRYPOINT []
-CMD [ "gunicorn", "-w", "3", "-b", ":8000", "wsgi" ]
+ENTRYPOINT [ "bash" ]
+CMD [ "-c", "gunicorn -w 3 -b :8000 wsgi" ]
\ No newline at end of file
diff --git a/Dockerfile.train b/Dockerfile.train
index 0215ad3..b0dfcf6 100644
--- a/Dockerfile.train
+++ b/Dockerfile.train
@@ -17,8 +17,6 @@ COPY . .
 
 RUN ENVIRONMENT=$ENVIRONMENT bash scripts/install.sh
 
-RUN bash scripts/get_glove.sh
-
 EXPOSE 8888
 ENTRYPOINT [ "bash" ]
 CMD [ "-c", "jupyter notebook notebooks/ --allow-root --ip=0.0.0.0 --port=8888 --no-browser" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index 1c54c6f..e35d030 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,13 @@ Predictive Models in Production
 
 ### Pro tips
 
-If you ever find yourself with a "no space left on device" error, try
+If you ever find yourself with a "no space left on device" error when building the Docker image, try
 
 ```bash
 docker rm $(docker ps -q -f 'status=exited')
 docker rmi $(docker images -q -f "dangling=true")
 ```
 
-See, eg, https://forums.docker.com/t/no-space-left-on-device-error/10894/14.
-
 ### Build the base training image
 
 ```bash
@@ -24,17 +22,27 @@ ENVIRONMENT=dev bash scripts/build_training_image.sh
 ```
 
 ### Do interactive model training and data exploration in the Jupyter notebook
 
 ```bash
-ENVIRONMENT=dev bash scripts/run_training_container.sh
+ENVIRONMENT=dev bash scripts/run_training_container.sh -c "jupyter notebook notebooks/ --allow-root --ip=0.0.0.0 --port=8888 --no-browser"
 ```
 
-Then open [http://localhost:8888](http://localhost:8888).
+Then open [http://localhost:8888](http://localhost:8888) to run Jupyter.
 
 ### Train a model programmatically
 
 ```bash
-ENVIRONMENT=dev RUNID=`date +%Y%m%d` bash scripts/run_training_container.sh scripts/train.sh
+ENVIRONMENT=dev bash scripts/run_training_container.sh scripts/train.sh
 ```
 
+### Pushing the new Docker image to production for the training and API services
+
+If this is your first and only ECR repo, then run
+
+```bash
+bash scripts/push_image.sh $(aws ecr describe-repositories | jq -r '.repositories[0].repositoryUri')
+```
+
+If you have multiple ECR repos, you'll have to change the argument so that it points to the one you want to push to.
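If your repo isn't first in the list, selecting it by name should work too. A sketch, assuming the repo is named `pmip` to match the image tag that `scripts/push_image.sh` pushes (your repo name may differ):

```bash
bash scripts/push_image.sh $(aws ecr describe-repositories | jq -r '.repositories[] | select(.repositoryName == "pmip") | .repositoryUri')
```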
+ ## Resources * https://github.com/pypa/sampleproject/blob/master/setup.py \ No newline at end of file diff --git a/notebooks/model-training.ipynb b/notebooks/model-training.ipynb index 31a3550..1ab648f 100644 --- a/notebooks/model-training.ipynb +++ b/notebooks/model-training.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "tags": [ "parameters" @@ -47,47 +47,7 @@ }, "outputs": [], "source": [ - "RUNID=datetime.now().strftime(\"%Y%m%d\")\n", - "DATA_DIR=os.path.join(\"..\", \"data\", RUNID)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip_archive\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mzip_ref\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mzip_ref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_DIR\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36mextractall\u001b[0;34m(self, path, members, pwd)\u001b[0m\n\u001b[1;32m 1614\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1615\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mzipinfo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmembers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1616\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_extract_member\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzipinfo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1617\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1618\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36m_extract_member\u001b[0;34m(self, member, targetpath, pwd)\u001b[0m\n\u001b[1;32m 1669\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmember\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1670\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtargetpath\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1671\u001b[0;31m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopyfileobj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1672\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1673\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtargetpath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/shutil.py\u001b[0m in \u001b[0;36mcopyfileobj\u001b[0;34m(fsrc, fdst, length)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;34m\"\"\"copy data from file-like object fsrc to file-like object fdst\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 79\u001b[0;31m \u001b[0mbuf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfsrc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlength\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 80\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_offset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 898\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_eof\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 899\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 900\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 901\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_readbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36m_read1\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 973\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_type\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mZIP_DEFLATED\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMIN_READ_SIZE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 975\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decompressor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecompress\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 976\u001b[0m self._eof = (self._decompressor.eof or\n\u001b[1;32m 977\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_left\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "# load the GloVe vectors in a dictionary:\n", - "\n", - "glove_data_dir = os.path.join(DATA_DIR, \"..\", \"glove\", \"glove.840B.300d.txt\")\n", - "\n", - "if not os.path.exists(os.path.join(glove_data_dir, \"glove.840B.300d.txt\")):\n", - " zip_archive = os.path.join(glove_data_dir, \"glove.840B.300d.zip\")\n", - " if not os.path.isdir(glove_data_dir):\n", - " os.makedirs(glove_data_dir)\n", - " urllib.request.urlretrieve(\n", - " \"http://www-nlp.stanford.edu/data/glove.840B.300d.zip\", \n", - " zip_archive,\n", - " )\n", - " with zipfile.ZipFile(zip_archive, \"r\") as zip_ref:\n", - " zip_ref.extractall(glove_data_dir)\n" + "DATA_DIR=os.path.join(\"..\", \"data\")" ] }, { @@ -95,41 +55,6 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "\n", - "embeddings_index = {}\n", - "with open(os.path.join(DATA_DIR, 'glove.840B.300d.txt'), 'r') as f:\n", - " for line in tqdm(f):\n", - " values = line.split()\n", - " word = values[0]\n", - " coefs = np.asarray(values[1:], dtype='float32')\n", - " embeddings_index[word] = coefs\n", - "\n", - "print('Found %s word vectors.' % len(embeddings_index))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Youtube03-LMFAO.csv',\n", - " 'Youtube04-Eminem.csv',\n", - " 'Youtube05-Shakira.csv',\n", - " 'Youtube02-KatyPerry.csv',\n", - " '__MACOSX',\n", - " 'YouTube-Spam-Collection-v1.zip',\n", - " 'Youtube01-Psy.csv']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "if os.getenv(\"ENVIRONMENT\", \"\") == \"dev\":\n", " zip_archive = os.path.join(DATA_DIR, \"YouTube-Spam-Collection-v1.zip\")\n", @@ -148,165 +73,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
COMMENT_IDAUTHORDATECONTENTCLASS
1220_2viQ_Qnc69Nq0Ytk1jCpzWPCrpGEk6T7cdVAxfSlAkShadrach Grentz2013-07-29T17:39:24.876000Hey Music Fans I really appreciate any of you ...1
964z13rgdjjjzmkuhnvf23dd1wxkqzwvjiij04Jessica Onyekwere2015-05-23T17:42:14.383000This song is special, because is a song for Af...0
96z13qwl2rznzohbhqy04ch3cy5tnihrkhlt40kThejaynetts2015-05-23T00:53:59.385000Never get old 0
1566z13sx5nrhq22yvste23fvnirosixy55ag04The Robot Green Hypno2014-11-07T13:37:51i like this song the video goes perfect with it0
1113_2viQ_Qnc68hNPCfXGAxIxW9V7wcDDxSdp-gyHTkghoranferi delgado2013-10-04T03:56:04.784000best song eva0
1813z12ufrszxq3zstw0r22yfbipvqvaypoldjosson 642014-11-07T15:58:57SUPER!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...0
1512z12dynswht2rujq1e22xi5dappq1vrlh504ali aydın2014-10-25T16:59:02I like you . Katy Perry 600▲60▲6▲0
884z12hfp2wmyuqztkw504cgblyxtbsxjuzeow0kJesse Pinkman2015-05-06T11:42:44.601000Rihanna looks so beautiful with red hair ;)0
1931z12gwldoxpvgjru4004cj3fxyvvvwffjqjgRipazha Gaming2014-11-12T17:37:08http://hackfbaccountlive.com/?ref=52425751
1082_2viQ_Qnc69vgWhC2acrKSH-tvjKq1KuKBca1UtB8wkLouis Bryant2013-10-12T15:20:19.887000You guys should check out this EXTRAORDINARY w...1
\n", - "
" - ], - "text/plain": [ - " COMMENT_ID AUTHOR \\\n", - "1220 _2viQ_Qnc69Nq0Ytk1jCpzWPCrpGEk6T7cdVAxfSlAk Shadrach Grentz \n", - "964 z13rgdjjjzmkuhnvf23dd1wxkqzwvjiij04 Jessica Onyekwere \n", - "96 z13qwl2rznzohbhqy04ch3cy5tnihrkhlt40k Thejaynetts \n", - "1566 z13sx5nrhq22yvste23fvnirosixy55ag04 The Robot Green Hypno \n", - "1113 _2viQ_Qnc68hNPCfXGAxIxW9V7wcDDxSdp-gyHTkgho ranferi delgado \n", - "1813 z12ufrszxq3zstw0r22yfbipvqvaypold josson 64 \n", - "1512 z12dynswht2rujq1e22xi5dappq1vrlh504 ali aydın \n", - "884 z12hfp2wmyuqztkw504cgblyxtbsxjuzeow0k Jesse Pinkman \n", - "1931 z12gwldoxpvgjru4004cj3fxyvvvwffjqjg Ripazha Gaming \n", - "1082 _2viQ_Qnc69vgWhC2acrKSH-tvjKq1KuKBca1UtB8wk Louis Bryant \n", - "\n", - " DATE \\\n", - "1220 2013-07-29T17:39:24.876000 \n", - "964 2015-05-23T17:42:14.383000 \n", - "96 2015-05-23T00:53:59.385000 \n", - "1566 2014-11-07T13:37:51 \n", - "1113 2013-10-04T03:56:04.784000 \n", - "1813 2014-11-07T15:58:57 \n", - "1512 2014-10-25T16:59:02 \n", - "884 2015-05-06T11:42:44.601000 \n", - "1931 2014-11-12T17:37:08 \n", - "1082 2013-10-12T15:20:19.887000 \n", - "\n", - " CONTENT CLASS \n", - "1220 Hey Music Fans I really appreciate any of you ... 1 \n", - "964 This song is special, because is a song for Af... 0 \n", - "96 Never get old  0 \n", - "1566 i like this song the video goes perfect with it 0 \n", - "1113 best song eva 0 \n", - "1813 SUPER!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!... 0 \n", - "1512 I like you . Katy Perry 600▲60▲6▲ 0 \n", - "884 Rihanna looks so beautiful with red hair ;) 0 \n", - "1931 http://hackfbaccountlive.com/?ref=5242575 1 \n", - "1082 You guys should check out this EXTRAORDINARY w... 1 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "training_df_list = []\n", "for file_ in [os.path.join(DATA_DIR, file) for file in training_files if file.endswith(\".csv\")]:\n", @@ -326,18 +95,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1760,)\n", - "(196,)\n" - ] - } - ], + "outputs": [], "source": [ "xtrain, xvalid, ytrain, yvalid = train_test_split(\n", " training_df.CONTENT, \n", @@ -353,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -379,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -400,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -411,53 +171,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 12 candidates, totalling 24 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 9.2s\n", - "[Parallel(n_jobs=-1)]: Done 6 tasks | elapsed: 13.5s\n", - "[Parallel(n_jobs=-1)]: Done 13 tasks | elapsed: 22.4s\n", - "[Parallel(n_jobs=-1)]: Done 16 out of 24 | elapsed: 28.7s remaining: 14.4s\n", - "[Parallel(n_jobs=-1)]: Done 19 out of 24 | elapsed: 31.1s remaining: 8.2s\n", - "[Parallel(n_jobs=-1)]: Done 22 out of 24 | elapsed: 34.3s remaining: 3.1s\n", - 
"[Parallel(n_jobs=-1)]: Done 24 out of 24 | elapsed: 35.3s finished\n", - "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", - " FutureWarning)\n" - ] - }, - { - "data": { - "text/plain": [ - "GridSearchCV(cv=2, error_score='raise-deprecating',\n", - " estimator=Pipeline(memory=None,\n", - " steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,\n", - " random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, max_iter=100, multi_class='warn',\n", - " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", - " tol=0.0001, verbose=0, warm_start=False))]),\n", - " fit_params=None, iid=True, n_jobs=-1,\n", - " param_grid={'svd__n_components': [120, 180], 'lr__C': [0.1, 1.0, 10], 'lr__penalty': ['l1', 'l2']},\n", - " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", - " scoring=None, verbose=10)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Initialize Grid Search Model\n", "model = GridSearchCV(\n", @@ -476,21 +192,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best score: 0.939\n", - "Best parameters set:\n", - "\tlr__C: 1.0\n", - "\tlr__penalty: 'l1'\n", - "\tsvd__n_components: 180\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Best score: %0.3f\" % model.best_score_)\n", "print(\"Best parameters set:\")\n", @@ -505,8 +209,15 @@ "metadata": {}, "outputs": [], "source": [ - "pickle_to_fs()" + "pickle_to_fs(model.best_estimator_, filename=\"model.pkl\", subdirectory=DATA_DIR)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 9031a9e..f33876a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ xgboost sklearn nltk jupyter -papermill \ No newline at end of file +papermill +awscli \ No newline at end of file diff --git a/scripts/get_glove.sh b/scripts/get_glove.sh index 906dbee..73b09e4 100644 --- a/scripts/get_glove.sh +++ b/scripts/get_glove.sh @@ -6,4 +6,4 @@ if [ ! 
-f "data/glove/glove.840B.300d.txt" ]; then mv glove.840B.300d.zip data/glove unzip data/glove/glove.840B.300d.zip -d data/glove rm data/glove/glove.840B.300d.zip -figo \ No newline at end of file +fi \ No newline at end of file diff --git a/scripts/get_latest_model.sh b/scripts/get_latest_model.sh new file mode 100755 index 0000000..57d615d --- /dev/null +++ b/scripts/get_latest_model.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +BUCKET="s3://fwhigh-predictive-models" +MODEL_ID=$(aws s3 ls ${BUCKET}/models/ | awk '$1~/PRE/ {print $2}' | sed 's/\///g' | sort -nr | head -n 1) +S3_DIR=$BUCKET/models/$MODEL_ID +DIR=data + +echo Getting data from $S3_DIR +echo Writing it to $DIR + +mkdir -p $DIR + +aws s3 cp --recursive --exclude "*" --include "model.pkl" $S3_DIR/ $DIR/ \ No newline at end of file diff --git a/scripts/get_training_data.sh b/scripts/get_training_data.sh index 9a3ef0a..5f3a337 100644 --- a/scripts/get_training_data.sh +++ b/scripts/get_training_data.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash -set -e +S3_DIR=$1 +DIR=$2 -$DIR=$1 +echo Getting data from $S3_DIR +echo Writing it to $DIR +aws s3 cp --recursive $S3_DIR/ $DIR/ cd $DIR -wget https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip -unzip YouTube-Spam-Collection-v1.zip +unzip -v *.zip \ No newline at end of file diff --git a/scripts/install.sh b/scripts/install.sh index 8f26f5b..718c08d 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1,12 +1,3 @@ #!/usr/bin/env bash -if [ "$ENVIRONMENT" == "prod" ]; then - echo "Installing pmip package in prod environment" - pip install -U --upgrade-strategy only-if-needed -e . -elif [ "$ENVIRONMENT" == "staging" ]; then - echo "Installing pmip package in staging environment" - pip install -U --upgrade-strategy only-if-needed . -else - echo "Installing pmip package in dev environment" - pip install -U --upgrade-strategy only-if-needed . -fi \ No newline at end of file +pip install -U --upgrade-strategy only-if-needed -e . \ No newline at end of file diff --git a/scripts/push_image.sh b/scripts/push_image.sh new file mode 100755 index 0000000..daae8a8 --- /dev/null +++ b/scripts/push_image.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +ECR_URI=$1 +AWS_REGION=$(aws configure get region) + +IMAGE_TAG=latest + +$(aws ecr get-login --no-include-email --region $AWS_REGION) +docker tag pmip:staging ${ECR_URI}:${IMAGE_TAG} +docker push ${ECR_URI}:${IMAGE_TAG} \ No newline at end of file diff --git a/scripts/train.sh b/scripts/train.sh index 65fa821..46d609a 100644 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -2,26 +2,28 @@ set -e -RUNID=`date +%Y%m%d` -DATA_DIR=data/${RUNID} -BUCKET="s3://predictive-models" -S3_DATA_DIR=${BUCKET}/${RUNID} +BUCKET="s3://fwhigh-predictive-models" +MODEL_ID=`date +%Y%m%d` +S3_DIR=$BUCKET/models/$MODEL_ID +DIR=data -mkdir -p DATA_DIR +TRAINING_ID=$(aws s3 ls $BUCKET/training/ | awk '$1~/PRE/ {print $2}' | sed 's/\///g' | sort -nr | head -n 1) + +mkdir -p $DIR # Get the data. Replace this line with something like: -# aws s3 cp $BUCKET/training-data/ $DATA_DIR/ +# aws s3 cp $BUCKET/training-data/ $DIR/ # to train on new data that's placed into S3 directly. 
-bash scripts/get_training_data.sh ${DATA_DIR}
+bash scripts/get_training_data.sh $BUCKET/training/$TRAINING_ID $DIR
 
 # Train the model
-papermill notebooks/model-training.ipynb ${DATA_DIR}/model-training-${RUNID}.ipynb \
-    -p RUNID ${RUNID} -p DATA_DIR ${DATA_DIR}
+papermill notebooks/model-training.ipynb $DIR/model-training-$MODEL_ID.ipynb -p DATA_DIR $DIR
 
 # Convert the notebook into HTML
-jupyter nbconvert --to html ${DATA_DIR}/model-training-${RUNID}.ipynb
+jupyter nbconvert --to html $DIR/model-training-$MODEL_ID.ipynb
 
 # Push any assets to the cloud
 if [ "$ENVIRONMENT" == "staging" ]; then
-    aws s3 cp --exclude * --include *.ipynb *.html *.pkl ${DATA_DIR}/ S3_DATA_DIR/
+    echo Pushing model to S3
+    aws s3 cp --recursive --exclude "*" --include "*.ipynb" --include "*.html" --include "*.pkl" $DIR/ $S3_DIR/
 fi
\ No newline at end of file
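For context beyond this patch: the `model.pkl` that `scripts/train.sh` pushes and `scripts/get_latest_model.sh` pulls back down is the grid search's best estimator pickled by the notebook, so the API side can unpickle it directly. A minimal sketch, assuming the process runs from the repo root so that `data/model.pkl` (the `DIR` and `--include` values in `get_latest_model.sh`) resolves; the serving wiring itself is not part of this patch:

```python
import os
import pickle

# Path mirrors DIR=data and --include "model.pkl" in scripts/get_latest_model.sh.
MODEL_PATH = os.path.join("data", "model.pkl")

with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# The unpickled object is the pipeline chosen by GridSearchCV
# (TruncatedSVD -> StandardScaler -> LogisticRegression), so it exposes the
# usual sklearn predict/predict_proba interface on the TF-IDF features
# built upstream in the notebook.
print(type(model))
```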