From c1758c81f87da851dc938f21f4015a76de62f339 Mon Sep 17 00:00:00 2001
From: fwhigh
Date: Wed, 9 Jan 2019 00:06:10 -0800
Subject: [PATCH] First local training and s3 copy

---
 .gitignore                     |   2 +
 Dockerfile                     |   6 +-
 Dockerfile.train               |   2 -
 README.md                      |  20 +-
 notebooks/model-training.ipynb | 335 +++------------------------------
 requirements.txt               |   3 +-
 scripts/get_glove.sh           |   2 +-
 scripts/get_latest_model.sh    |  15 ++
 scripts/get_training_data.sh   |  10 +-
 scripts/install.sh             |  11 +-
 scripts/push_image.sh          |  10 +
 scripts/train.sh               |  24 +--
 12 files changed, 90 insertions(+), 350 deletions(-)
 create mode 100755 scripts/get_latest_model.sh
 create mode 100755 scripts/push_image.sh

diff --git a/.gitignore b/.gitignore
index 1a015d9..69a0c50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 .idea
 data
+batch_create_compute_env.json
+*~
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/Dockerfile b/Dockerfile
index 343ef25..742c29d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
-
+FROM 015216235264.dkr.ecr.us-west-1.amazonaws.com/pmip:latest
 EXPOSE 8000
-ENTRYPOINT []
-CMD [ "gunicorn", "-w", "3", "-b", ":8000", "wsgi" ]
+ENTRYPOINT [ "bash" ]
+CMD [ "-c", "gunicorn -w 3 -b :8000 wsgi" ]
\ No newline at end of file
diff --git a/Dockerfile.train b/Dockerfile.train
index 0215ad3..b0dfcf6 100644
--- a/Dockerfile.train
+++ b/Dockerfile.train
@@ -17,8 +17,6 @@ COPY . .
 
 RUN ENVIRONMENT=$ENVIRONMENT bash scripts/install.sh
 
-RUN bash scripts/get_glove.sh
-
 EXPOSE 8888
 ENTRYPOINT [ "bash" ]
 CMD [ "-c", "jupyter notebook notebooks/ --allow-root --ip=0.0.0.0 --port=8888 --no-browser" ]
\ No newline at end of file
diff --git a/README.md b/README.md
index 1c54c6f..e35d030 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,13 @@ Predictive Models in Production
 
 ### Pro tips
 
-If you ever find yourself with a "no space left on device" error, try
+If you ever find yourself with a "no space left on device" error when building the Docker image, try
 
 ```bash
 docker rm $(docker ps -q -f 'status=exited')
 docker rmi $(docker images -q -f "dangling=true")
 ```
 
-See, eg, https://forums.docker.com/t/no-space-left-on-device-error/10894/14.
-
 ### Build the base training image
 
 ```bash
@@ -24,17 +22,27 @@ ENVIRONMENT=dev bash scripts/build_training_image.sh
 ```
 
 ### Do interactive model training and data exploration in the Jupyter notebook
 
 ```bash
-ENVIRONMENT=dev bash scripts/run_training_container.sh
+ENVIRONMENT=dev bash scripts/run_training_container.sh -c "jupyter notebook notebooks/ --allow-root --ip=0.0.0.0 --port=8888 --no-browser"
 ```
 
-Then open [http://localhost:8888](http://localhost:8888).
+Then open [http://localhost:8888](http://localhost:8888) to run Jupyter.
 
 ### Train a model programmatically
 
 ```bash
-ENVIRONMENT=dev RUNID=`date +%Y%m%d` bash scripts/run_training_container.sh scripts/train.sh
+ENVIRONMENT=dev bash scripts/run_training_container.sh scripts/train.sh
 ```
 
+### Pushing the new Docker image to production for the training and API services
+
+If this is your first and only ECR repo, then run
+
+```bash
+bash scripts/push_image.sh $(aws ecr describe-repositories | jq -r '.repositories[0].repositoryUri')
+```
+
+If you have multiple ECR repos, you'll have to change the argument so that it points to the one you want to push to.
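If your repo isn't first in the list, selecting it by name should work too. A sketch, assuming the repo is named `pmip` to match the image tag that `scripts/push_image.sh` pushes (your repo name may differ):

```bash
bash scripts/push_image.sh $(aws ecr describe-repositories | jq -r '.repositories[] | select(.repositoryName == "pmip") | .repositoryUri')
```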
+ ## Resources * https://github.com/pypa/sampleproject/blob/master/setup.py \ No newline at end of file diff --git a/notebooks/model-training.ipynb b/notebooks/model-training.ipynb index 31a3550..1ab648f 100644 --- a/notebooks/model-training.ipynb +++ b/notebooks/model-training.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "tags": [ "parameters" @@ -47,47 +47,7 @@ }, "outputs": [], "source": [ - "RUNID=datetime.now().strftime(\"%Y%m%d\")\n", - "DATA_DIR=os.path.join(\"..\", \"data\", RUNID)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip_archive\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mzip_ref\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mzip_ref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextractall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDATA_DIR\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36mextractall\u001b[0;34m(self, path, members, pwd)\u001b[0m\n\u001b[1;32m 1614\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1615\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mzipinfo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmembers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1616\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_extract_member\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzipinfo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1617\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1618\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36m_extract_member\u001b[0;34m(self, member, targetpath, pwd)\u001b[0m\n\u001b[1;32m 1669\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmember\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpwd\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpwd\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1670\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtargetpath\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m\"wb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1671\u001b[0;31m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopyfileobj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1672\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1673\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtargetpath\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/shutil.py\u001b[0m in \u001b[0;36mcopyfileobj\u001b[0;34m(fsrc, fdst, length)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0;34m\"\"\"copy data from file-like object fsrc to file-like object fdst\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 79\u001b[0;31m \u001b[0mbuf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfsrc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlength\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 80\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mbuf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_offset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 898\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_eof\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 899\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read1\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 900\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 901\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_readbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.7/zipfile.py\u001b[0m in \u001b[0;36m_read1\u001b[0;34m(self, n)\u001b[0m\n\u001b[1;32m 973\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_type\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mZIP_DEFLATED\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMIN_READ_SIZE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 975\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decompressor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecompress\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 976\u001b[0m self._eof = (self._decompressor.eof or\n\u001b[1;32m 977\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compress_left\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "# load the GloVe vectors in a dictionary:\n", - "\n", - "glove_data_dir = os.path.join(DATA_DIR, \"..\", \"glove\", \"glove.840B.300d.txt\")\n", - "\n", - "if not os.path.exists(os.path.join(glove_data_dir, \"glove.840B.300d.txt\")):\n", - " zip_archive = os.path.join(glove_data_dir, \"glove.840B.300d.zip\")\n", - " if not os.path.isdir(glove_data_dir):\n", - " os.makedirs(glove_data_dir)\n", - " urllib.request.urlretrieve(\n", - " \"http://www-nlp.stanford.edu/data/glove.840B.300d.zip\", \n", - " zip_archive,\n", - " )\n", - " with zipfile.ZipFile(zip_archive, \"r\") as zip_ref:\n", - " zip_ref.extractall(glove_data_dir)\n" + "DATA_DIR=os.path.join(\"..\", \"data\")" ] }, { @@ -95,41 +55,6 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "\n", - "embeddings_index = {}\n", - "with open(os.path.join(DATA_DIR, 'glove.840B.300d.txt'), 'r') as f:\n", - " for line in tqdm(f):\n", - " values = line.split()\n", - " word = values[0]\n", - " coefs = np.asarray(values[1:], dtype='float32')\n", - " embeddings_index[word] = coefs\n", - "\n", - "print('Found %s word vectors.' % len(embeddings_index))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Youtube03-LMFAO.csv',\n", - " 'Youtube04-Eminem.csv',\n", - " 'Youtube05-Shakira.csv',\n", - " 'Youtube02-KatyPerry.csv',\n", - " '__MACOSX',\n", - " 'YouTube-Spam-Collection-v1.zip',\n", - " 'Youtube01-Psy.csv']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "if os.getenv(\"ENVIRONMENT\", \"\") == \"dev\":\n", " zip_archive = os.path.join(DATA_DIR, \"YouTube-Spam-Collection-v1.zip\")\n", @@ -148,165 +73,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
COMMENT_IDAUTHORDATECONTENTCLASS
1220_2viQ_Qnc69Nq0Ytk1jCpzWPCrpGEk6T7cdVAxfSlAkShadrach Grentz2013-07-29T17:39:24.876000Hey Music Fans I really appreciate any of you ...1
964z13rgdjjjzmkuhnvf23dd1wxkqzwvjiij04Jessica Onyekwere2015-05-23T17:42:14.383000This song is special, because is a song for Af...0
96z13qwl2rznzohbhqy04ch3cy5tnihrkhlt40kThejaynetts2015-05-23T00:53:59.385000Never get old 0
1566z13sx5nrhq22yvste23fvnirosixy55ag04The Robot Green Hypno2014-11-07T13:37:51i like this song the video goes perfect with it0
1113_2viQ_Qnc68hNPCfXGAxIxW9V7wcDDxSdp-gyHTkghoranferi delgado2013-10-04T03:56:04.784000best song eva0
1813z12ufrszxq3zstw0r22yfbipvqvaypoldjosson 642014-11-07T15:58:57SUPER!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...0
1512z12dynswht2rujq1e22xi5dappq1vrlh504ali aydın2014-10-25T16:59:02I like you . Katy Perry 600▲60▲6▲0
884z12hfp2wmyuqztkw504cgblyxtbsxjuzeow0kJesse Pinkman2015-05-06T11:42:44.601000Rihanna looks so beautiful with red hair ;)0
1931z12gwldoxpvgjru4004cj3fxyvvvwffjqjgRipazha Gaming2014-11-12T17:37:08http://hackfbaccountlive.com/?ref=52425751
1082_2viQ_Qnc69vgWhC2acrKSH-tvjKq1KuKBca1UtB8wkLouis Bryant2013-10-12T15:20:19.887000You guys should check out this EXTRAORDINARY w...1
\n", - "
" - ], - "text/plain": [ - " COMMENT_ID AUTHOR \\\n", - "1220 _2viQ_Qnc69Nq0Ytk1jCpzWPCrpGEk6T7cdVAxfSlAk Shadrach Grentz \n", - "964 z13rgdjjjzmkuhnvf23dd1wxkqzwvjiij04 Jessica Onyekwere \n", - "96 z13qwl2rznzohbhqy04ch3cy5tnihrkhlt40k Thejaynetts \n", - "1566 z13sx5nrhq22yvste23fvnirosixy55ag04 The Robot Green Hypno \n", - "1113 _2viQ_Qnc68hNPCfXGAxIxW9V7wcDDxSdp-gyHTkgho ranferi delgado \n", - "1813 z12ufrszxq3zstw0r22yfbipvqvaypold josson 64 \n", - "1512 z12dynswht2rujq1e22xi5dappq1vrlh504 ali aydın \n", - "884 z12hfp2wmyuqztkw504cgblyxtbsxjuzeow0k Jesse Pinkman \n", - "1931 z12gwldoxpvgjru4004cj3fxyvvvwffjqjg Ripazha Gaming \n", - "1082 _2viQ_Qnc69vgWhC2acrKSH-tvjKq1KuKBca1UtB8wk Louis Bryant \n", - "\n", - " DATE \\\n", - "1220 2013-07-29T17:39:24.876000 \n", - "964 2015-05-23T17:42:14.383000 \n", - "96 2015-05-23T00:53:59.385000 \n", - "1566 2014-11-07T13:37:51 \n", - "1113 2013-10-04T03:56:04.784000 \n", - "1813 2014-11-07T15:58:57 \n", - "1512 2014-10-25T16:59:02 \n", - "884 2015-05-06T11:42:44.601000 \n", - "1931 2014-11-12T17:37:08 \n", - "1082 2013-10-12T15:20:19.887000 \n", - "\n", - " CONTENT CLASS \n", - "1220 Hey Music Fans I really appreciate any of you ... 1 \n", - "964 This song is special, because is a song for Af... 0 \n", - "96 Never get old  0 \n", - "1566 i like this song the video goes perfect with it 0 \n", - "1113 best song eva 0 \n", - "1813 SUPER!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!... 0 \n", - "1512 I like you . Katy Perry 600▲60▲6▲ 0 \n", - "884 Rihanna looks so beautiful with red hair ;) 0 \n", - "1931 http://hackfbaccountlive.com/?ref=5242575 1 \n", - "1082 You guys should check out this EXTRAORDINARY w... 1 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "training_df_list = []\n", "for file_ in [os.path.join(DATA_DIR, file) for file in training_files if file.endswith(\".csv\")]:\n", @@ -326,18 +95,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1760,)\n", - "(196,)\n" - ] - } - ], + "outputs": [], "source": [ "xtrain, xvalid, ytrain, yvalid = train_test_split(\n", " training_df.CONTENT, \n", @@ -353,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -379,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -400,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -411,53 +171,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 2 folds for each of 12 candidates, totalling 24 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 9.2s\n", - "[Parallel(n_jobs=-1)]: Done 6 tasks | elapsed: 13.5s\n", - "[Parallel(n_jobs=-1)]: Done 13 tasks | elapsed: 22.4s\n", - "[Parallel(n_jobs=-1)]: Done 16 out of 24 | elapsed: 28.7s remaining: 14.4s\n", - "[Parallel(n_jobs=-1)]: Done 19 out of 24 | elapsed: 31.1s remaining: 8.2s\n", - "[Parallel(n_jobs=-1)]: Done 22 out of 24 | elapsed: 34.3s remaining: 3.1s\n", - 
"[Parallel(n_jobs=-1)]: Done 24 out of 24 | elapsed: 35.3s finished\n", - "/usr/local/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", - " FutureWarning)\n" - ] - }, - { - "data": { - "text/plain": [ - "GridSearchCV(cv=2, error_score='raise-deprecating',\n", - " estimator=Pipeline(memory=None,\n", - " steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,\n", - " random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, max_iter=100, multi_class='warn',\n", - " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", - " tol=0.0001, verbose=0, warm_start=False))]),\n", - " fit_params=None, iid=True, n_jobs=-1,\n", - " param_grid={'svd__n_components': [120, 180], 'lr__C': [0.1, 1.0, 10], 'lr__penalty': ['l1', 'l2']},\n", - " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", - " scoring=None, verbose=10)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Initialize Grid Search Model\n", "model = GridSearchCV(\n", @@ -476,21 +192,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best score: 0.939\n", - "Best parameters set:\n", - "\tlr__C: 1.0\n", - "\tlr__penalty: 'l1'\n", - "\tsvd__n_components: 180\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Best score: %0.3f\" % model.best_score_)\n", "print(\"Best parameters set:\")\n", @@ -505,8 +209,15 @@ "metadata": {}, "outputs": [], "source": [ - "pickle_to_fs()" + "pickle_to_fs(model.best_estimator_, filename=\"model.pkl\", subdirectory=DATA_DIR)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 9031a9e..f33876a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ xgboost sklearn nltk jupyter -papermill \ No newline at end of file +papermill +awscli \ No newline at end of file diff --git a/scripts/get_glove.sh b/scripts/get_glove.sh index 906dbee..73b09e4 100644 --- a/scripts/get_glove.sh +++ b/scripts/get_glove.sh @@ -6,4 +6,4 @@ if [ ! 
-f "data/glove/glove.840B.300d.txt" ]; then mv glove.840B.300d.zip data/glove unzip data/glove/glove.840B.300d.zip -d data/glove rm data/glove/glove.840B.300d.zip -figo \ No newline at end of file +fi \ No newline at end of file diff --git a/scripts/get_latest_model.sh b/scripts/get_latest_model.sh new file mode 100755 index 0000000..57d615d --- /dev/null +++ b/scripts/get_latest_model.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -e + +BUCKET="s3://fwhigh-predictive-models" +MODEL_ID=$(aws s3 ls ${BUCKET}/models/ | awk '$1~/PRE/ {print $2}' | sed 's/\///g' | sort -nr | head -n 1) +S3_DIR=$BUCKET/models/$MODEL_ID +DIR=data + +echo Getting data from $S3_DIR +echo Writing it to $DIR + +mkdir -p $DIR + +aws s3 cp --recursive --exclude "*" --include "model.pkl" $S3_DIR/ $DIR/ \ No newline at end of file diff --git a/scripts/get_training_data.sh b/scripts/get_training_data.sh index 9a3ef0a..5f3a337 100644 --- a/scripts/get_training_data.sh +++ b/scripts/get_training_data.sh @@ -1,9 +1,11 @@ #!/usr/bin/env bash -set -e +S3_DIR=$1 +DIR=$2 -$DIR=$1 +echo Getting data from $S3_DIR +echo Writing it to $DIR +aws s3 cp --recursive $S3_DIR/ $DIR/ cd $DIR -wget https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip -unzip YouTube-Spam-Collection-v1.zip +unzip -v *.zip \ No newline at end of file diff --git a/scripts/install.sh b/scripts/install.sh index 8f26f5b..718c08d 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -1,12 +1,3 @@ #!/usr/bin/env bash -if [ "$ENVIRONMENT" == "prod" ]; then - echo "Installing pmip package in prod environment" - pip install -U --upgrade-strategy only-if-needed -e . -elif [ "$ENVIRONMENT" == "staging" ]; then - echo "Installing pmip package in staging environment" - pip install -U --upgrade-strategy only-if-needed . -else - echo "Installing pmip package in dev environment" - pip install -U --upgrade-strategy only-if-needed . -fi \ No newline at end of file +pip install -U --upgrade-strategy only-if-needed -e . \ No newline at end of file diff --git a/scripts/push_image.sh b/scripts/push_image.sh new file mode 100755 index 0000000..daae8a8 --- /dev/null +++ b/scripts/push_image.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +ECR_URI=$1 +AWS_REGION=$(aws configure get region) + +IMAGE_TAG=latest + +$(aws ecr get-login --no-include-email --region $AWS_REGION) +docker tag pmip:staging ${ECR_URI}:${IMAGE_TAG} +docker push ${ECR_URI}:${IMAGE_TAG} \ No newline at end of file diff --git a/scripts/train.sh b/scripts/train.sh index 65fa821..46d609a 100644 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -2,26 +2,28 @@ set -e -RUNID=`date +%Y%m%d` -DATA_DIR=data/${RUNID} -BUCKET="s3://predictive-models" -S3_DATA_DIR=${BUCKET}/${RUNID} +BUCKET="s3://fwhigh-predictive-models" +MODEL_ID=`date +%Y%m%d` +S3_DIR=$BUCKET/models/$MODEL_ID +DIR=data -mkdir -p DATA_DIR +TRAINING_ID=$(aws s3 ls $BUCKET/training/ | awk '$1~/PRE/ {print $2}' | sed 's/\///g' | sort -nr | head -n 1) + +mkdir -p $DIR # Get the data. Replace this line with something like: -# aws s3 cp $BUCKET/training-data/ $DATA_DIR/ +# aws s3 cp $BUCKET/training-data/ $DIR/ # to train on new data that's placed into S3 directly. 
-bash scripts/get_training_data.sh ${DATA_DIR}
+bash scripts/get_training_data.sh $BUCKET/training/$TRAINING_ID $DIR
 
 # Train the model
-papermill notebooks/model-training.ipynb ${DATA_DIR}/model-training-${RUNID}.ipynb \
-    -p RUNID ${RUNID} -p DATA_DIR ${DATA_DIR}
+papermill notebooks/model-training.ipynb $DIR/model-training-$MODEL_ID.ipynb -p DATA_DIR $DIR
 
 # Convert the notebook into HTML
-jupyter nbconvert --to html ${DATA_DIR}/model-training-${RUNID}.ipynb
+jupyter nbconvert --to html $DIR/model-training-$MODEL_ID.ipynb
 
 # Push any assets to the cloud
 if [ "$ENVIRONMENT" == "staging" ]; then
-    aws s3 cp --exclude * --include *.ipynb *.html *.pkl ${DATA_DIR}/ S3_DATA_DIR/
+    echo Pushing model to S3
+    aws s3 cp --recursive --exclude "*" --include "*.ipynb" --include "*.html" --include "*.pkl" $DIR/ $S3_DIR/
 fi
\ No newline at end of file
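For context beyond this patch: the `model.pkl` that `scripts/train.sh` pushes and `scripts/get_latest_model.sh` pulls back down is the grid search's best estimator pickled by the notebook, so the API side can unpickle it directly. A minimal sketch, assuming the process runs from the repo root so that `data/model.pkl` (the `DIR` and `--include` values in `get_latest_model.sh`) resolves; the serving wiring itself is not part of this patch:

```python
import os
import pickle

# Path mirrors DIR=data and --include "model.pkl" in scripts/get_latest_model.sh.
MODEL_PATH = os.path.join("data", "model.pkl")

with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

# The unpickled object is the pipeline chosen by GridSearchCV
# (TruncatedSVD -> StandardScaler -> LogisticRegression), so it exposes the
# usual sklearn predict/predict_proba interface on the TF-IDF features
# built upstream in the notebook.
print(type(model))
```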