Skip to content

Commit

Permalink
batch-model-training (#121)
Browse files Browse the repository at this point in the history
* batch compute env for automated model training

revised batch setup for model training terraform

black and flake8 code checks

using AWS base image

using aws tensorflow base image, cleaner script org for model training

modified for batch deploy

more flexible for parallel batch compute jobs

modified job defs for batch model training

fixed minor bugs

fixed certs error, using ubuntu base image

apt install awscli instead of pip, removed user

fix for installing awscli noninteractively

install awscli using curl

update argparsing to allow single model training, fix kfold val bug

black and flake8 formatting

fixed typo

set default options for main script

save and upload models iteratively

bugfix: iteratively save models

using single jobdef for model training

* bugfixes, updated to model-training image to python 3.9.5 and tensorflow 2.5.0

* black and flake8

* removed keras dependency, using tensorflow keras only
  • Loading branch information
alphasentaurii authored Jun 15, 2021
1 parent 90c7856 commit 63e7a47
Show file tree
Hide file tree
Showing 11 changed files with 5,213 additions and 27 deletions.
49 changes: 24 additions & 25 deletions lambda/JobPredict/predict_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,26 @@
# mitigation of potential API rate restrictions (esp for Batch API)
retry_config = Config(retries={"max_attempts": 5, "mode": "standard"})
s3 = boto3.resource("s3", config=retry_config)
client = boto3.client("s3", config=retry_config)


def get_model(model_path):
    """Load and return a pretrained Keras functional model from *model_path*."""
    return tf.keras.models.load_model(model_path)


def classifier(model, data):
    """Run *model* on *data*; return (predicted class index, raw probabilities)."""
    proba = model.predict(data)
    label = int(np.argmax(proba, axis=-1))
    return label, proba


def regressor(model, data):
    """Return the raw regression output of *model* for *data*."""
    return model.predict(data)


class Preprocess:
Expand Down Expand Up @@ -117,31 +137,6 @@ def transformer(self):
return X


def get_model(model_path):
    """Deserialize a saved Keras functional model located at *model_path*."""
    loaded = tf.keras.models.load_model(model_path)
    return loaded


def classifier(model, data):
    """Predict a class for *data*; return (class index, probability array)."""
    probabilities = model.predict(data)
    predicted_class = int(np.argmax(probabilities, axis=-1))
    return predicted_class, probabilities


def regressor(model, data):
    """Forward *data* through *model* and return its regression prediction."""
    prediction = model.predict(data)
    return prediction


# Load the three pretrained networks once at module import so repeated
# handler calls reuse them instead of re-reading from disk.
_MODEL_DIRS = ("./models/mem_clf/", "./models/mem_reg/", "./models/wall_reg/")
clf, mem_reg, wall_reg = (get_model(path) for path in _MODEL_DIRS)


def lambda_handler(event, context):
"""Predict Resource Allocation requirements for memory (GB) and max execution `kill time` / `wallclock` (seconds) using three pre-trained neural networks. This lambda is invoked from the Job Submit lambda which json.dumps the s3 bucket and key to the file containing job input parameters. The path to the text file in s3 assumes the following format: `control/ipppssoot/ipppssoot_MemModelFeatures.txt`.
Expand All @@ -158,6 +153,10 @@ def lambda_handler(event, context):
MEMORY REGRESSION: A third regression model is used to estimate the actual value of memory needed for the job. This is mainly for the purpose of logging/future analysis and is not currently being used for allocating memory in calcloud jobs.
"""
bucket_name = event["Bucket"]
# load models
clf = get_model("./models/mem_clf/")
mem_reg = get_model("./models/mem_reg/")
wall_reg = get_model("./models/wall_reg/")
key = event["Key"]
ipppssoot = event["Ipppssoot"]
prep = Preprocess(ipppssoot, bucket_name, key)
Expand Down
15 changes: 15 additions & 0 deletions modeling/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Ubuntu base image (rather than an AWS/TF base) so the custom CA bundle can be installed
FROM library/ubuntu:20.04
#SSL/TLS cert setup for STScI AWS firewalling
USER root
# Point requests and curl at the CA bundle copied in below
ENV REQUESTS_CA_BUNDLE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
ENV CURL_CA_BUNDLE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
RUN mkdir -p /etc/ssl/certs/ && mkdir -p /etc/pki/ca-trust/extracted/pem/
COPY certs/tls-ca-bundle.pem /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
# Symlink the bundle into the standard Debian/OpenSSL certificate locations
RUN ln -s /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem /etc/ssl/certs/ca-bundle.crt && ln -s /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem /etc/ssl/certs/ca-certificates.crt && mkdir -p /etc/pki/ca-trust/extracted/openssl
# Build Python 3.9.5 from source (no distro package for it on 20.04), then install
# the AWS CLI v2 from the official zip installer via curl
RUN DEBIAN_FRONTEND=noninteractive && NCORES=`nproc` && apt update && apt upgrade -y && apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev liblzma-dev curl unzip && wget https://www.python.org/ftp/python/3.9.5/Python-3.9.5.tgz && tar -xf Python-3.9.5.tgz && cd Python-3.9.5 && ./configure --enable-optimizations && make -j $NCORES && make altinstall && update-alternatives --install /usr/local/bin/python python /usr/local/bin/python3.9 10 && cd ../ && curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && unzip awscliv2.zip && ./aws/install

WORKDIR /home/developer
COPY requirements.txt ./
# Install Python dependencies and create the package directory for the training code
RUN python -m pip install --upgrade pip && python -m pip install -r requirements.txt && mkdir -p /home/developer/modeling
# NOTE(review): io.py shadows the stdlib `io` module name within the package — confirm intended
ADD main.py io.py ingest.py train.py ./modeling/
CMD ["python", "-m", "modeling.main"]
3,814 changes: 3,814 additions & 0 deletions modeling/certs/tls-ca-bundle.pem

Large diffs are not rendered by default.

Loading

0 comments on commit 63e7a47

Please sign in to comment.