Skip to content

Commit

Permalink
batch-model-training (#121)
Browse files Browse the repository at this point in the history
* batch compute env for automated model training

revised batch setup for model training terraform

black and flake8 code checks

using AWS base image

using aws tensorflow base image, cleaner script org for model training

modified for batch deploy

more flexible for parallel batch compute jobs

modified job defs for batch model training

fixed minor bugs

fixed certs error, using ubuntu base image

apt install awscli instead of pip, removed user

fix for installing awscli noninteractively

install awscli using curl

update argparsing to allow single model training, fix kfold val bug

black and flake8 formatting

fixed typo

set default options for main script

save and upload models iteratively

bugfix: iteratively save models

using single jobdef for model training

* bugfixes, updated to model-training image to python 3.9.5 and tensorflow 2.5.0

* black and flake8

* removed keras dependency, using tensorflow keras only
  • Loading branch information
alphasentaurii authored Jun 15, 2021
1 parent 90c7856 commit 63e7a47
Show file tree
Hide file tree
Showing 11 changed files with 5,213 additions and 27 deletions.
49 changes: 24 additions & 25 deletions lambda/JobPredict/predict_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,26 @@
# mitigation of potential API rate restrictions (esp for Batch API)
retry_config = Config(retries={"max_attempts": 5, "mode": "standard"})
s3 = boto3.resource("s3", config=retry_config)
client = boto3.client("s3", config=retry_config)


def get_model(model_path):
    """Load and return a pretrained Keras functional model from *model_path*."""
    return tf.keras.models.load_model(model_path)


def classifier(model, data):
    """Run *model* on *data*; return (predicted class index, raw probabilities)."""
    proba = model.predict(data)
    label = int(np.argmax(proba, axis=-1))
    return label, proba


def regressor(model, data):
    """Return the raw regression output of *model* for *data*."""
    return model.predict(data)


class Preprocess:
Expand Down Expand Up @@ -117,31 +137,6 @@ def transformer(self):
return X


def get_model(model_path):
    """Deserialize a saved Keras functional model located at *model_path*."""
    loaded = tf.keras.models.load_model(model_path)
    return loaded


def classifier(model, data):
    """Predict a class for *data*; return (class index, probability array)."""
    probabilities = model.predict(data)
    predicted_class = int(np.argmax(probabilities, axis=-1))
    return predicted_class, probabilities


def regressor(model, data):
    """Forward *data* through *model* and return its regression prediction."""
    prediction = model.predict(data)
    return prediction


# Load the three pretrained networks once at module import so repeated
# handler calls reuse them instead of re-reading from disk.
_MODEL_DIRS = ("./models/mem_clf/", "./models/mem_reg/", "./models/wall_reg/")
clf, mem_reg, wall_reg = (get_model(path) for path in _MODEL_DIRS)


def lambda_handler(event, context):
"""Predict Resource Allocation requirements for memory (GB) and max execution `kill time` / `wallclock` (seconds) using three pre-trained neural networks. This lambda is invoked from the Job Submit lambda which json.dumps the s3 bucket and key to the file containing job input parameters. The path to the text file in s3 assumes the following format: `control/ipppssoot/ipppssoot_MemModelFeatures.txt`.
Expand All @@ -158,6 +153,10 @@ def lambda_handler(event, context):
MEMORY REGRESSION: A third regression model is used to estimate the actual value of memory needed for the job. This is mainly for the purpose of logging/future analysis and is not currently being used for allocating memory in calcloud jobs.
"""
bucket_name = event["Bucket"]
# load models
clf = get_model("./models/mem_clf/")
mem_reg = get_model("./models/mem_reg/")
wall_reg = get_model("./models/wall_reg/")
key = event["Key"]
ipppssoot = event["Ipppssoot"]
prep = Preprocess(ipppssoot, bucket_name, key)
Expand Down
15 changes: 15 additions & 0 deletions modeling/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Ubuntu base image (rather than an AWS/TF base) so the custom CA bundle can be installed
FROM library/ubuntu:20.04
#SSL/TLS cert setup for STScI AWS firewalling
USER root
# Point requests and curl at the CA bundle copied in below
ENV REQUESTS_CA_BUNDLE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
ENV CURL_CA_BUNDLE=/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
RUN mkdir -p /etc/ssl/certs/ && mkdir -p /etc/pki/ca-trust/extracted/pem/
COPY certs/tls-ca-bundle.pem /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem
# Symlink the bundle into the standard Debian/OpenSSL certificate locations
RUN ln -s /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem /etc/ssl/certs/ca-bundle.crt && ln -s /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem /etc/ssl/certs/ca-certificates.crt && mkdir -p /etc/pki/ca-trust/extracted/openssl
# Build Python 3.9.5 from source (no distro package for it on 20.04), then install
# the AWS CLI v2 from the official zip installer via curl
RUN DEBIAN_FRONTEND=noninteractive && NCORES=`nproc` && apt update && apt upgrade -y && apt install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev wget libbz2-dev liblzma-dev curl unzip && wget https://www.python.org/ftp/python/3.9.5/Python-3.9.5.tgz && tar -xf Python-3.9.5.tgz && cd Python-3.9.5 && ./configure --enable-optimizations && make -j $NCORES && make altinstall && update-alternatives --install /usr/local/bin/python python /usr/local/bin/python3.9 10 && cd ../ && curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && unzip awscliv2.zip && ./aws/install

WORKDIR /home/developer
COPY requirements.txt ./
# Install Python dependencies and create the package directory for the training code
RUN python -m pip install --upgrade pip && python -m pip install -r requirements.txt && mkdir -p /home/developer/modeling
# NOTE(review): io.py shadows the stdlib `io` module name within the package — confirm intended
ADD main.py io.py ingest.py train.py ./modeling/
CMD ["python", "-m", "modeling.main"]
3,814 changes: 3,814 additions & 0 deletions modeling/certs/tls-ca-bundle.pem

Large diffs are not rendered by default.

Loading

0 comments on commit 63e7a47

Please sign in to comment.