From acecbc3e3e61930110df5cd9da2ad89526c0e2c9 Mon Sep 17 00:00:00 2001
From: Alexandre Lissy <lissyx@lissyx.dyndns.org>
Date: Thu, 8 Apr 2021 22:21:24 +0200
Subject: [PATCH] Optimize a bit Docker

---
 .github/workflows/docker.yml |   9 +-
 Dockerfile.build.tmpl        | 157 +++++++++++++++--------------------
 Dockerfile.train.tmpl        |  64 ++++++--------
 Makefile                     |   2 +-
 4 files changed, 101 insertions(+), 131 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 742920a1c5..beeb95a1ba 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -14,11 +14,14 @@ jobs:
     steps:
       - uses: actions/checkout@v2
         with:
-          fetch-depth: 0
-          submodules: 'recursive'
+          fetch-depth: 1
       - run: |
           make Dockerfile.${{ matrix.template }} \
             DEEPSPEECH_REPO=https://github.com/${{ github.repository }} \
             DEEPSPEECH_SHA=${{ github.sha }}
       - run: |
-          docker build -t app:${{ matrix.template }} -f Dockerfile.${{ matrix.template }} .
+          mkdir -p /tmp/empty  # empty directory used as the docker build context
+      - run: |
+          cd /tmp/empty && docker build -t app:${{ matrix.template }} -f ${{ github.workspace }}/Dockerfile.${{ matrix.template }} .
+      - run: |
+          set -o pipefail; docker save app:${{ matrix.template }} | zstd -o app_${{ matrix.template }}.zstd
diff --git a/Dockerfile.build.tmpl b/Dockerfile.build.tmpl
index a8c2f63c23..73638dbeb4 100644
--- a/Dockerfile.build.tmpl
+++ b/Dockerfile.build.tmpl
@@ -3,8 +3,8 @@
 # Need devel version cause we need /usr/include/cudnn.h 
 FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
 
-ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO#
-ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA#
+ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# \
+    DEEPSPEECH_SHA=#DEEPSPEECH_SHA#
 
 # >> START Install base software
 
@@ -39,62 +39,59 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     sox \
     unzip \
     wget \
-    zlib1g-dev
-
-RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
-
-# Install Bazel
-RUN curl -LO "https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel_3.1.0-linux-x86_64.deb"
-RUN dpkg -i bazel_*.deb
-
-# Try and free some space
-RUN rm -rf /var/lib/apt/lists/*
+    zlib1g-dev && \
+    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
+    # Install Bazel (steps are chained with && so any failure aborts the build) \
+    curl -LO "https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel_3.1.0-linux-x86_64.deb" && dpkg -i bazel_*.deb && \
+    # Free some space in the same layer: apt lists and the downloaded Bazel package \
+    rm -rf /var/lib/apt/lists/* bazel_*.deb
 
 # << END Install base software
 
 # >> START Configure Tensorflow Build
 
 # GPU Environment Setup
-ENV TF_NEED_ROCM 0
-ENV TF_NEED_OPENCL_SYCL 0
-ENV TF_NEED_OPENCL 0
-ENV TF_NEED_CUDA 1
-ENV TF_CUDA_PATHS "/usr,/usr/local/cuda-10.1,/usr/lib/x86_64-linux-gnu/"
-ENV TF_CUDA_VERSION 10.1
-ENV TF_CUDNN_VERSION 7.6
-ENV TF_CUDA_COMPUTE_CAPABILITIES 6.0
-ENV TF_NCCL_VERSION 2.8
-
-# Common Environment Setup
-ENV TF_BUILD_CONTAINER_TYPE GPU
-ENV TF_BUILD_OPTIONS OPT
-ENV TF_BUILD_DISABLE_GCP 1
-ENV TF_BUILD_ENABLE_XLA 0
-ENV TF_BUILD_PYTHON_VERSION PYTHON3
-ENV TF_BUILD_IS_OPT OPT
-ENV TF_BUILD_IS_PIP PIP
-
-# Other Parameters
-ENV CC_OPT_FLAGS -mavx -mavx2 -msse4.1 -msse4.2 -mfma
-ENV TF_NEED_GCP 0
-ENV TF_NEED_HDFS 0
-ENV TF_NEED_JEMALLOC 1
-ENV TF_NEED_OPENCL 0
-ENV TF_CUDA_CLANG 0
-ENV TF_NEED_MKL 0
-ENV TF_ENABLE_XLA 0
-ENV TF_NEED_AWS 0
-ENV TF_NEED_KAFKA 0
-ENV TF_NEED_NGRAPH 0
-ENV TF_DOWNLOAD_CLANG 0
-ENV TF_NEED_TENSORRT 0
-ENV TF_NEED_GDR 0
-ENV TF_NEED_VERBS 0
-ENV TF_NEED_OPENCL_SYCL 0
-
-ENV PYTHON_BIN_PATH /usr/bin/python3.6
-ENV PYTHON_LIB_PATH /usr/local/lib/python3.6/dist-packages
+ENV TF_NEED_ROCM=0 \
+    TF_NEED_OPENCL_SYCL=0 \
+    TF_NEED_OPENCL=0 \
+    TF_NEED_CUDA=1 \
+    TF_CUDA_PATHS="/usr,/usr/local/cuda-10.1,/usr/lib/x86_64-linux-gnu/" \
+    TF_CUDA_VERSION=10.1 \
+    TF_CUDNN_VERSION=7.6 \
+    TF_CUDA_COMPUTE_CAPABILITIES=6.0 \
+    TF_NCCL_VERSION=2.8 \
+    # Common Environment Setup \
+    TF_BUILD_CONTAINER_TYPE=GPU \
+    TF_BUILD_OPTIONS=OPT \
+    TF_BUILD_DISABLE_GCP=1 \
+    TF_BUILD_ENABLE_XLA=0 \
+    TF_BUILD_PYTHON_VERSION=PYTHON3 \
+    TF_BUILD_IS_OPT=OPT \
+    TF_BUILD_IS_PIP=PIP \
+    # TensorFlow tree location, set for building client.cc and the Python client and decoder bindings \
+    TFDIR=/DeepSpeech/tensorflow \
+    # Allow Python to print UTF-8 \
+    PYTHONIOENCODING=UTF-8 \
+    # Other Parameters (TF_NEED_OPENCL and TF_NEED_OPENCL_SYCL repeat the GPU section above with the same value) \
+    CC_OPT_FLAGS="-mavx -mavx2 -msse4.1 -msse4.2 -mfma" \
+    TF_NEED_GCP=0 \
+    TF_NEED_HDFS=0 \
+    TF_NEED_JEMALLOC=1 \
+    TF_NEED_OPENCL=0 \
+    TF_CUDA_CLANG=0 \
+    TF_NEED_MKL=0 \
+    TF_ENABLE_XLA=0 \
+    TF_NEED_AWS=0 \
+    TF_NEED_KAFKA=0 \
+    TF_NEED_NGRAPH=0 \
+    TF_DOWNLOAD_CLANG=0 \
+    TF_NEED_TENSORRT=0 \
+    TF_NEED_GDR=0 \
+    TF_NEED_VERBS=0 \
+    TF_NEED_OPENCL_SYCL=0 \
+    PYTHON_BIN_PATH=/usr/bin/python3.6 \
+    PYTHON_LIB_PATH=/usr/local/lib/python3.6/dist-packages
 
 # << END Configure Tensorflow Build
 
@@ -103,29 +100,23 @@ ENV PYTHON_LIB_PATH /usr/local/lib/python3.6/dist-packages
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
 # Similarly, we need to workaround sandboxing issues:
 #   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
+RUN printf '%s\n' "startup --batch" \
+    "build --spawn_strategy=standalone --genrule_strategy=standalone" >>/etc/bazel.bazelrc
 
 # << END Configure Bazel
 
 WORKDIR /
 
-RUN git clone --recursive $DEEPSPEECH_REPO DeepSpeech
-WORKDIR /DeepSpeech
-RUN git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA
-RUN git submodule sync tensorflow/ && git submodule update --init tensorflow/
-RUN git submodule sync kenlm/ && git submodule update --init kenlm/
+RUN git clone --recursive $DEEPSPEECH_REPO DeepSpeech && \
+    cd /DeepSpeech && \
+    git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA && \
+    git submodule sync tensorflow/ && git submodule update --init tensorflow/ && \
+    git submodule sync kenlm/ && git submodule update --init kenlm/
 
 # >> START Build and bind
-
-WORKDIR /DeepSpeech/tensorflow
-
 # Fix for not found script https://github.com/tensorflow/tensorflow/issues/471
-RUN ./configure
-
 # Using CPU optimizations:
 # -mtune=generic -march=x86-64 -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx.
 # Adding --config=cuda flag to build using CUDA.
@@ -133,7 +124,7 @@ RUN ./configure
 # passing LD_LIBRARY_PATH is required cause Bazel doesn't pickup it from environment
 
 # Build DeepSpeech
-RUN bazel build \
+RUN cd /DeepSpeech/tensorflow && ./configure && bazel build \
 	--workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" \
 	--config=monolithic \
 	--config=cuda \
@@ -151,36 +142,22 @@ RUN bazel build \
 	--copt=-fvisibility=hidden \
 	//native_client:libdeepspeech.so \
 	--verbose_failures \
-	--action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
+	--action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} && \
+    cp bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ && \
+    rm -fr /root/.cache/*
 
-# Copy built libs to /DeepSpeech/native_client
-RUN cp bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/
-
-# Build client.cc and install Python client and decoder bindings
-ENV TFDIR /DeepSpeech/tensorflow
-
-RUN nproc
-
-WORKDIR /DeepSpeech/native_client
-RUN make NUM_PROCESSES=$(nproc) deepspeech
-
-WORKDIR /DeepSpeech
-RUN cd native_client/python && make NUM_PROCESSES=$(nproc) bindings
-RUN pip3 install --upgrade native_client/python/dist/*.whl
-
-RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
-RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
+RUN cd /DeepSpeech/native_client && make NUM_PROCESSES=$(nproc) deepspeech && \
+    cd /DeepSpeech/native_client/python && make NUM_PROCESSES=$(nproc) bindings && \
+    pip3 install --upgrade dist/*.whl && \
+    cd /DeepSpeech/native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings && \
+    pip3 install --upgrade dist/*.whl
 
 # << END Build and bind
 
-# Allow Python printing utf-8
-ENV PYTHONIOENCODING UTF-8
-
 # Build KenLM in /DeepSpeech/kenlm folder
 WORKDIR /DeepSpeech/kenlm
-RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj
-RUN ls -hal
-RUN mkdir -p build && \
+RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj && \
+    mkdir -p build && \
     cd build && \
     EIGEN3_ROOT=/DeepSpeech/kenlm/eigen-3.3.8 cmake .. && \
     make -j $(nproc)
diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl
index 9332ad74b6..c73dcaf6ca 100644
--- a/Dockerfile.train.tmpl
+++ b/Dockerfile.train.tmpl
@@ -1,10 +1,9 @@
 # Please refer to the TRAINING documentation, "Basic Dockerfile for training"
 
 FROM tensorflow/tensorflow:1.15.4-gpu-py3
-ENV DEBIAN_FRONTEND=noninteractive
-
-ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO#
-ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA#
+ENV DEBIAN_FRONTEND=noninteractive \
+    DEEPSPEECH_REPO=#DEEPSPEECH_REPO# \
+    DEEPSPEECH_SHA=#DEEPSPEECH_SHA#
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-utils \
@@ -20,48 +19,39 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     python3-venv \
     unzip \
     xz-utils \
-    wget
-
-# We need to remove it because it's breaking deepspeech install later with
-# weird errors about setuptools
-RUN apt-get purge -y python3-xdg
-
-# Install dependencies for audio augmentation
-RUN apt-get install -y --no-install-recommends libopus0 libsndfile1
-
-# Try and free some space
-RUN rm -rf /var/lib/apt/lists/*
+    wget && \
+    # We need to remove it because it's breaking deepspeech install later with \
+    # weird errors about setuptools \
+    apt-get purge -y python3-xdg && \
+    # Install dependencies for audio augmentation \
+    apt-get install -y --no-install-recommends libopus0 libsndfile1 && \
+    # Try and free some space \
+    rm -rf /var/lib/apt/lists/*
 
 WORKDIR /
-RUN git clone $DEEPSPEECH_REPO DeepSpeech
-
-WORKDIR /DeepSpeech
-RUN git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA
-RUN git submodule sync kenlm/ && git submodule update --init kenlm/
+RUN git clone $DEEPSPEECH_REPO DeepSpeech && \
+    cd /DeepSpeech && git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA && \
+    git submodule sync kenlm/ && git submodule update --init kenlm/
 
 # Build CTC decoder first, to avoid clashes on incompatible versions upgrades
-RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
-RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
+RUN cd /DeepSpeech/native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings && \
+    pip3 install --upgrade dist/*.whl
 
 # Prepare deps
-RUN pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0
-
-# Install DeepSpeech
-#  - No need for the decoder since we did it earlier
-#  - There is already correct TensorFlow GPU installed on the base image,
-#    we don't want to break that
-RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
-
-# Tool to convert output graph for inference
-RUN curl -vsSL https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/linux.amd64.convert_graphdef_memmapped_format.xz | xz -d > convert_graphdef_memmapped_format
-
-RUN chmod +x convert_graphdef_memmapped_format
+RUN cd /DeepSpeech && pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0 && \
+    # Install DeepSpeech \
+    #  - No need for the decoder since we did it earlier \
+    #  - There is already correct TensorFlow GPU installed on the base image, \
+    #    we don't want to break that \
+    DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . && \
+    # Tool to convert output graph for inference \
+    curl -vsSL https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/linux.amd64.convert_graphdef_memmapped_format.xz | xz -d > convert_graphdef_memmapped_format && \
+    chmod +x convert_graphdef_memmapped_format
 
 # Build KenLM to generate new scorers
 WORKDIR /DeepSpeech/kenlm
-RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj
-RUN ls -hal
-RUN mkdir -p build && \
+RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj && \
+    mkdir -p build && \
     cd build && \
     EIGEN3_ROOT=/DeepSpeech/kenlm/eigen-3.3.8 cmake .. && \
     make -j $(nproc)
diff --git a/Makefile b/Makefile
index 2d28d24b94..c845cde1e3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 DEEPSPEECH_REPO ?= https://github.com/mozilla/DeepSpeech.git
-DEEPSPEECH_SHA  ?= origin/master
+DEEPSPEECH_SHA  ?= master
 
 Dockerfile%: Dockerfile%.tmpl
 	sed \