From acecbc3e3e61930110df5cd9da2ad89526c0e2c9 Mon Sep 17 00:00:00 2001 From: Alexandre Lissy Date: Thu, 8 Apr 2021 22:21:24 +0200 Subject: [PATCH] Optimize a bit Docker --- .github/workflows/docker.yml | 9 +- Dockerfile.build.tmpl | 157 +++++++++++++++-------------------- Dockerfile.train.tmpl | 64 ++++++-------- Makefile | 2 +- 4 files changed, 101 insertions(+), 131 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 742920a1c5..beeb95a1ba 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -14,11 +14,14 @@ jobs: steps: - uses: actions/checkout@v2 with: - fetch-depth: 0 - submodules: 'recursive' + fetch-depth: 1 - run: | make Dockerfile.${{ matrix.template }} \ DEEPSPEECH_REPO=https://github.com/${{ github.repository }} \ DEEPSPEECH_SHA=${{ github.sha }} - run: | - docker build -t app:${{ matrix.template }} -f Dockerfile.${{ matrix.template }} . + mkdir /tmp/empty + - run: | + cd /tmp/empty; docker build -t app:${{ matrix.template }} -f ${{ github.workspace }}/Dockerfile.${{ matrix.template }} . 
+ - run: | + docker save app:${{ matrix.template}} | zstd -o app_${{ matrix.template }}.zstd diff --git a/Dockerfile.build.tmpl b/Dockerfile.build.tmpl index a8c2f63c23..73638dbeb4 100644 --- a/Dockerfile.build.tmpl +++ b/Dockerfile.build.tmpl @@ -3,8 +3,8 @@ # Need devel version cause we need /usr/include/cudnn.h FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# -ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# +ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# \ + DEEPSPEECH_SHA=#DEEPSPEECH_SHA# # >> START Install base software @@ -39,62 +39,59 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ sox \ unzip \ wget \ - zlib1g-dev - -RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 - -# Install Bazel -RUN curl -LO "https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel_3.1.0-linux-x86_64.deb" -RUN dpkg -i bazel_*.deb - -# Try and free some space -RUN rm -rf /var/lib/apt/lists/* + zlib1g-dev && \ + update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ + # Install Bazel \ + curl -LO "https://github.com/bazelbuild/bazel/releases/download/3.1.0/bazel_3.1.0-linux-x86_64.deb" && dpkg -i bazel_*.deb && \ + # Try and free some space \ + rm -rf /var/lib/apt/lists/* bazel_*.deb # << END Install base software # >> START Configure Tensorflow Build # GPU Environment Setup -ENV TF_NEED_ROCM 0 -ENV TF_NEED_OPENCL_SYCL 0 -ENV TF_NEED_OPENCL 0 -ENV TF_NEED_CUDA 1 -ENV TF_CUDA_PATHS "/usr,/usr/local/cuda-10.1,/usr/lib/x86_64-linux-gnu/" -ENV TF_CUDA_VERSION 10.1 -ENV TF_CUDNN_VERSION 7.6 -ENV TF_CUDA_COMPUTE_CAPABILITIES 6.0 -ENV TF_NCCL_VERSION 2.8 - -# Common Environment Setup -ENV TF_BUILD_CONTAINER_TYPE GPU -ENV TF_BUILD_OPTIONS OPT -ENV TF_BUILD_DISABLE_GCP 1 -ENV TF_BUILD_ENABLE_XLA 0 -ENV TF_BUILD_PYTHON_VERSION PYTHON3 -ENV TF_BUILD_IS_OPT OPT -ENV 
TF_BUILD_IS_PIP PIP - -# Other Parameters -ENV CC_OPT_FLAGS -mavx -mavx2 -msse4.1 -msse4.2 -mfma -ENV TF_NEED_GCP 0 -ENV TF_NEED_HDFS 0 -ENV TF_NEED_JEMALLOC 1 -ENV TF_NEED_OPENCL 0 -ENV TF_CUDA_CLANG 0 -ENV TF_NEED_MKL 0 -ENV TF_ENABLE_XLA 0 -ENV TF_NEED_AWS 0 -ENV TF_NEED_KAFKA 0 -ENV TF_NEED_NGRAPH 0 -ENV TF_DOWNLOAD_CLANG 0 -ENV TF_NEED_TENSORRT 0 -ENV TF_NEED_GDR 0 -ENV TF_NEED_VERBS 0 -ENV TF_NEED_OPENCL_SYCL 0 - -ENV PYTHON_BIN_PATH /usr/bin/python3.6 -ENV PYTHON_LIB_PATH /usr/local/lib/python3.6/dist-packages +ENV TF_NEED_ROCM=0 \ + TF_NEED_OPENCL_SYCL=0 \ + TF_NEED_OPENCL=0 \ + TF_NEED_CUDA=1 \ + TF_CUDA_PATHS="/usr,/usr/local/cuda-10.1,/usr/lib/x86_64-linux-gnu/" \ + TF_CUDA_VERSION=10.1 \ + TF_CUDNN_VERSION=7.6 \ + TF_CUDA_COMPUTE_CAPABILITIES=6.0 \ + TF_NCCL_VERSION=2.8 \ + # Common Environment Setup \ + TF_BUILD_CONTAINER_TYPE=GPU \ + TF_BUILD_OPTIONS=OPT \ + TF_BUILD_DISABLE_GCP=1 \ + TF_BUILD_ENABLE_XLA=0 \ + TF_BUILD_PYTHON_VERSION=PYTHON3 \ + TF_BUILD_IS_OPT=OPT \ + TF_BUILD_IS_PIP=PIP \ + # Build client.cc and install Python client and decoder bindings \ + TFDIR=/DeepSpeech/tensorflow \ + # Allow Python printing utf-8 \ + PYTHONIOENCODING=UTF-8 \ + # Other Parameters \ + CC_OPT_FLAGS="-mavx -mavx2 -msse4.1 -msse4.2 -mfma" \ + TF_NEED_GCP=0 \ + TF_NEED_HDFS=0 \ + TF_NEED_JEMALLOC=1 \ + TF_NEED_OPENCL=0 \ + TF_CUDA_CLANG=0 \ + TF_NEED_MKL=0 \ + TF_ENABLE_XLA=0 \ + TF_NEED_AWS=0 \ + TF_NEED_KAFKA=0 \ + TF_NEED_NGRAPH=0 \ + TF_DOWNLOAD_CLANG=0 \ + TF_NEED_TENSORRT=0 \ + TF_NEED_GDR=0 \ + TF_NEED_VERBS=0 \ + TF_NEED_OPENCL_SYCL=0 \ + PYTHON_BIN_PATH=/usr/bin/python3.6 \ + PYTHON_LIB_PATH=/usr/local/lib/python3.6/dist-packages # << END Configure Tensorflow Build @@ -103,29 +100,23 @@ ENV PYTHON_LIB_PATH /usr/local/lib/python3.6/dist-packages # Running bazel inside a `docker build` command causes trouble, cf: # https://github.com/bazelbuild/bazel/issues/134 # The easiest solution is to set up a bazelrc file forcing --batch. 
-RUN echo "startup --batch" >>/etc/bazel.bazelrc # Similarly, we need to workaround sandboxing issues: # https://github.com/bazelbuild/bazel/issues/418 -RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ - >>/etc/bazel.bazelrc +RUN echo "startup --batch" >>/etc/bazel.bazelrc && \ + echo "build --spawn_strategy=standalone --genrule_strategy=standalone" >> /etc/bazel.bazelrc # << END Configure Bazel WORKDIR / -RUN git clone --recursive $DEEPSPEECH_REPO DeepSpeech -WORKDIR /DeepSpeech -RUN git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA -RUN git submodule sync tensorflow/ && git submodule update --init tensorflow/ -RUN git submodule sync kenlm/ && git submodule update --init kenlm/ +RUN git clone --recursive $DEEPSPEECH_REPO DeepSpeech && \ + cd /DeepSpeech && \ + git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA && \ + git submodule sync tensorflow/ && git submodule update --init tensorflow/ && \ + git submodule sync kenlm/ && git submodule update --init kenlm/ # >> START Build and bind - -WORKDIR /DeepSpeech/tensorflow - # Fix for not found script https://github.com/tensorflow/tensorflow/issues/471 -RUN ./configure - # Using CPU optimizations: # -mtune=generic -march=x86-64 -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx. # Adding --config=cuda flag to build using CUDA. 
@@ -133,7 +124,7 @@ RUN ./configure # passing LD_LIBRARY_PATH is required cause Bazel doesn't pickup it from environment # Build DeepSpeech -RUN bazel build \ +RUN cd /DeepSpeech/tensorflow && ./configure && bazel build \ --workspace_status_command="bash native_client/bazel_workspace_status_cmd.sh" \ --config=monolithic \ --config=cuda \ @@ -151,36 +142,22 @@ RUN bazel build \ --copt=-fvisibility=hidden \ //native_client:libdeepspeech.so \ --verbose_failures \ - --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} + --action_env=LD_LIBRARY_PATH=${LD_LIBRARY_PATH} && \ + cp bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ && \ + rm -fr /root/.cache/* -# Copy built libs to /DeepSpeech/native_client -RUN cp bazel-bin/native_client/libdeepspeech.so /DeepSpeech/native_client/ - -# Build client.cc and install Python client and decoder bindings -ENV TFDIR /DeepSpeech/tensorflow - -RUN nproc - -WORKDIR /DeepSpeech/native_client -RUN make NUM_PROCESSES=$(nproc) deepspeech - -WORKDIR /DeepSpeech -RUN cd native_client/python && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/python/dist/*.whl - -RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl +RUN cd /DeepSpeech/native_client && make NUM_PROCESSES=$(nproc) deepspeech && \ + cd /DeepSpeech/native_client/python && make NUM_PROCESSES=$(nproc) bindings && \ + pip3 install --upgrade dist/*.whl && \ + cd /DeepSpeech/native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings && \ + pip3 install --upgrade dist/*.whl # << END Build and bind -# Allow Python printing utf-8 -ENV PYTHONIOENCODING UTF-8 - # Build KenLM in /DeepSpeech/kenlm folder WORKDIR /DeepSpeech/kenlm -RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj -RUN ls -hal -RUN mkdir -p build && \ +RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj && \ + mkdir -p build 
&& \ cd build && \ EIGEN3_ROOT=/DeepSpeech/kenlm/eigen-3.3.8 cmake .. && \ make -j $(nproc) diff --git a/Dockerfile.train.tmpl b/Dockerfile.train.tmpl index 9332ad74b6..c73dcaf6ca 100644 --- a/Dockerfile.train.tmpl +++ b/Dockerfile.train.tmpl @@ -1,10 +1,9 @@ # Please refer to the TRAINING documentation, "Basic Dockerfile for training" FROM tensorflow/tensorflow:1.15.4-gpu-py3 -ENV DEBIAN_FRONTEND=noninteractive - -ENV DEEPSPEECH_REPO=#DEEPSPEECH_REPO# -ENV DEEPSPEECH_SHA=#DEEPSPEECH_SHA# +ENV DEBIAN_FRONTEND=noninteractive \ + DEEPSPEECH_REPO=#DEEPSPEECH_REPO# \ + DEEPSPEECH_SHA=#DEEPSPEECH_SHA# RUN apt-get update && apt-get install -y --no-install-recommends \ apt-utils \ @@ -20,48 +19,39 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3-venv \ unzip \ xz-utils \ - wget - -# We need to remove it because it's breaking deepspeech install later with -# weird errors about setuptools -RUN apt-get purge -y python3-xdg - -# Install dependencies for audio augmentation -RUN apt-get install -y --no-install-recommends libopus0 libsndfile1 - -# Try and free some space -RUN rm -rf /var/lib/apt/lists/* + wget && \ + # We need to remove it because it's breaking deepspeech install later with \ + # weird errors about setuptools \ + apt-get purge -y python3-xdg && \ + # Install dependencies for audio augmentation \ + apt-get install -y --no-install-recommends libopus0 libsndfile1 && \ + # Try and free some space \ + rm -rf /var/lib/apt/lists/* WORKDIR / -RUN git clone $DEEPSPEECH_REPO DeepSpeech - -WORKDIR /DeepSpeech -RUN git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA -RUN git submodule sync kenlm/ && git submodule update --init kenlm/ +RUN git clone $DEEPSPEECH_REPO DeepSpeech && \ + cd /DeepSpeech && git fetch origin $DEEPSPEECH_SHA && git checkout $DEEPSPEECH_SHA && \ + git submodule sync kenlm/ && git submodule update --init kenlm/ # Build CTC decoder first, to avoid clashes on incompatible versions upgrades -RUN cd 
native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl +RUN cd /DeepSpeech/native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings && \ + pip3 install --upgrade dist/*.whl # Prepare deps -RUN pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0 - -# Install DeepSpeech -# - No need for the decoder since we did it earlier -# - There is already correct TensorFlow GPU installed on the base image, -# we don't want to break that -RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . - -# Tool to convert output graph for inference -RUN curl -vsSL https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/linux.amd64.convert_graphdef_memmapped_format.xz | xz -d > convert_graphdef_memmapped_format - -RUN chmod +x convert_graphdef_memmapped_format +RUN cd /DeepSpeech && pip3 install --upgrade pip==20.2.2 wheel==0.34.2 setuptools==49.6.0 && \ + # Install DeepSpeech \ + # - No need for the decoder since we did it earlier \ + # - There is already correct TensorFlow GPU installed on the base image, \ + # we don't want to break that \ + DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . && \ + # Tool to convert output graph for inference \ + curl -vsSL https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/linux.amd64.convert_graphdef_memmapped_format.xz | xz -d > convert_graphdef_memmapped_format && \ + chmod +x convert_graphdef_memmapped_format # Build KenLM to generate new scorers WORKDIR /DeepSpeech/kenlm -RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj -RUN ls -hal -RUN mkdir -p build && \ +RUN wget -O - https://gitlab.com/libeigen/eigen/-/archive/3.3.8/eigen-3.3.8.tar.bz2 | tar xj && \ + mkdir -p build && \ cd build && \ EIGEN3_ROOT=/DeepSpeech/kenlm/eigen-3.3.8 cmake .. 
&& \ make -j $(nproc) diff --git a/Makefile b/Makefile index 2d28d24b94..c845cde1e3 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ DEEPSPEECH_REPO ?= https://github.com/mozilla/DeepSpeech.git -DEEPSPEECH_SHA ?= origin/master +DEEPSPEECH_SHA ?= master Dockerfile%: Dockerfile%.tmpl sed \