Skip to content

Commit

Permalink
Merge pull request #197 from joschrew/dockerfile-update
Browse files Browse the repository at this point in the history
docker/install: build Tesseract (and training utils) from source, add CD
  • Loading branch information
bertsky authored Feb 14, 2024
2 parents f1036e3 + dab081e commit bf29777
Show file tree
Hide file tree
Showing 8 changed files with 174 additions and 61 deletions.
44 changes: 33 additions & 11 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,56 @@ orbs:

jobs:

build-python:
test-python:
parameters:
python-version:
type: string
docker:
- image: cimg/python:<< parameters.python-version >>
environment:
# cimg/python uses pyenv instead of venv
VIRTUAL_ENV: ${PYENV_ROOT}
steps:
- checkout
- run: git submodule sync && git submodule update --init
- run: sudo make deps-ubuntu
- when:
condition:
equal: [ '3.6', << parameters.python-version >> ]
steps:
# speed-up build time for end-of-life Python by holding at latest binary:
- run: pip install --prefer-binary -U opencv-python-headless numpy
- run: make install-tesseract
- run: make install-tesserocr
- run: make install
# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
- run: sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
- run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
- run: make test-cli
- run: make coverage
- codecov/upload

deploy-docker:
  docker:
    - image: circleci/buildpack-deps:stretch
  environment:
    DOCKER_TAG: ocrd/tesserocr
  steps:
    - checkout
    # The image build copies repo/tesseract and repo/tesserocr into the
    # build context, and a plain checkout leaves submodules unpopulated,
    # so fetch them first (same command as in the test job).
    - run: git submodule sync && git submodule update --init
    - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/
        docker_layer_caching: true
    - run: make docker DOCKER_TAG=$DOCKER_TAG
    - run:
        name: Login to Docker Hub
        command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin
    - run: docker push $DOCKER_TAG


workflows:
build:
jobs:
- build-python:
- test-python:
matrix:
parameters:
python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
python-version: ['3.7', '3.8', '3.9', '3.10']
deploy:
jobs:
- deploy-docker:
filters:
branches:
only: master
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
!requirements_test.txt
!LICENSE
!README.md
!repo/tesserocr
!repo/tesseract

# avoid .git and __pycache__ etc:
!ocrd_tesserocr/**/*.py
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ test-workspace
/.coverage
/htmlcov
/.cache
build_tesseract
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Upstream sources that the Makefile builds from ./repo/
# (see the install-tesserocr and build_tesseract rules)
[submodule "repo/tesserocr"]
path = repo/tesserocr
url = https://github.com/sirfz/tesserocr/
[submodule "repo/tesseract"]
path = repo/tesseract
url = https://github.com/tesseract-ocr/tesseract
50 changes: 40 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
FROM ocrd/core
FROM ocrd/core:v2.62.0 AS base
# set proper locales
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# install ocrd-tesserocr (until here commands for installing tesseract-ocr)
ARG VCS_REF
ARG BUILD_DATE
LABEL \
Expand All @@ -7,32 +11,58 @@ LABEL \
org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
org.label-schema.build-date=$BUILD_DATE

ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8

# set frontend non-interactive to silence interactive tzdata config
ARG DEBIAN_FRONTEND=noninteractive

# set proper date and timezone in container
RUN echo "Europe/Berlin" > /etc/timezone
RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime
RUN dpkg-reconfigure -f noninteractive tzdata

# diagnostic output - check timezone settings
# RUN cat /etc/timezone

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
ENV TESSDATA_PREFIX $XDG_DATA_HOME/tessdata

WORKDIR /build-ocrd
WORKDIR /build
COPY setup.py .
COPY ocrd_tesserocr/ocrd-tool.json .
COPY README.md .
COPY requirements.txt .
COPY requirements_test.txt .
COPY ocrd_tesserocr ./ocrd_tesserocr
COPY repo/tesserocr ./repo/tesserocr
COPY repo/tesseract ./repo/tesseract
COPY Makefile .
RUN make deps-ubuntu && \
apt-get install -y --no-install-recommends \
g++ \
&& make deps install \
&& rm -rf /build-ocrd \
RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \
&& rm -rf /build \
&& apt-get -y remove --auto-remove g++ libtesseract-dev make
# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`

RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata

# as discussed in ocrd_all#378, we do not want to manage more than one resource location
# to mount for model persistence;
# with named volumes, the preinstalled models will be copied to the host and complemented
# by downloaded models;
# tessdata is the only problematic module location
RUN mkdir -p $XDG_CONFIG_HOME
RUN mv $TESSDATA_PREFIX $XDG_CONFIG_HOME/ocrd-tesserocr-recognize
RUN ln -s $XDG_CONFIG_HOME/ocrd-tesserocr-recognize $TESSDATA_PREFIX
# finally, alias/symlink all ocrd-resources to /models for shorter mount commands
RUN mv $XDG_CONFIG_HOME /models && ln -s /models $XDG_CONFIG_HOME


# run in /data by default and declare it as a volume
WORKDIR /data
VOLUME /data
130 changes: 90 additions & 40 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,67 +7,82 @@ LOG_LEVEL = INFO
PYTHONIOENCODING=utf8
LC_ALL = C.UTF-8
LANG = C.UTF-8
export
# Installation prefix for the Tesseract build:
# the directory named by VIRTUAL_ENV when that is set, /usr/local otherwise
ifndef VIRTUAL_ENV
TESSERACT_PREFIX = /usr/local
else
TESSERACT_PREFIX = $(VIRTUAL_ENV)
endif

# Put the pkg-config directory of that prefix first on the search path,
# keeping any entries that were already configured
ifneq ($(PKG_CONFIG_PATH),)
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig:$(PKG_CONFIG_PATH)
else
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig
endif
export PKG_CONFIG_PATH

export

# pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'
PYTEST_ARGS =

# Docker container tag
DOCKER_TAG = 'ocrd/tesserocr'

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
@echo ""
@echo " Targets"
@echo ""
@echo " deps-ubuntu Dependencies for deployment in an ubuntu/debian linux"
@echo " (lib*-dev merely for building tesserocr with pip)"
@echo " (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,"
@echo " which is unsupported. Add the tesseract-ocr PPA"
@echo " from Alexander Pozdnyakov which provides 4.1.0."
@echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr"
@echo " for details.)"
@echo " deps Install Python deps for install via pip"
@echo " deps-test Install Python deps for test via pip"
@echo " docker Build docker image"
@echo " install Install this package"
@echo " test Run unit tests"
@echo " coverage Run unit tests and determine test coverage"
@echo " test-cli Test the command line tools"
@echo " test/assets Setup test assets"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " assets-clean Remove symlinks in test/assets"
@echo " deps-ubuntu Install system dependencies in an Ubuntu/Debian Linux"
@echo " install-tesseract Compile and install Tesseract"
@echo " install-tesseract-training Compile and install training utilities for Tesseract"
@echo " install-tesserocr Compile and install Tesserocr"
@echo " deps Install Python dependencies for install via pip"
@echo " install Install this package via pip"
@echo " deps-test Install Python deps for test via pip"
@echo " test Run unit tests"
@echo " coverage Run unit tests and determine test coverage"
@echo " test-cli Test the command line tools"
@echo " test/assets Setup test assets"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " repo/tesseract Checkout Tesseract ./repo/tesseract"
@echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr"
@echo " docker Build docker image"
@echo " assets-clean Remove symlinks in test/assets"
@echo ""
@echo " Variables"
@echo ""
@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'"
@echo " DOCKER_TAG Docker container tag"
@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default)"

# END-EVAL

# Dependencies for deployment in an ubuntu/debian linux
# (lib*-dev merely for building tesserocr with pip)
# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
# which is unsupported. Add the tesseract-ocr PPA
# from Alexander Pozdnyakov which provides 4.1.0.
# See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr
# for details.)
@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
@echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]"
@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"

# Dependencies for deployment in an Ubuntu/Debian Linux
# (lib*-dev merely for building Tesseract and tesserocr from sources)
deps-ubuntu:
apt-get install -y --no-install-recommends software-properties-common
-add-apt-repository -u -y ppa:alex-p/tesseract-ocr
apt-get install -y \
apt-get update && apt-get install -y --no-install-recommends \
apt-utils \
build-essential \
g++ \
git \
python3 \
python3-pip \
libtesseract-dev \
libjpeg-dev \
libgif-dev \
libwebp-dev \
libopenjp2-7-dev \
libpng-dev \
libtiff-dev \
libtool \
pkg-config \
tzdata \
xzgv \
zlib1g-dev \
libleptonica-dev \
tesseract-ocr-eng \
tesseract-ocr-script-frak \
tesseract-ocr
libpango1.0-dev \
libicu-dev \
autotools-dev \
automake \
libcurl4-nss-dev \
libarchive-dev

# Install Python deps for install via pip
deps:
Expand All @@ -85,6 +100,35 @@ docker:
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .

# Compile and install Tesserocr from the submodule checkout via pip.
# Declared phony: the target names a command, not a file it creates.
.PHONY: install-tesserocr
install-tesserocr: repo/tesserocr
	$(PIP) install ./$<

# Convenience aliases for the installed binaries below.
# Declared phony: they never correspond to files of their own; the real
# up-to-date checks happen on the binary targets they depend on.
.PHONY: install-tesseract install-tesseract-training
install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract

install-tesseract-training: $(TESSERACT_PREFIX)/bin/lstmtraining

# Build Tesseract in the out-of-tree build directory and install it into
# $(TESSERACT_PREFIX). For prefixes starting with /usr, ldconfig is run
# afterwards. The prefix test uses POSIX `case` rather than the bash-only
# `[[ ... ]]`, so it behaves the same whether recipes run under bash or
# under /bin/sh (where `[[` is not available and the check would be
# skipped silently).
$(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile
	$(MAKE) -C build_tesseract install
	case "$(TESSERACT_PREFIX)" in /usr*) ldconfig;; esac

# Install the Tesseract training utilities from the configured
# out-of-tree build directory (the directory holding the prerequisite)
$(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile
	$(MAKE) -C $(<D) training-install

# Configure Tesseract out-of-tree in ./build_tesseract: installation goes
# to $(TESSERACT_PREFIX), OpenMP and shared libraries are disabled, and
# the code is compiled position-independent (-fPIC)
build_tesseract/Makefile: repo/tesseract/Makefile.in
	mkdir -p build_tesseract
	cd build_tesseract && $(CURDIR)/repo/tesseract/configure \
		--prefix=$(TESSERACT_PREFIX) \
		--disable-shared \
		--disable-openmp \
		'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC'

# Generate the autotools files inside the Tesseract checkout.
# The checkout directory is an order-only prerequisite: it must exist, but
# its timestamp (which changes whenever a file is created inside it) must
# not make Makefile.in look out of date and re-trigger autogen/configure.
# `&&` ensures autogen.sh is never run from the wrong directory.
repo/tesseract/Makefile.in: | repo/tesseract
	cd $| && ./autogen.sh

# Check out a submodule: synchronise its configured URL, then fetch and
# initialise it ($@ is whichever of the two paths is being made)
repo/tesserocr repo/tesseract:
	git submodule sync $@ && \
	git submodule update --init $@

# Install this package
install: deps
$(PIP) install .
Expand Down Expand Up @@ -135,6 +179,12 @@ repo/assets:
mkdir -p $(dir $@)
git clone https://github.com/OCR-D/assets "$@"

.PHONY: clean
# Remove test asset symlinks and the Tesseract build products
clean: assets-clean tesseract-clean

.PHONY: tesseract-clean
# Remove the out-of-tree Tesseract build directory. The checkout itself
# only gets a `distclean` if it contains a Makefile (i.e. was configured
# in-tree at some point); with the out-of-tree build used here there is
# none, and an unconditional distclean would abort `make clean`.
# $(MAKE) instead of a literal `make` keeps -n and jobserver flags working.
tesseract-clean:
	rm -rf $(CURDIR)/build_tesseract
	if test -f repo/tesseract/Makefile; then $(MAKE) -C repo/tesseract distclean; fi

.PHONY: assets-clean
# Remove symlinks in test/assets
Expand Down
1 change: 1 addition & 0 deletions repo/tesseract
Submodule tesseract added at 8ee020
1 change: 1 addition & 0 deletions repo/tesserocr
Submodule tesserocr added at 1f960e

0 comments on commit bf29777

Please sign in to comment.