Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docker/install: build Tesseract from source #197

Merged
merged 13 commits into from
Feb 14, 2024
44 changes: 33 additions & 11 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,56 @@ orbs:

jobs:

build-python:
test-python:
parameters:
python-version:
type: string
docker:
- image: cimg/python:<< parameters.python-version >>
environment:
# cimg/python uses pyenv instead of venv
VIRTUAL_ENV: ${PYENV_ROOT}
steps:
- checkout
- run: git submodule sync && git submodule update --init
- run: sudo make deps-ubuntu
- when:
condition:
equal: [ '3.6', << parameters.python-version >> ]
steps:
# speed-up build time for end-of-life Python by holding at latest binary:
- run: pip install --prefer-binary -U opencv-python-headless numpy
- run: make install-tesseract
- run: make install-tesserocr
- run: make install
# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
- run: sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`
- run: ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
- run: ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
- run: make test-cli
- run: make coverage
- codecov/upload

deploy-docker:
docker:
- image: circleci/buildpack-deps:stretch
environment:
DOCKER_TAG: ocrd/tesserocr
steps:
- checkout
- setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/
docker_layer_caching: true
- run: make docker DOCKER_TAG=$DOCKER_TAG
- run:
name: Login to Docker Hub
command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin
- run: docker push $DOCKER_TAG


workflows:
build:
jobs:
- build-python:
- test-python:
matrix:
parameters:
python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
python-version: ['3.7', '3.8', '3.9', '3.10']
deploy:
jobs:
- deploy-docker:
filters:
branches:
only: master
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
!requirements_test.txt
!LICENSE
!README.md
!repo/tesserocr
!repo/tesseract

# avoid .git and __pycache__ etc:
!ocrd_tesserocr/**/*.py
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ test-workspace
/.coverage
/htmlcov
/.cache
build_tesseract
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[submodule "repo/tesserocr"]
path = repo/tesserocr
url = https://github.com/sirfz/tesserocr/
[submodule "repo/tesseract"]
path = repo/tesseract
url = https://github.com/tesseract-ocr/tesseract
50 changes: 40 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
FROM ocrd/core
FROM ocrd/core:v2.62.0 AS base
# set proper locales
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# install ocrd-tesserocr (until here commands for installing tesseract-ocr)
ARG VCS_REF
ARG BUILD_DATE
LABEL \
Expand All @@ -7,32 +11,58 @@ LABEL \
org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_tesserocr" \
org.label-schema.build-date=$BUILD_DATE

ENV DEBIAN_FRONTEND noninteractive
ENV PYTHONIOENCODING utf8

# set frontend non-interactive to silence interactive tzdata config
ARG DEBIAN_FRONTEND=noninteractive

# set proper date and timezone in container
RUN echo "Europe/Berlin" > /etc/timezone
RUN ln -sf /usr/share/zoneinfo/Europe/Berlin /etc/localtime
RUN dpkg-reconfigure -f noninteractive tzdata

# diagnostic output - check timezone settings
# RUN cat /etc/timezone

# avoid HOME/.local/share (hard to predict USER here)
# so let XDG_DATA_HOME coincide with fixed system location
# (can still be overridden by derived stages)
ENV XDG_DATA_HOME /usr/local/share
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
ENV TESSDATA_PREFIX $XDG_DATA_HOME/tessdata

WORKDIR /build-ocrd
WORKDIR /build
COPY setup.py .
COPY ocrd_tesserocr/ocrd-tool.json .
COPY README.md .
COPY requirements.txt .
COPY requirements_test.txt .
COPY ocrd_tesserocr ./ocrd_tesserocr
COPY repo/tesserocr ./repo/tesserocr
COPY repo/tesseract ./repo/tesseract
COPY Makefile .
RUN make deps-ubuntu && \
bertsky marked this conversation as resolved.
Show resolved Hide resolved
apt-get install -y --no-install-recommends \
g++ \
&& make deps install \
&& rm -rf /build-ocrd \
RUN make deps-ubuntu deps install-tesseract install-tesseract-training install-tesserocr install \
&& rm -rf /build \
&& apt-get -y remove --auto-remove g++ libtesseract-dev make
# PPA tessdata prefix (= ocrd_tesserocr moduledir) is owned by root
RUN sudo chmod go+w `dpkg-query -L tesseract-ocr-eng | sed -n s,/eng.traineddata,,p`

RUN ocrd resmgr download ocrd-tesserocr-recognize Fraktur.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize deu.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize eng.traineddata
bertsky marked this conversation as resolved.
Show resolved Hide resolved
RUN ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
RUN ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata

# as discussed in ocrd_all#378, we do not want to manage more than one resource location
# to mount for model persistence;
# with named volumes, the preinstalled models will be copied to the host and complemented
# by downloaded models;
# tessdata is the only problematic module location
RUN mkdir -p $XDG_CONFIG_HOME
RUN mv $TESSDATA_PREFIX $XDG_CONFIG_HOME/ocrd-tesserocr-recognize
RUN ln -s $XDG_CONFIG_HOME/ocrd-tesserocr-recognize $TESSDATA_PREFIX
# finally, alias/symlink all ocrd-resources to /models for shorter mount commands
RUN mv $XDG_CONFIG_HOME /models && ln -s /models $XDG_CONFIG_HOME


# finally, alias/symlink all ocrd-resources to /models for shorter mount commands
WORKDIR /data
VOLUME /data
130 changes: 90 additions & 40 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,67 +7,82 @@ LOG_LEVEL = INFO
PYTHONIOENCODING=utf8
LC_ALL = C.UTF-8
LANG = C.UTF-8
export
# Installation prefix for the Tesseract build:
# /usr/local unless VIRTUAL_ENV has a non-empty value, in which case
# Tesseract is installed into that environment.
ifndef VIRTUAL_ENV
TESSERACT_PREFIX = /usr/local
else
TESSERACT_PREFIX = $(VIRTUAL_ENV)
endif

# Put the pkg-config directory under TESSERACT_PREFIX first on the search
# path, keeping any pre-existing PKG_CONFIG_PATH entries behind it
# (and avoiding a dangling ':' when there were none).
ifneq ($(PKG_CONFIG_PATH),)
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig:$(PKG_CONFIG_PATH)
else
PKG_CONFIG_PATH := $(TESSERACT_PREFIX)/lib/pkgconfig
endif
export PKG_CONFIG_PATH

export

# pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'
PYTEST_ARGS =

# Docker container tag
DOCKER_TAG = 'ocrd/tesserocr'

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
@echo ""
@echo " Targets"
@echo ""
@echo " deps-ubuntu Dependencies for deployment in an ubuntu/debian linux"
@echo " (lib*-dev merely for building tesserocr with pip)"
@echo " (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,"
@echo " which is unsupported. Add the tesseract-ocr PPA"
@echo " from Alexander Pozdnyakov which provides 4.1.0."
@echo " See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr"
@echo " for details.)"
@echo " deps Install Python deps for install via pip"
@echo " deps-test Install Python deps for test via pip"
@echo " docker Build docker image"
@echo " install Install this package"
@echo " test Run unit tests"
@echo " coverage Run unit tests and determine test coverage"
@echo " test-cli Test the command line tools"
@echo " test/assets Setup test assets"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " assets-clean Remove symlinks in test/assets"
@echo " deps-ubuntu Install system dependencies in an Ubuntu/Debian Linux"
@echo " install-tesseract Compile and install Tesseract"
@echo " install-tesseract-training Compile and install training utilities for Tesseract"
@echo " install-tesserocr Compile and install Tesserocr"
@echo " deps Install Python dependencies for install via pip"
@echo " install Install this package via pip"
@echo " deps-test Install Python deps for test via pip"
@echo " test Run unit tests"
@echo " coverage Run unit tests and determine test coverage"
@echo " test-cli Test the command line tools"
@echo " test/assets Setup test assets"
@echo " repo/assets Clone OCR-D/assets to ./repo/assets"
@echo " repo/tesseract Checkout Tesseract ./repo/tesseract"
@echo " repo/tesserocr Checkout Tesserocr to ./repo/tesserocr"
@echo " docker Build docker image"
@echo " assets-clean Remove symlinks in test/assets"
@echo ""
@echo " Variables"
@echo ""
@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. Default: '$(PYTEST_ARGS)'"
@echo " DOCKER_TAG Docker container tag"
@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default)"

# END-EVAL

# Dependencies for deployment in an ubuntu/debian linux
# (lib*-dev merely for building tesserocr with pip)
# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
# which is unsupported. Add the tesseract-ocr PPA
# from Alexander Pozdnyakov which provides 4.1.0.
# See https://launchpad.net/~alex-p/+archive/ubuntu/tesseract-ocr
# for details.)
@echo " PYTEST_ARGS pytest args. Set to '-s' to see log output during test execution, '--verbose' to see individual tests. [$(PYTEST_ARGS)]"
@echo " DOCKER_TAG Docker container tag [$(DOCKER_TAG)]"
@echo " TESSDATA_PREFIX search path for recognition models (overriding Tesseract compile-time default) [$(TESSDATA_PREFIX)]"

# Dependencies for deployment in an Ubuntu/Debian Linux
# (lib*-dev merely for building Tesseract and tesserocr from sources)
deps-ubuntu:
apt-get install -y --no-install-recommends software-properties-common
-add-apt-repository -u -y ppa:alex-p/tesseract-ocr
apt-get install -y \
apt-get update && apt-get install -y --no-install-recommends \
apt-utils \
build-essential \
g++ \
git \
python3 \
python3-pip \
libtesseract-dev \
libjpeg-dev \
libgif-dev \
libwebp-dev \
libopenjp2-7-dev \
libpng-dev \
libtiff-dev \
libtool \
pkg-config \
tzdata \
xzgv \
zlib1g-dev \
libleptonica-dev \
tesseract-ocr-eng \
tesseract-ocr-script-frak \
tesseract-ocr
libpango1.0-dev \
libicu-dev \
autotools-dev \
automake \
libcurl4-nss-dev \
libarchive-dev

# Install Python deps for install via pip
deps:
Expand All @@ -85,6 +100,35 @@ docker:
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
-t $(DOCKER_TAG) .

# These three targets are command aliases, not files: declare them phony so
# that a stray file of the same name cannot make them look up to date.
.PHONY: install-tesserocr install-tesseract install-tesseract-training

# Compile and install Tesserocr from the submodule checkout via pip
install-tesserocr: repo/tesserocr
	$(PIP) install ./$<

# Compile and install Tesseract (alias for the installed binary)
install-tesseract: $(TESSERACT_PREFIX)/bin/tesseract

# Compile and install training utilities for Tesseract (alias for the installed binary)
install-tesseract-training: $(TESSERACT_PREFIX)/bin/lstmtraining

# Build and install Tesseract from the configured build_tesseract tree.
# When the prefix lies below /usr, ldconfig is run afterwards.
# The prefix test uses POSIX `case` rather than the bash-only `[[ ... ]]`,
# so the recipe also works when make runs recipes with /bin/sh (e.g. dash).
$(TESSERACT_PREFIX)/bin/tesseract: build_tesseract/Makefile
	$(MAKE) -C build_tesseract install
	case "$(TESSERACT_PREFIX)" in /usr*) ldconfig ;; esac

# Install the Tesseract training tools by running the `training-install`
# goal in the directory of the configured build Makefile (build_tesseract).
$(TESSERACT_PREFIX)/bin/lstmtraining: build_tesseract/Makefile
	$(MAKE) -C $(<D) training-install

# Configure Tesseract out-of-tree: create the build directory (the target's
# directory part) and run the submodule's configure script from inside it,
# installing to TESSERACT_PREFIX, without OpenMP and without shared libraries.
build_tesseract/Makefile: repo/tesseract/Makefile.in
	mkdir -p $(@D)
	cd $(@D) && $(CURDIR)/$(<D)/configure \
		--prefix=$(TESSERACT_PREFIX) \
		--disable-openmp \
		--disable-shared \
		'CXXFLAGS=-g -O2 -fno-math-errno -Wall -Wextra -Wpedantic -fPIC'

# Run autogen.sh inside the Tesseract checkout (the rule's target is the
# resulting Makefile.in).
# `&&` (not `;`) ensures autogen.sh is never run from the wrong directory
# if the checkout is missing and `cd` fails.
repo/tesseract/Makefile.in: repo/tesseract
	cd $< && ./autogen.sh

# Check out a git submodule (applies independently to each listed path):
# sync its URL configuration, then initialize and update it.
repo/tesserocr repo/tesseract:
	git submodule sync $@ && git submodule update --init $@

# Install this package
install: deps
$(PIP) install .
Expand Down Expand Up @@ -135,6 +179,12 @@ repo/assets:
mkdir -p $(dir $@)
git clone https://github.com/OCR-D/assets "$@"

.PHONY: clean tesseract-clean
# Remove test asset symlinks and all Tesseract build products
clean: assets-clean tesseract-clean

# Remove the out-of-tree Tesseract build directory; additionally run
# `distclean` inside the checkout, but only if it actually contains a
# Makefile (an out-of-tree configure leaves none there, and an unguarded
# call would make `clean` fail). Uses $(MAKE) so that flags like -n/-j
# propagate to the sub-make.
tesseract-clean:
	rm -rf $(CURDIR)/build_tesseract
	if [ -f repo/tesseract/Makefile ]; then $(MAKE) -C repo/tesseract distclean; fi

.PHONY: assets-clean
# Remove symlinks in test/assets
Expand Down
1 change: 1 addition & 0 deletions repo/tesseract
Submodule tesseract added at 8ee020
1 change: 1 addition & 0 deletions repo/tesserocr
Submodule tesserocr added at 1f960e