Skip to content

Commit

Permalink
make example Dockerfile friendlier - use prebuilt artifacts
Browse files Browse the repository at this point in the history
  • Loading branch information
Daulet Zhanguzin committed May 5, 2023
1 parent 6f28395 commit 12abe54
Show file tree
Hide file tree
Showing 8 changed files with 67 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
libtokenizers.a
/release
/artifacts
18 changes: 9 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ build-example:

release-darwin-arm64:
cd lib && cargo build --release --target aarch64-apple-darwin
mkdir -p release/darwin-arm64
cp lib/target/aarch64-apple-darwin/release/libtokenizers.a release/darwin-arm64/libtokenizers.a
cd release/darwin-arm64 && \
mkdir -p artifacts/darwin-arm64
cp lib/target/aarch64-apple-darwin/release/libtokenizers.a artifacts/darwin-arm64/libtokenizers.a
cd artifacts/darwin-arm64 && \
tar -czf libtokenizers.darwin-arm64.tar.gz libtokenizers.a
mkdir -p release/artifacts
cp release/darwin-arm64/libtokenizers.darwin-arm64.tar.gz release/artifacts/libtokenizers.darwin-arm64.tar.gz
mkdir -p artifacts/all
cp artifacts/darwin-arm64/libtokenizers.darwin-arm64.tar.gz artifacts/all/libtokenizers.darwin-arm64.tar.gz

release-linux-%:
docker buildx build --platform linux/$* -f example/Dockerfile . -t tokenizers.linux-$*
mkdir -p release/linux-$*
mkdir -p artifacts/linux-$*
docker run -v $(PWD)/release/linux-$*:/mnt --entrypoint cp tokenizers.linux-$* /workspace/libtokenizers.a /mnt/libtokenizers.a
cd release/linux-$* && \
cd artifacts/linux-$* && \
tar -czf libtokenizers.linux-$*.tar.gz libtokenizers.a
mkdir -p release/artifacts
cp release/linux-$*/libtokenizers.linux-$*.tar.gz release/artifacts/libtokenizers.linux-$*.tar.gz
mkdir -p artifacts/all
cp artifacts/linux-$*/libtokenizers.linux-$*.tar.gz artifacts/all/libtokenizers.linux-$*.tar.gz

release: release-darwin-arm64 release-linux-amd64 release-linux-arm64 release-linux-x86_64

Expand Down
14 changes: 6 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,21 @@ Go bindings for the [HuggingFace Tokenizers](https://github.com/huggingface/toke

## Installation

`make build` or see [./example/Dockerfile](Dockerfile) for deployment example.
Run `make build` to build `libtokenizers.a`, which is required to run any application that uses these bindings.

### Using pre-built binaries

Build your Go application using pre-built native binaries: `docker build --platform=linux/amd64 -f example/Dockerfile .`

Available binaries:
* [darwin-arm64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.darwin-arm64.tar.gz)
* [linux-arm64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.linux-arm64.tar.gz)
* [linux-amd64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.linux-amd64.tar.gz)

## Getting started

TLDR: [working example](example/main.go).

Load a tokenizer from a JSON config:
```go
import "github.com/daulet/tokenizers"
Expand Down Expand Up @@ -52,10 +57,3 @@ BenchmarkDecodeNTokens-10 65191378 211.0 ns/op 7 B/op 0 a
PASS
ok github.com/daulet/tokenizers 126.681s
```

## Release

```bash
make release-linux-arm64
make release-linux-amd64
```
10 changes: 1 addition & 9 deletions example/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
# syntax=docker/dockerfile:1.3

FROM rust:1.69 as builder-rust
ARG TARGETPLATFORM
WORKDIR /workspace
COPY ./lib .
RUN --mount=type=cache,target=/usr/local/cargo/registry,id=${TARGETPLATFORM} \
--mount=type=cache,target=/root/target,id=${TARGETPLATFORM} \
cargo build --release

FROM golang:1.19 as builder-go
ARG TARGETPLATFORM
WORKDIR /workspace
COPY --from=builder-rust /workspace/target/release/libtokenizers.a .
RUN curl -fsSL https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.$(echo ${TARGETPLATFORM} | tr / -).tar.gz | tar xvz
COPY ./example .
COPY ./test/data ./test/data
RUN --mount=type=cache,target=/root/.cache/go-build \
Expand Down
19 changes: 19 additions & 0 deletions release/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# syntax=docker/dockerfile:1.3

# Stage 1 (builder-rust): compile the Rust library from ./lib in release mode.
# TARGETPLATFORM is used as the cache id so that each target platform gets its
# own cache mounts.
# NOTE(review): the second cache mount targets /root/target, but the stage
# below copies the build output from /workspace/target — confirm that this
# mount is actually used by cargo.
FROM rust:1.69 as builder-rust
ARG TARGETPLATFORM
WORKDIR /workspace
COPY ./lib .
RUN --mount=type=cache,target=/usr/local/cargo/registry,id=${TARGETPLATFORM} \
--mount=type=cache,target=/root/target,id=${TARGETPLATFORM} \
cargo build --release

# Stage 2 (builder-go): copy the static library produced by stage 1 next to
# the Go program from ./release and the tokenizer test data, then run main.go
# with cgo enabled as part of the image build.
FROM golang:1.19 as builder-go
ARG TARGETPLATFORM
WORKDIR /workspace
COPY --from=builder-rust /workspace/target/release/libtokenizers.a .
COPY ./release .
COPY ./test/data ./test/data
# Go build caches are mounted so repeated image builds can reuse them; the
# /var/cache/go mount is keyed per target platform.
RUN --mount=type=cache,target=/root/.cache/go-build \
--mount=type=cache,target=/var/cache/go,id=${TARGETPLATFORM} \
CGO_ENABLED=1 CGO_LDFLAGS="-Wl,--copy-dt-needed-entries" go run main.go
5 changes: 5 additions & 0 deletions release/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module github.com/daulet/tokenizers/example

go 1.20

require github.com/daulet/tokenizers v0.2.1 // indirect
2 changes: 2 additions & 0 deletions release/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
github.com/daulet/tokenizers v0.2.1 h1:Sb7gfk8N1yIWFCwG6wu5SEo8MsG+Onm7ejIoZe0ZkNg=
github.com/daulet/tokenizers v0.2.1/go.mod h1:tGnMdZthXdcWY6DGD07IygpwJqiPvG85FQUnhs/wSCs=
24 changes: 24 additions & 0 deletions release/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package main

import (
"fmt"

"github.com/daulet/tokenizers"
)

// main loads the tokenizer described by the bert-base-uncased JSON config in
// the test data directory and prints its vocabulary size, two encodings of a
// sample sentence, and the text decoded from a fixed list of token IDs.
// It panics if the tokenizer cannot be loaded.
func main() {
	const sentence = "brown fox jumps over the lazy dog"

	tk, err := tokenizers.FromFile("./test/data/bert-base-uncased.json")
	if err != nil {
		panic(err)
	}
	// The tokenizer holds native resources; free them when main returns.
	defer tk.Close()

	// Expected output: Vocab size: 30522
	fmt.Println("Vocab size:", tk.VocabSize())

	// Expected output: [2829 4419 14523 2058 1996 13971 3899]
	fmt.Println(tk.Encode(sentence, false))

	// Expected output: [101 2829 4419 14523 2058 1996 13971 3899 102]
	// (passing true adds the leading 101 and trailing 102 IDs).
	fmt.Println(tk.Encode(sentence, true))

	// Expected output: brown fox jumps over the lazy dog
	ids := []uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}
	fmt.Println(tk.Decode(ids, true))
}

0 comments on commit 12abe54

Please sign in to comment.