# Unified git diff touching Makefile, README.md, example/Dockerfile,
# release/Dockerfile and tokenizer.go. Text placed before the first
# "diff --git" header is commentary only.
#
# NOTE(review): in this copy the original line breaks have been collapsed to
# spaces, so the hunks below cannot be applied as-is. Whitespace-only changes
# (the README benchmark rows, which read the same on the "-" and "+" sides)
# cannot be told apart here.
#
# Makefile
#   * release-darwin-% and release-linux-% gain `test` as a prerequisite.
#   * release-linux-% passes --build-arg="DOCKER_TARGETPLATFORM=linux/$*" to
#     `docker buildx build` and copies the archive out of
#     /workspace/tokenizers/lib/linux/$*/ instead of /workspace/tokenizers/.
#   * release-linux-% also gains a `docker run ... --entrypoint ls` step that
#     lists /workspace/tokenizers/lib/linux.
#     NOTE(review): looks like a debugging aid -- confirm it should ship.
#   * test passes -ldflags="-extldflags '-L./'" to `go test`.
#
# README.md
#   * Adds a blank line before the binaries list and before each fenced block.
#
# example/Dockerfile
#   * Moves libtokenizers.a into lib/<TARGETPLATFORM with "/" turned into "-">/
#     under the downloaded module directory.
#     NOTE(review): no step visible here creates that directory or adds a -L
#     flag to `go run main.go`; release/Dockerfile uses lib/<os>/<arch>/
#     (slash-separated) instead -- confirm both against the full Dockerfile.
#
# release/Dockerfile
#   * The builder-go stage declares ARG DOCKER_TARGETPLATFORM in place of
#     TARGETPLATFORM, copies the archive into
#     ./tokenizers/lib/${DOCKER_TARGETPLATFORM}/ and points `go run` at that
#     directory through -extldflags '-L...'.
#
# tokenizer.go
#   * cgo LDFLAGS switch from ${SRCDIR}/libtokenizers.a to -ltokenizers, so the
#     library search path now has to be supplied by whoever builds the package
#     (as the Makefile and release/Dockerfile changes above do).
#     NOTE(review): the context line reading just `#include` has no header
#     name; presumably an angle-bracketed name was stripped from this copy --
#     verify against tokenizer.go.
diff --git a/Makefile b/Makefile index a8c19aba..cd92031a 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ build: build-example: @docker build -f ./example/Dockerfile . -t tokenizers-example -release-darwin-%: +release-darwin-%: test cargo build --release --target $*-apple-darwin mkdir -p artifacts/darwin-$* cp target/$*-apple-darwin/release/libtokenizers.a artifacts/darwin-$*/libtokenizers.a @@ -15,10 +15,11 @@ release-darwin-%: mkdir -p artifacts/all cp artifacts/darwin-$*/libtokenizers.darwin-$*.tar.gz artifacts/all/libtokenizers.darwin-$*.tar.gz -release-linux-%: - docker buildx build --platform linux/$* -f release/Dockerfile . -t tokenizers.linux-$* +release-linux-%: test + docker buildx build --platform linux/$* --build-arg="DOCKER_TARGETPLATFORM=linux/$*" -f release/Dockerfile . -t tokenizers.linux-$* mkdir -p artifacts/linux-$* - docker run -v $(PWD)/artifacts/linux-$*:/mnt --entrypoint cp tokenizers.linux-$* /workspace/tokenizers/libtokenizers.a /mnt/libtokenizers.a + docker run -v $(PWD)/artifacts/linux-$*:/mnt --entrypoint ls tokenizers.linux-$* /workspace/tokenizers/lib/linux + docker run -v $(PWD)/artifacts/linux-$*:/mnt --entrypoint cp tokenizers.linux-$* /workspace/tokenizers/lib/linux/$*/libtokenizers.a /mnt/libtokenizers.a cd artifacts/linux-$* && \ tar -czf libtokenizers.linux-$*.tar.gz libtokenizers.a mkdir -p artifacts/all @@ -30,7 +31,7 @@ release: release-darwin-aarch64 release-darwin-x86_64 release-linux-arm64 releas cp artifacts/all/libtokenizers.linux-x86_64.tar.gz artifacts/all/libtokenizers.linux-amd64.tar.gz test: build - @go test -v ./... -count=1 + @go test -ldflags="-extldflags '-L./'" -v ./... 
-count=1 clean: rm -rf libtokenizers.a target diff --git a/README.md b/README.md index 3dd601b0..a3c0e38d 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Go bindings for the [HuggingFace Tokenizers](https://github.com/huggingface/toke Build your Go application using pre-built native binaries: `docker build --platform=linux/amd64 -f example/Dockerfile .` Available binaries: + * [darwin-arm64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.darwin-arm64.tar.gz) * [linux-arm64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.linux-arm64.tar.gz) * [linux-amd64](https://github.com/daulet/tokenizers/releases/latest/download/libtokenizers.linux-amd64.tar.gz) @@ -20,6 +21,7 @@ Available binaries: TLDR: [working example](example/main.go). Load a tokenizer from a JSON config: + ```go import "github.com/daulet/tokenizers" @@ -32,6 +34,7 @@ defer tk.Close() ``` Encode text and decode tokens: + ```go fmt.Println("Vocab size:", tk.VocabSize()) // Vocab size: 30522 @@ -44,18 +47,19 @@ fmt.Println(tk.Decode([]uint32{2829, 4419, 14523, 2058, 1996, 13971, 3899}, true ``` ## Benchmarks + ```bash go test . -bench=. 
-benchmem -benchtime=10s goos: darwin goarch: arm64 pkg: github.com/daulet/tokenizers -BenchmarkEncodeNTimes-10 996556 11851 ns/op 116 B/op 6 allocs/op -BenchmarkEncodeNChars-10 1000000000 2.446 ns/op 0 B/op 0 allocs/op -BenchmarkDecodeNTimes-10 7286056 1657 ns/op 112 B/op 4 allocs/op -BenchmarkDecodeNTokens-10 65191378 211.0 ns/op 7 B/op 0 allocs/op +BenchmarkEncodeNTimes-10 996556 11851 ns/op 116 B/op 6 allocs/op +BenchmarkEncodeNChars-10 1000000000 2.446 ns/op 0 B/op 0 allocs/op +BenchmarkDecodeNTimes-10 7286056 1657 ns/op 112 B/op 4 allocs/op +BenchmarkDecodeNTokens-10 65191378 211.0 ns/op 7 B/op 0 allocs/op PASS -ok github.com/daulet/tokenizers 126.681s +ok github.com/daulet/tokenizers 126.681s ``` ## Contributing diff --git a/example/Dockerfile b/example/Dockerfile index f7f1ebc8..a2734186 100644 --- a/example/Dockerfile +++ b/example/Dockerfile @@ -8,6 +8,6 @@ RUN curl -fsSL https://github.com/daulet/tokenizers/releases/download/${VERSION} COPY ./example . COPY ./test/data ./test/data RUN go mod download -RUN mv ./libtokenizers.a /go/pkg/mod/github.com/daulet/tokenizers@${VERSION}/libtokenizers.a +RUN mv ./libtokenizers.a /go/pkg/mod/github.com/daulet/tokenizers@${VERSION}/lib/$(echo ${TARGETPLATFORM} | tr / -)/libtokenizers.a # mounting Go cache won't work since we mutate it above RUN go run main.go diff --git a/release/Dockerfile b/release/Dockerfile index ac2f8b2d..62e2729d 100644 --- a/release/Dockerfile +++ b/release/Dockerfile @@ -9,7 +9,7 @@ COPY ./Cargo.lock ./Cargo.lock RUN cargo build --release FROM golang:1.21 as builder-go -ARG TARGETPLATFORM +ARG DOCKER_TARGETPLATFORM WORKDIR /workspace COPY ./release/go.mod . COPY ./release/main.go . @@ -18,6 +18,6 @@ COPY tokenizer.go ./tokenizers/ COPY tokenizers.h ./tokenizers/ COPY --from=builder-rust \ /workspace/target/release/libtokenizers.a \ - ./tokenizers/ + ./tokenizers/lib/${DOCKER_TARGETPLATFORM}/ COPY ./test/data ./test/data -RUN go run . 
+RUN go run -ldflags="-extldflags '-L./tokenizers/lib/${DOCKER_TARGETPLATFORM}'" . diff --git a/tokenizer.go b/tokenizer.go index 8d7c9c11..87d55d48 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -3,7 +3,7 @@ package tokenizers // TODO packaging: how do we build the rust lib for distribution? /* -#cgo LDFLAGS: ${SRCDIR}/libtokenizers.a -ldl -lm -lstdc++ +#cgo LDFLAGS: -ltokenizers -ldl -lm -lstdc++ #include #include "tokenizers.h" */