diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 000000000..84da21994 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,181 @@ +# This file is autogenerated by maturin v1.7.4 +# To update, run +# +# maturin generate-ci github -m bindings/python/Cargo.toml +# +name: CI + +on: + push: + branches: + - main + - master + tags: + - '*' + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + - runner: ubuntu-latest + target: s390x + - runner: ubuntu-latest + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --manifest-path bindings/python/Cargo.toml + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --manifest-path bindings/python/Cargo.toml + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ 
matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --manifest-path bindings/python/Cargo.toml + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-12 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --manifest-path bindings/python/Cargo.toml + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist --manifest-path bindings/python/Cargo.toml + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + permissions: + # Use to sign the release artifacts + id-token: write + # Used to upload release artifacts + contents: write + # Used to generate artifact attestation + attestations: write + steps: + - uses: actions/download-artifact@v4 + - name: Generate artifact attestation + 
uses: actions/attest-build-provenance@v1 + with: + subject-path: 'wheels-*/*' + - name: Publish to PyPI + if: "startsWith(github.ref, 'refs/tags/')" + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST}} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index fbf2fa844..e0af384cd 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -164,8 +164,9 @@ jobs: with: path: ./bindings/python/dist merge-multiple: true - - name: Upload to PyPi - working-directory: ./bindings/python - run: | - pip install twine - twine upload dist/* -u __token__ -p "$PYPI_TOKEN" + # Temporary deactivation while testing abi3 CI + # - name: Upload to PyPi + # working-directory: ./bindings/python + # run: | + # pip install twine + # twine upload dist/* -u __token__ -p "$PYPI_TOKEN" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 507184c11..64f5670ad 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -16,7 +16,7 @@ jobs: runs-on: windows-latest strategy: matrix: - python: ["3.7", "3.8", "3.9", "3.10"] + python: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -72,7 +72,7 @@ jobs: - name: Install Python uses: actions/setup-python@v5 with: - python-version: 3.11 + python-version: 3.13 architecture: "x64" diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index c7b516612..acd416b9f 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -14,7 +14,7 @@ serde = { version = "1.0", features = ["rc", "derive"] } serde_json = "1.0" libc = "0.2" env_logger = "0.11" -pyo3 = { version = "0.22", features = ["py-clone"] } +pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] } numpy = "0.22" ndarray = "0.15" itertools = "0.12" @@ -24,7 +24,7 @@ path = 
"../../tokenizers" [dev-dependencies] tempfile = "3.10" -pyo3 = { version = "0.22", features = ["auto-initialize", "py-clone"] } +pyo3 = { version = "0.22", features = ["auto-initialize"] } [features] defaut = ["pyo3/extension-module"] diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index ab4ac0669..daa3f8c57 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -488,7 +488,6 @@ impl PySequenceDecoder { } } -#[derive(Clone)] pub(crate) struct CustomDecoder { inner: PyObject, } diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs index 9d7fbde74..dcad1b037 100644 --- a/bindings/python/src/encoding.rs +++ b/bindings/python/src/encoding.rs @@ -399,11 +399,11 @@ impl PyEncoding { if let Some(kwargs) = kwargs { for (key, value) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "direction" => { - let value: &str = value.extract()?; - direction = match value { + let value: String = value.extract()?; + direction = match value.as_ref() { "left" => Ok(PaddingDirection::Left), "right" => Ok(PaddingDirection::Right), other => Err(PyError(format!( diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index 891609f23..0d5c0ddcd 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -276,8 +276,8 @@ impl PyBPE { ) -> PyResult<(Self, PyModel)> { if let Some(kwargs) = kwargs { for (key, value) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "cache_capacity" => builder = builder.cache_capacity(value.extract()?), "dropout" => { if let Some(dropout) = value.extract()? 
{ @@ -581,8 +581,8 @@ impl PyWordPiece { ) -> PyResult<(Self, PyModel)> { if let Some(kwargs) = kwargs { for (key, val) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "unk_token" => { builder = builder.unk_token(val.extract()?); } diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs index 7b592690e..38041fc94 100644 --- a/bindings/python/src/normalizers.rs +++ b/bindings/python/src/normalizers.rs @@ -184,9 +184,8 @@ macro_rules! getter { let super_ = $self.as_ref(); if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer { let wrapper = norm.read().unwrap(); - if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() - { - o.$name + if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (&*wrapper) { + o.$name.clone() } else { unreachable!() } @@ -538,7 +537,7 @@ impl PyReplace { } } -#[derive(Debug, Clone)] +#[derive(Debug)] pub(crate) struct CustomNormalizer { inner: PyObject, } @@ -581,7 +580,7 @@ impl<'de> Deserialize<'de> for CustomNormalizer { } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Deserialize)] #[serde(untagged)] pub(crate) enum PyNormalizerWrapper { Custom(CustomNormalizer), diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 2453d9ac7..e58d1bee6 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -618,7 +618,6 @@ impl PyUnicodeScripts { } } -#[derive(Clone)] pub(crate) struct CustomPreTokenizer { inner: PyObject, } @@ -662,7 +661,7 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer { } } -#[derive(Clone, Deserialize)] +#[derive(Deserialize)] #[serde(untagged)] pub(crate) enum PyPreTokenizerWrapper { Custom(CustomPreTokenizer), diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 474f1e8fb..1e7520aad 100644 --- a/bindings/python/src/processors.rs +++ 
b/bindings/python/src/processors.rs @@ -313,7 +313,7 @@ impl From<PyTemplate> for Template { impl FromPyObject<'_> for PyTemplate { fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> { - if let Ok(s) = ob.extract::<&str>() { + if let Ok(s) = ob.extract::<String>() { Ok(Self( s.try_into().map_err(exceptions::PyValueError::new_err)?, )) diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index aa7019f2d..401a146ab 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -136,8 +136,8 @@ impl PyAddedToken { if let Some(kwargs) = kwargs { for (key, value) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "single_word" => token.single_word = Some(value.extract()?), "lstrip" => token.lstrip = Some(value.extract()?), "rstrip" => token.rstrip = Some(value.extract()?), @@ -159,8 +159,8 @@ impl PyAddedToken { match state.downcast_bound::<PyDict>(py) { Ok(state) => { for (key, value) in state { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "content" => self.content = value.extract()?, "single_word" => self.single_word = Some(value.extract()?), "lstrip" => self.lstrip = Some(value.extract()?), @@ -287,7 +287,7 @@ impl FromPyObject<'_> for PyArrayUnicode { } let arr = ob.as_ptr() as *mut npyffi::PyArrayObject; // SAFETY Getting all the metadata about the numpy array to check its sanity - let (type_num, elsize, alignment, data, nd, flags) = unsafe { + let (type_num, elsize, _alignment, data, nd, flags) = unsafe { let desc = (*arr).descr; ( (*desc).type_num, @@ -323,15 +323,16 @@ impl FromPyObject<'_> for PyArrayUnicode { let seq = (0..n_elem) .map(|i| { let bytes = &all_bytes[i * elsize..(i + 1) * elsize]; - let unicode = pyo3::ffi::PyUnicode_FromKindAndData( - pyo3::ffi::PyUnicode_4BYTE_KIND as _, - bytes.as_ptr() as *const _, - elsize as isize / alignment as isize, - ); - let py = ob.py(); - let obj = 
PyObject::from_owned_ptr(py, unicode); - let s = obj.downcast_bound::<PyString>(py)?; - Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned()) + Ok(std::str::from_utf8(bytes)?.to_owned()) + // let unicode = pyo3::ffi::PyUnicode_FromKindAndData( + // pyo3::ffi::PyUnicode_4BYTE_KIND as _, + // bytes.as_ptr() as *const _, + // elsize as isize / alignment as isize, + // ); + // let py = ob.py(); + // let obj = PyObject::from_owned_ptr(py, unicode); + // let s = obj.downcast_bound::<PyString>(py)?; + // Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned()) }) .collect::<PyResult<Vec<_>>>()?; @@ -736,12 +737,12 @@ impl PyTokenizer { if let Some(kwargs) = kwargs { for (key, value) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "stride" => params.stride = value.extract()?, "strategy" => { - let value: &str = value.extract()?; - params.strategy = match value { + let value: String = value.extract()?; + params.strategy = match value.as_ref() { "longest_first" => Ok(TruncationStrategy::LongestFirst), "only_first" => Ok(TruncationStrategy::OnlyFirst), "only_second" => Ok(TruncationStrategy::OnlySecond), @@ -754,8 +755,8 @@ impl PyTokenizer { }? 
} "direction" => { - let value: &str = value.extract()?; - params.direction = match value { + let value: String = value.extract()?; + params.direction = match value.as_ref() { "left" => Ok(TruncationDirection::Left), "right" => Ok(TruncationDirection::Right), _ => Err(PyError(format!( @@ -838,11 +839,11 @@ impl PyTokenizer { if let Some(kwargs) = kwargs { for (key, value) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "direction" => { - let value: &str = value.extract()?; - params.direction = match value { + let value: String = value.extract()?; + params.direction = match value.as_ref() { "left" => Ok(PaddingDirection::Left), "right" => Ok(PaddingDirection::Right), other => Err(PyError(format!( @@ -1341,7 +1342,7 @@ impl PyTokenizer { // - An iterator, to allow batching // - A string if let Ok(s) = element.downcast::() { - itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned()))) + itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned()))) } else { match element.iter() { Ok(iter) => itertools::Either::Left( diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs index d4c7e615e..45eabf0dd 100644 --- a/bindings/python/src/trainers.rs +++ b/bindings/python/src/trainers.rs @@ -313,8 +313,8 @@ impl PyBpeTrainer { let mut builder = tk::models::bpe::BpeTrainer::builder(); if let Some(kwargs) = kwargs { for (key, val) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "vocab_size" => builder = builder.vocab_size(val.extract()?), "min_frequency" => builder = builder.min_frequency(val.extract()?), "show_progress" => builder = builder.show_progress(val.extract()?), @@ -520,8 +520,8 @@ impl PyWordPieceTrainer { let mut builder = tk::models::wordpiece::WordPieceTrainer::builder(); if let Some(kwargs) = kwargs { for (key, val) in kwargs { - let key: &str = key.extract()?; - match 
key { + let key: String = key.extract()?; + match key.as_ref() { "vocab_size" => builder = builder.vocab_size(val.extract()?), "min_frequency" => builder = builder.min_frequency(val.extract()?), "show_progress" => builder = builder.show_progress(val.extract()?), @@ -661,8 +661,8 @@ impl PyWordLevelTrainer { if let Some(kwargs) = kwargs { for (key, val) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "vocab_size" => { builder.vocab_size(val.extract()?); } @@ -828,8 +828,8 @@ impl PyUnigramTrainer { let mut builder = tk::models::unigram::UnigramTrainer::builder(); if let Some(kwargs) = kwargs { for (key, val) in kwargs { - let key: &str = key.extract()?; - match key { + let key: String = key.extract()?; + match key.as_ref() { "vocab_size" => builder.vocab_size(val.extract()?), "show_progress" => builder.show_progress(val.extract()?), "n_sub_iterations" => builder.n_sub_iterations(val.extract()?), diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs index 107d0a27c..9de0ece38 100644 --- a/bindings/python/src/utils/normalization.rs +++ b/bindings/python/src/utils/normalization.rs @@ -8,7 +8,7 @@ use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehav use tk::pattern::Pattern; /// Represents a Pattern as used by `NormalizedString` -#[derive(Clone, FromPyObject)] +#[derive(FromPyObject)] pub enum PyPattern { #[pyo3(annotation = "str")] Str(String), @@ -95,9 +95,9 @@ pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior); impl FromPyObject<'_> for PySplitDelimiterBehavior { fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> { - let s = obj.extract::<&str>()?; + let s = obj.extract::<String>()?; - Ok(Self(match s { + Ok(Self(match s.as_ref() { "removed" => Ok(SplitDelimiterBehavior::Removed), "isolated" => Ok(SplitDelimiterBehavior::Isolated), "merged_with_previous" => Ok(SplitDelimiterBehavior::MergedWithPrevious), diff --git 
a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs index a9879ad90..6a14888d3 100644 --- a/bindings/python/src/utils/pretokenization.rs +++ b/bindings/python/src/utils/pretokenization.rs @@ -70,9 +70,9 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResul pub struct PyOffsetReferential(OffsetReferential); impl FromPyObject<'_> for PyOffsetReferential { fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> { - let s = obj.extract::<&str>()?; + let s = obj.extract::<String>()?; - Ok(Self(match s { + Ok(Self(match s.as_ref() { "original" => Ok(OffsetReferential::Original), "normalized" => Ok(OffsetReferential::Normalized), _ => Err(exceptions::PyValueError::new_err( @@ -86,9 +86,9 @@ impl FromPyObject<'_> for PyOffsetReferential { pub struct PyOffsetType(OffsetType); impl FromPyObject<'_> for PyOffsetType { fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> { - let s = obj.extract::<&str>()?; + let s = obj.extract::<String>()?; - Ok(Self(match s { + Ok(Self(match s.as_ref() { "byte" => Ok(OffsetType::Byte), "char" => Ok(OffsetType::Char), _ => Err(exceptions::PyValueError::new_err(