diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 8203c15afc6c..da56c23b5cd9 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -39,6 +39,7 @@ on: - arrow-integration-test/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-ord/** - arrow-row/** - arrow-schema/** @@ -55,7 +56,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -78,6 +79,8 @@ jobs: run: cargo test -p arrow-csv --all-features - name: Test arrow-json with all features run: cargo test -p arrow-json --all-features + - name: Test arrow-avro with all features + run: cargo test -p arrow-avro --all-features - name: Test arrow-string with all features run: cargo test -p arrow-string --all-features - name: Test arrow-ord with all features @@ -91,7 +94,7 @@ jobs: - name: Test arrow with default features run: cargo test -p arrow - name: Test arrow with all features apart from simd - run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,dyn_cmp_dict,chrono-tz + run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | # Test arrow examples @@ -109,7 +112,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -136,7 +139,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -160,7 +163,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -179,7 +182,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy @@ -202,6 +205,8 @@ jobs: run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - name: Clippy arrow-json with all features run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings + - name: Clippy arrow-avro with all features + run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - name: Clippy arrow-string with all features run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - name: Clippy arrow-ord with all features @@ -211,7 +216,7 @@ jobs: - name: Clippy arrow-row with all features run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - name: Clippy arrow with all features except SIMD - run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,chrono-tz --all-targets -- -D warnings + run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,chrono-tz --all-targets -- -D warnings - name: Clippy arrow-integration-test with all features run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - name: Clippy arrow-integration-testing with all features diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml index 5301a3f8563f..242e0f2a3b0d 100644 --- a/.github/workflows/arrow_flight.yml +++ b/.github/workflows/arrow_flight.yml @@ -47,7 +47,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup 
Rust toolchain @@ -68,7 +68,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Run gen @@ -82,7 +82,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 3fa254142dbe..64b2ca437067 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -36,7 +36,7 @@ jobs: # otherwise we get this error: # Failed to run tests: ASLR disable failed: EPERM: Operation not permitted steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 0eb2d024f352..1447d72a53b1 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -38,7 +38,7 @@ jobs: name: Release Audit Tool (RAT) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -50,8 +50,8 @@ jobs: name: Markdown format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-node@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 with: node-version: "14" - name: Prettier check diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index bb88e9dcd3f5..5f3d9e54c8db 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -37,7 +37,7 @@ jobs: contents: read pull-requests: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Assign GitHub labels if: | diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index e5b86e8bcdf0..ea5873081f18 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -27,6 +27,7 @@ arrow: - arrow-integration-testing/**/* - arrow-ipc/**/* - arrow-json/**/* + - arrow-avro/**/* - arrow-ord/**/* - arrow-row/**/* - arrow-schema/**/* diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index d3f8e9046510..721260892402 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -43,7 +43,7 @@ jobs: env: RUSTDOCFLAGS: "-Dwarnings --enable-index-page -Zunstable-options" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Install python dev @@ -77,7 +77,7 @@ jobs: contents: write runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download crate docs uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9b2e7797d5ff..c9cb4e31ced9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -38,6 +38,7 @@ on: - arrow-integration-testing/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-ord/** - arrow-pyarrow-integration-testing/** - arrow-schema/** @@ -56,7 +57,15 @@ jobs: env: ARROW_USE_CCACHE: OFF ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_RUST_EXE_PATH: /build/rust/debug BUILD_DOCS_CPP: OFF + ARROW_INTEGRATION_CPP: ON + ARROW_INTEGRATION_CSHARP: ON + ARROW_INTEGRATION_GO: ON + ARROW_INTEGRATION_JAVA: ON + ARROW_INTEGRATION_JS: ON + # https://github.com/apache/arrow/pull/38403/files#r1371281630 + 
ARCHERY_INTEGRATION_WITH_RUST: ON # These are necessary because the github runner overrides $HOME # https://github.com/actions/runner/issues/863 RUSTUP_HOME: /root/.rustup @@ -76,48 +85,20 @@ jobs: - name: Check cmake run: which cmake - name: Checkout Arrow - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: apache/arrow submodules: true fetch-depth: 0 - name: Checkout Arrow Rust - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: rust fetch-depth: 0 - - name: Make build directory - run: mkdir /build - - name: Build Rust - run: conda run --no-capture-output ci/scripts/rust_build.sh $PWD /build - - name: Build C++ - run: conda run --no-capture-output ci/scripts/cpp_build.sh $PWD /build - - name: Build C# - run: conda run --no-capture-output ci/scripts/csharp_build.sh $PWD /build - - name: Build Go - run: conda run --no-capture-output ci/scripts/go_build.sh $PWD - - name: Build Java - run: conda run --no-capture-output ci/scripts/java_build.sh $PWD /build - - name: Build JS - run: conda run --no-capture-output ci/scripts/js_build.sh $PWD /build - - name: Install archery - run: conda run --no-capture-output pip install -e dev/archery - - name: Run integration tests - run: | - conda run --no-capture-output archery integration \ - --run-flight \ - --with-cpp=1 \ - --with-csharp=1 \ - --with-java=1 \ - --with-js=1 \ - --with-go=1 \ - --with-rust=1 \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/0.14.1 \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/0.17.1 \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/1.0.0-bigendian \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/1.0.0-littleendian \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/2.0.0-compression \ - --gold-dirs=testing/data/arrow-ipc-stream/integration/4.0.0-shareddict + - name: Build + run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build + - name: Run + run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build # test FFI against the C-Data interface exposed by pyarrow pyarrow-integration-test: @@ -126,8 +107,10 @@ jobs: strategy: matrix: rust: [ stable ] + # PyArrow 13 was the last version prior to the introduction of Arrow PyCapsules + pyarrow: [ "13", "14" ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -148,14 +131,14 @@ jobs: key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - uses: actions/setup-python@v4 with: - python-version: '3.7' + python-version: '3.8' - name: Upgrade pip and setuptools run: pip install --upgrade pip setuptools wheel virtualenv - name: Create virtualenv and install dependencies run: | virtualenv venv source venv/bin/activate - pip install maturin toml pytest pytz pyarrow>=5.0 + pip install maturin toml pytest pytz pyarrow==${{ matrix.pyarrow }} - name: Run Rust tests run: | source venv/bin/activate diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml index 0c1f8069cd40..19b432121b6f 100644 --- a/.github/workflows/miri.yaml +++ b/.github/workflows/miri.yaml @@ -36,6 +36,7 @@ on: - arrow-data/** - arrow-ipc/** - arrow-json/** + - arrow-avro/** - arrow-schema/** - arrow-select/** - arrow-string/** @@ -46,7 +47,7 @@ jobs: name: MIRI runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml 
index 01e14022e122..1b991e33c097 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -43,7 +43,7 @@ jobs: run: working-directory: object_store steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy @@ -60,6 +60,8 @@ jobs: run: cargo clippy --features gcp -- -D warnings - name: Run clippy with azure feature run: cargo clippy --features azure -- -D warnings + - name: Run clippy with http feature + run: cargo clippy --features http -- -D warnings - name: Run clippy with all features run: cargo clippy --all-features -- -D warnings - name: Run clippy with all features and all targets @@ -79,7 +81,7 @@ jobs: env: RUSTDOCFLAGS: "-Dwarnings" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Run cargo doc run: cargo doc --document-private-items --no-deps --workspace --all-features @@ -115,7 +117,7 @@ jobs: GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Configure Fake GCS Server (GCP emulation) # Custom image - see fsouza/fake-gcs-server#1164 @@ -124,7 +126,7 @@ jobs: # Give the container a moment to start up prior to configuring it sleep 1 curl -v -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" - echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" + echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": "", "private_key_id": ""}' > "$GOOGLE_SERVICE_ACCOUNT" - name: Setup WebDav run: docker run -d -p 8080:80 rclone/rclone serve webdav /data --addr :80 @@ -160,7 +162,7 @@ jobs: run: working-directory: object_store steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 55599b776c32..d664a0dc0730 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -40,6 +40,7 @@ on: - arrow-ipc/** - arrow-csv/** - arrow-json/** + - arrow-avro/** - parquet/** - .github/** @@ -51,7 +52,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -74,7 +75,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -116,17 +117,19 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: target: wasm32-unknown-unknown,wasm32-wasi + - name: Install clang # Needed for zlib compilation + run: apt-get update && apt-get install -y clang gcc-multilib - name: Build wasm32-unknown-unknown - run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-unknown-unknown + run: cargo build -p parquet --target wasm32-unknown-unknown - name: Build wasm32-wasi - run: cargo build -p parquet --no-default-features --features cli,snap,flate2,brotli --target wasm32-wasi + run: cargo build -p parquet --target wasm32-wasi pyspark-integration-test: name: PySpark Integration Test @@ -135,7 +138,7 @@ jobs: matrix: rust: [ stable ] steps: - - uses: 
actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 with: @@ -168,7 +171,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml index 72b90ecfd81a..d8b02f73a8aa 100644 --- a/.github/workflows/parquet_derive.yml +++ b/.github/workflows/parquet_derive.yml @@ -43,7 +43,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Setup Rust toolchain @@ -57,7 +57,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup Clippy diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f198f48dfec5..9c4b28b691b7 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -37,7 +37,7 @@ jobs: name: Test on Mac runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Install protoc with brew @@ -60,7 +60,7 @@ jobs: name: Test on Windows runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true - name: Install protobuf compiler in /d/protoc @@ -93,7 +93,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Setup rustfmt @@ -110,7 +110,7 @@ jobs: container: image: amd64/rust steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index c404133f564e..336adff990bd 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,265 @@ # Historical Changelog +## [48.0.0](https://github.com/apache/arrow-rs/tree/48.0.0) (2023-10-18) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/47.0.0...48.0.0) + +**Breaking changes:** + +- Evaluate null\_regex for string type in csv \(now such values will be parsed as `Null` rather than `""`\) [\#4942](https://github.com/apache/arrow-rs/pull/4942) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haohuaijin](https://github.com/haohuaijin)) +- fix\(csv\)!: infer null for empty column. 
[\#4910](https://github.com/apache/arrow-rs/pull/4910) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- feat: log headers/trailers in flight CLI \(+ minor fixes\) [\#4898](https://github.com/apache/arrow-rs/pull/4898) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- fix\(arrow-json\)!: include null fields in schema inference with a type of Null [\#4894](https://github.com/apache/arrow-rs/pull/4894) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Mark OnCloseRowGroup Send [\#4893](https://github.com/apache/arrow-rs/pull/4893) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) +- Specialize Thrift Decoding \(~40% Faster\) \(\#4891\) [\#4892](https://github.com/apache/arrow-rs/pull/4892) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Make ArrowRowGroupWriter Public and SerializedRowGroupWriter Send [\#4850](https://github.com/apache/arrow-rs/pull/4850) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([devinjdangelo](https://github.com/devinjdangelo)) + +**Implemented enhancements:** + +- Allow schema fields to merge with `Null` datatype [\#4901](https://github.com/apache/arrow-rs/issues/4901) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add option to FlightDataEncoder to always send dictionaries [\#4895](https://github.com/apache/arrow-rs/issues/4895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Rework Thrift Encoding / Decoding of Parquet Metadata [\#4891](https://github.com/apache/arrow-rs/issues/4891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Plans for supporting Extension Array to support Fixed shape tensor Array [\#4890](https://github.com/apache/arrow-rs/issues/4890) +- Implement Take for UnionArray [\#4882](https://github.com/apache/arrow-rs/issues/4882) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Check precision overflow for casting floating to decimal [\#4865](https://github.com/apache/arrow-rs/issues/4865) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace lexical [\#4774](https://github.com/apache/arrow-rs/issues/4774) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add read access to settings in `csv::WriterBuilder` [\#4735](https://github.com/apache/arrow-rs/issues/4735) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve the performance of "DictionaryValue" row encoding [\#4712](https://github.com/apache/arrow-rs/issues/4712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- Should we make blank values and empty string to `None` in csv? 
[\#4939](https://github.com/apache/arrow-rs/issues/4939) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] SubstraitPlan structure is not exported [\#4932](https://github.com/apache/arrow-rs/issues/4932) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Loading page index breaks skipping of pages with nested types [\#4921](https://github.com/apache/arrow-rs/issues/4921) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- CSV schema inference assumes `Utf8` for empty columns [\#4903](https://github.com/apache/arrow-rs/issues/4903) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet: Field Ids are not read from a Parquet file without serialized arrow schema [\#4877](https://github.com/apache/arrow-rs/issues/4877) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- make\_primitive\_scalar function loses DataType Internal information [\#4851](https://github.com/apache/arrow-rs/issues/4851) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- StructBuilder doesn't handle nulls correctly for empty structs [\#4842](https://github.com/apache/arrow-rs/issues/4842) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `NullArray::is_null()` returns `false` incorrectly [\#4835](https://github.com/apache/arrow-rs/issues/4835) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- cast\_string\_to\_decimal should check precision overflow [\#4829](https://github.com/apache/arrow-rs/issues/4829) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Null fields are omitted by `infer_json_schema_from_seekable` [\#4814](https://github.com/apache/arrow-rs/issues/4814) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Support for reading JSON Array to Arrow [\#4905](https://github.com/apache/arrow-rs/issues/4905) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Assume Pages Delimit Records When Offset Index Loaded \(\#4921\) [\#4943](https://github.com/apache/arrow-rs/pull/4943) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update pyo3 requirement from 0.19 to 0.20 [\#4941](https://github.com/apache/arrow-rs/pull/4941) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Add `FileWriter` schema getter [\#4940](https://github.com/apache/arrow-rs/pull/4940) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([haixuanTao](https://github.com/haixuanTao)) +- feat: support parsing for parquet writer option [\#4938](https://github.com/apache/arrow-rs/pull/4938) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([fansehep](https://github.com/fansehep)) +- Export `SubstraitPlan` structure in arrow\_flight::sql \(\#4932\) [\#4933](https://github.com/apache/arrow-rs/pull/4933) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- Update zstd requirement from 0.12.0 to 0.13.0 [\#4923](https://github.com/apache/arrow-rs/pull/4923) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: add method for async read bloom filter 
[\#4917](https://github.com/apache/arrow-rs/pull/4917) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([hengfeiyang](https://github.com/hengfeiyang)) +- Minor: Clarify rationale for `FlightDataEncoder` API, add examples [\#4916](https://github.com/apache/arrow-rs/pull/4916) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Update regex-syntax requirement from 0.7.1 to 0.8.0 [\#4914](https://github.com/apache/arrow-rs/pull/4914) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat: document & streamline flight SQL CLI [\#4912](https://github.com/apache/arrow-rs/pull/4912) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Support Arbitrary JSON values in JSON Reader \(\#4905\) [\#4911](https://github.com/apache/arrow-rs/pull/4911) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup CSV WriterBuilder, Default to AutoSI Second Precision \(\#4735\) [\#4909](https://github.com/apache/arrow-rs/pull/4909) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.68 to =1.0.69 [\#4907](https://github.com/apache/arrow-rs/pull/4907) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- chore: add csv example [\#4904](https://github.com/apache/arrow-rs/pull/4904) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- feat\(schema\): allow null fields to be merged with other datatypes [\#4902](https://github.com/apache/arrow-rs/pull/4902) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kskalski](https://github.com/kskalski)) +- Update proc-macro2 requirement from =1.0.67 to =1.0.68 [\#4900](https://github.com/apache/arrow-rs/pull/4900) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add option to `FlightDataEncoder` to always resend batch dictionaries [\#4896](https://github.com/apache/arrow-rs/pull/4896) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- Fix integration tests [\#4889](https://github.com/apache/arrow-rs/pull/4889) ([tustvold](https://github.com/tustvold)) +- Support Parsing Avro File Headers [\#4888](https://github.com/apache/arrow-rs/pull/4888) ([tustvold](https://github.com/tustvold)) +- Support parquet bloom filter length [\#4885](https://github.com/apache/arrow-rs/pull/4885) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([letian-jiang](https://github.com/letian-jiang)) +- Replace lz4 with lz4\_flex Allowing Compilation for WASM [\#4884](https://github.com/apache/arrow-rs/pull/4884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement Take for UnionArray 
[\#4883](https://github.com/apache/arrow-rs/pull/4883) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Update tonic-build requirement from =0.10.1 to =0.10.2 [\#4881](https://github.com/apache/arrow-rs/pull/4881) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- parquet: Read field IDs from Parquet Schema [\#4878](https://github.com/apache/arrow-rs/pull/4878) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Samrose-Ahmed](https://github.com/Samrose-Ahmed)) +- feat: improve flight CLI error handling [\#4873](https://github.com/apache/arrow-rs/pull/4873) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Support Encoding Parquet Columns in Parallel [\#4871](https://github.com/apache/arrow-rs/pull/4871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Check precision overflow for casting floating to decimal [\#4866](https://github.com/apache/arrow-rs/pull/4866) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make align\_buffers as public API [\#4863](https://github.com/apache/arrow-rs/pull/4863) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Enable new integration tests \(\#4828\) [\#4862](https://github.com/apache/arrow-rs/pull/4862) ([tustvold](https://github.com/tustvold)) +- Faster Serde Integration \(~80% faster\) [\#4861](https://github.com/apache/arrow-rs/pull/4861) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix: make\_primitive\_scalar bug [\#4852](https://github.com/apache/arrow-rs/pull/4852) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JasonLi-cn](https://github.com/JasonLi-cn)) +- Update tonic-build requirement from =0.10.0 to =0.10.1 [\#4846](https://github.com/apache/arrow-rs/pull/4846) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow Constructing Non-Empty StructArray with no Fields \(\#4842\) [\#4845](https://github.com/apache/arrow-rs/pull/4845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Refine documentation to `Array::is_null` [\#4838](https://github.com/apache/arrow-rs/pull/4838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- fix: add missing precision overflow checking for `cast_string_to_decimal` [\#4830](https://github.com/apache/arrow-rs/pull/4830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonahgao](https://github.com/jonahgao)) +## [47.0.0](https://github.com/apache/arrow-rs/tree/47.0.0) (2023-09-19) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/46.0.0...47.0.0) + +**Breaking changes:** + +- Make FixedSizeBinaryArray value\_data return a reference [\#4820](https://github.com/apache/arrow-rs/issues/4820) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Update prost to v0.12.1 [\#4825](https://github.com/apache/arrow-rs/pull/4825) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: FixedSizeBinaryArray::value\_data return reference [\#4821](https://github.com/apache/arrow-rs/pull/4821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Stateless Row Encoding / Don't Preserve Dictionaries in `RowConverter` \(\#4811\) [\#4819](https://github.com/apache/arrow-rs/pull/4819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- fix: entries field is non-nullable [\#4808](https://github.com/apache/arrow-rs/pull/4808) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Fix flight sql do put handling, add bind parameter support to FlightSQL cli client [\#4797](https://github.com/apache/arrow-rs/pull/4797) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([suremarc](https://github.com/suremarc)) +- Remove unused dyn\_cmp\_dict feature [\#4766](https://github.com/apache/arrow-rs/pull/4766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add underlying `std::io::Error` to `IoError` and add `IpcError` variant [\#4726](https://github.com/apache/arrow-rs/pull/4726) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alexandreyc](https://github.com/alexandreyc)) + +**Implemented enhancements:** + +- Row Format Adapative Block Size [\#4812](https://github.com/apache/arrow-rs/issues/4812) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Stateless Row Conversion [\#4811](https://github.com/apache/arrow-rs/issues/4811) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add option to specify custom null values for CSV reader [\#4794](https://github.com/apache/arrow-rs/issues/4794) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet::record::RowIter cannot be customized with batch\_size and defaults to 1024 [\#4782](https://github.com/apache/arrow-rs/issues/4782) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `DynScalar` abstraction \(something that makes it easy to create scalar `Datum`s\) [\#4781](https://github.com/apache/arrow-rs/issues/4781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Datum` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4780](https://github.com/apache/arrow-rs/issues/4780) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Scalar` is not exported as part of `arrow` \(it is only exported in `arrow_array`\) [\#4779](https://github.com/apache/arrow-rs/issues/4779) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support IntoPyArrow for impl RecordBatchReader [\#4730](https://github.com/apache/arrow-rs/issues/4730) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Datum Based String Kernels [\#4595](https://github.com/apache/arrow-rs/issues/4595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] + +**Fixed bugs:** + +- MapArray::new\_from\_strings 
creates nullable entries field [\#4807](https://github.com/apache/arrow-rs/issues/4807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- pyarrow module can't roundtrip tensor arrays [\#4805](https://github.com/apache/arrow-rs/issues/4805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `concat_batches` errors with "schema mismatch" error when only metadata differs [\#4799](https://github.com/apache/arrow-rs/issues/4799) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- panic in `cmp` kernels with DictionaryArrays: `Option::unwrap()` on a `None` value' [\#4788](https://github.com/apache/arrow-rs/issues/4788) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- stream ffi panics if schema metadata values aren't valid utf8 [\#4750](https://github.com/apache/arrow-rs/issues/4750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Regression: Incorrect Sorting of `*ListArray` in 46.0.0 [\#4746](https://github.com/apache/arrow-rs/issues/4746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Row is no longer comparable after reuse [\#4741](https://github.com/apache/arrow-rs/issues/4741) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- DoPut FlightSQL handler inadvertently consumes schema at start of Request\<Streaming\<FlightData\>\> [\#4658](https://github.com/apache/arrow-rs/issues/4658) +- Return error when converting schema [\#4752](https://github.com/apache/arrow-rs/pull/4752) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Implement PyArrowType for `Box\<dyn RecordBatchReader + Send\>` [\#4751](https://github.com/apache/arrow-rs/pull/4751) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) + +**Closed issues:** + +- Building arrow-rust for target wasm32-wasi failed to compile packed\_simd\_2 [\#4717](https://github.com/apache/arrow-rs/issues/4717) + +**Merged pull requests:** + +- Respect FormatOption::nulls for NullArray [\#4836](https://github.com/apache/arrow-rs/pull/4836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix merge\_dictionary\_values in selection kernels [\#4833](https://github.com/apache/arrow-rs/pull/4833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix like scalar null [\#4832](https://github.com/apache/arrow-rs/pull/4832) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More chrono deprecations [\#4822](https://github.com/apache/arrow-rs/pull/4822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Adaptive Row Block Size \(\#4812\) [\#4818](https://github.com/apache/arrow-rs/pull/4818) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.66 to =1.0.67 [\#4816](https://github.com/apache/arrow-rs/pull/4816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Do not check schema for equality in concat\_batches [\#4815](https://github.com/apache/arrow-rs/pull/4815) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- fix: export record batch through stream [\#4806](https://github.com/apache/arrow-rs/pull/4806) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Improve CSV Reader Benchmark Coverage of Small Primitives [\#4803](https://github.com/apache/arrow-rs/pull/4803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- csv: Add option to specify custom null values [\#4795](https://github.com/apache/arrow-rs/pull/4795) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([vrongmeal](https://github.com/vrongmeal)) +- Expand docstring and add example to `Scalar` [\#4793](https://github.com/apache/arrow-rs/pull/4793) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Re-export array crate root \(\#4780\) \(\#4779\) [\#4791](https://github.com/apache/arrow-rs/pull/4791) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix DictionaryArray::normalized\_keys \(\#4788\) [\#4789](https://github.com/apache/arrow-rs/pull/4789) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Allow custom tree builder for parquet::record::RowIter [\#4783](https://github.com/apache/arrow-rs/pull/4783) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([YuraKotov](https://github.com/YuraKotov)) +- Bump actions/checkout from 3 to 4 [\#4767](https://github.com/apache/arrow-rs/pull/4767) ([dependabot[bot]](https://github.com/apps/dependabot)) +- fix: avoid panic if offset index not exists. [\#4761](https://github.com/apache/arrow-rs/pull/4761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([RinChanNOWWW](https://github.com/RinChanNOWWW)) +- Relax constraints on PyArrowType [\#4757](https://github.com/apache/arrow-rs/pull/4757) ([tustvold](https://github.com/tustvold)) +- Chrono deprecations [\#4748](https://github.com/apache/arrow-rs/pull/4748) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix List Sorting, Revert Removal of Rank Kernels [\#4747](https://github.com/apache/arrow-rs/pull/4747) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Clear row buffer before reuse [\#4742](https://github.com/apache/arrow-rs/pull/4742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([yjshen](https://github.com/yjshen)) +- Datum based like kernels \(\#4595\) [\#4732](https://github.com/apache/arrow-rs/pull/4732) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- feat: expose DoGet response headers & trailers [\#4727](https://github.com/apache/arrow-rs/pull/4727) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Cleanup length and bit\_length kernels [\#4718](https://github.com/apache/arrow-rs/pull/4718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +## [46.0.0](https://github.com/apache/arrow-rs/tree/46.0.0) (2023-08-21) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/45.0.0...46.0.0) + +**Breaking changes:** + +- API improvement: `batches_to_flight_data` forces clone [\#4656](https://github.com/apache/arrow-rs/issues/4656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add AnyDictionary 
Abstraction and Take ArrayRef in DictionaryArray::with\_values [\#4707](https://github.com/apache/arrow-rs/pull/4707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Cleanup parquet type builders [\#4706](https://github.com/apache/arrow-rs/pull/4706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Take kernel dyn Array [\#4705](https://github.com/apache/arrow-rs/pull/4705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Improve ergonomics of Scalar [\#4704](https://github.com/apache/arrow-rs/pull/4704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Datum based comparison kernels \(\#4596\) [\#4701](https://github.com/apache/arrow-rs/pull/4701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) +- Improve `Array` Logical Nullability [\#4691](https://github.com/apache/arrow-rs/pull/4691) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Validate ArrayData Buffer Alignment and Automatically Align IPC buffers \(\#4255\) [\#4681](https://github.com/apache/arrow-rs/pull/4681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- More intuitive bool-to-string casting [\#4666](https://github.com/apache/arrow-rs/pull/4666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fsdvh](https://github.com/fsdvh)) +- enhancement: batches\_to\_flight\_data use a schema ref as param. [\#4665](https://github.com/apache/arrow-rs/pull/4665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) +- fix: from\_thrift avoid panic when stats in invalid. 
[\#4642](https://github.com/apache/arrow-rs/pull/4642) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jackwener](https://github.com/jackwener)) +- bug: Add some missing field in row group metadata: ordinal, total co… [\#4636](https://github.com/apache/arrow-rs/pull/4636) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liurenjie1024](https://github.com/liurenjie1024)) +- Remove deprecated limit kernel [\#4597](https://github.com/apache/arrow-rs/pull/4597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) + +**Implemented enhancements:** + +- parquet: support setting the field\_id with an ArrowWriter [\#4702](https://github.com/apache/arrow-rs/issues/4702) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support references in i256 arithmetic ops [\#4694](https://github.com/apache/arrow-rs/issues/4694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Precision-Loss Decimal Arithmetic [\#4664](https://github.com/apache/arrow-rs/issues/4664) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Faster i256 Division [\#4663](https://github.com/apache/arrow-rs/issues/4663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `concat_batches` for 0 columns [\#4661](https://github.com/apache/arrow-rs/issues/4661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `filter_record_batch` should support filtering record batch without columns [\#4647](https://github.com/apache/arrow-rs/issues/4647) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Improve speed of `lexicographical_partition_ranges` [\#4614](https://github.com/apache/arrow-rs/issues/4614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) +- Add Rank Function [\#4606](https://github.com/apache/arrow-rs/issues/4606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Datum Based Comparison Kernels [\#4596](https://github.com/apache/arrow-rs/issues/4596) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Convenience method to create `DataType::List` correctly [\#4544](https://github.com/apache/arrow-rs/issues/4544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Remove Deprecated Arithmetic Kernels [\#4481](https://github.com/apache/arrow-rs/issues/4481) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Equality kernel where null==null gives true [\#4438](https://github.com/apache/arrow-rs/issues/4438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Parquet ArrowWriter Ignores Nulls in Dictionary Values [\#4690](https://github.com/apache/arrow-rs/issues/4690) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Schema Nullability Validation Fails to Account for Dictionary Nulls [\#4689](https://github.com/apache/arrow-rs/issues/4689) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Comparison Kernels Ignore Nulls in Dictionary Values [\#4688](https://github.com/apache/arrow-rs/issues/4688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- 
Casting List to String Ignores Format Options [\#4669](https://github.com/apache/arrow-rs/issues/4669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Double free in C Stream Interface [\#4659](https://github.com/apache/arrow-rs/issues/4659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- CI Failing On Packed SIMD [\#4651](https://github.com/apache/arrow-rs/issues/4651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `RowInterner::size()` much too low for high cardinality dictionary columns [\#4645](https://github.com/apache/arrow-rs/issues/4645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Decimal PrimitiveArray change datatype after try\_unary [\#4644](https://github.com/apache/arrow-rs/issues/4644) +- Better explanation in docs for Dictionary field encoding using RowConverter [\#4639](https://github.com/apache/arrow-rs/issues/4639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `List(FixedSizeBinary)` array equality check may return wrong result [\#4637](https://github.com/apache/arrow-rs/issues/4637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow::compute::nullif` panics if `NullArray` is provided [\#4634](https://github.com/apache/arrow-rs/issues/4634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Empty lists in FixedSizeListArray::try\_new is not handled [\#4623](https://github.com/apache/arrow-rs/issues/4623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Bounds checking in `MutableBuffer::set_null_bits` can be bypassed [\#4620](https://github.com/apache/arrow-rs/issues/4620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- TypedDictionaryArray Misleading Null Behaviour [\#4616](https://github.com/apache/arrow-rs/issues/4616) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- bug: Parquet writer missing row group metadata fields such as `compressed_size`, `file offset`. [\#4610](https://github.com/apache/arrow-rs/issues/4610) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `new_null_array` generates an invalid union array [\#4600](https://github.com/apache/arrow-rs/issues/4600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Footer parsing fails for very large parquet file. 
[\#4592](https://github.com/apache/arrow-rs/issues/4592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- bug\(parquet\): Disabling global statistics but enabling for particular column breaks reading [\#4587](https://github.com/apache/arrow-rs/issues/4587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `arrow::compute::concat` panics for dense union arrays with non-trivial type IDs [\#4578](https://github.com/apache/arrow-rs/issues/4578) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) + +**Merged pull requests:** + +- Add distinct kernels \(\#960\) \(\#4438\) [\#4716](https://github.com/apache/arrow-rs/pull/4716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update parquet object\_store 0.7 [\#4715](https://github.com/apache/arrow-rs/pull/4715) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Support Field ID in ArrowWriter \(\#4702\) [\#4710](https://github.com/apache/arrow-rs/pull/4710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Remove rank kernels [\#4703](https://github.com/apache/arrow-rs/pull/4703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support references in i256 arithmetic ops [\#4692](https://github.com/apache/arrow-rs/pull/4692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cleanup DynComparator \(\#2654\) [\#4687](https://github.com/apache/arrow-rs/pull/4687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Separate metadata fetch from `ArrowReaderBuilder` construction \(\#4674\) [\#4676](https://github.com/apache/arrow-rs/pull/4676) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- cleanup some assert\(\) with error propagation [\#4673](https://github.com/apache/arrow-rs/pull/4673) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Faster i256 Division \(2-100x\) \(\#4663\) [\#4672](https://github.com/apache/arrow-rs/pull/4672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) +- Fix equality of nested nullable FixedSizeBinary \(\#4637\) [\#4670](https://github.com/apache/arrow-rs/pull/4670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use ArrayFormatter in cast kernel [\#4668](https://github.com/apache/arrow-rs/pull/4668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Minor: Improve API docs for FlightSQL metadata builders [\#4667](https://github.com/apache/arrow-rs/pull/4667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) +- Support `concat_batches` for 0 columns [\#4662](https://github.com/apache/arrow-rs/pull/4662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([Dandandan](https://github.com/Dandandan)) +- fix ownership of c stream error [\#4660](https://github.com/apache/arrow-rs/pull/4660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) +- Minor: Fix illustration for dict encoding [\#4657](https://github.com/apache/arrow-rs/pull/4657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) +- minor: move comment to the correct location [\#4655](https://github.com/apache/arrow-rs/pull/4655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- Update packed\_simd and run miri tests on simd code [\#4654](https://github.com/apache/arrow-rs/pull/4654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) +- impl `From\<Vec\<T\>\>` for `BufferBuilder` and `MutableBuffer` [\#4650](https://github.com/apache/arrow-rs/pull/4650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Filter record batch with 0 columns [\#4648](https://github.com/apache/arrow-rs/pull/4648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Account for child `Bucket` size in OrderPreservingInterner [\#4646](https://github.com/apache/arrow-rs/pull/4646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` [\#4638](https://github.com/apache/arrow-rs/pull/4638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- fix\(select\): handle `NullArray` in `nullif` [\#4635](https://github.com/apache/arrow-rs/pull/4635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Move `BufferBuilder` to `arrow-buffer` [\#4630](https://github.com/apache/arrow-rs/pull/4630) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- allow zero sized empty fixed [\#4626](https://github.com/apache/arrow-rs/pull/4626) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- fix: compute\_dictionary\_mapping use wrong offsetSize [\#4625](https://github.com/apache/arrow-rs/pull/4625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) +- impl `FromIterator` for `MutableBuffer` [\#4624](https://github.com/apache/arrow-rs/pull/4624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- expand docs for FixedSizeListArray [\#4622](https://github.com/apache/arrow-rs/pull/4622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) +- fix\(buffer\): panic on end index overflow in `MutableBuffer::set_null_bits` [\#4621](https://github.com/apache/arrow-rs/pull/4621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- impl `Default` for `arrow_buffer::buffer::MutableBuffer` [\#4619](https://github.com/apache/arrow-rs/pull/4619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) +- Minor: improve docs and add example for lexicographical\_partition\_ranges [\#4615](https://github.com/apache/arrow-rs/pull/4615) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Cleanup sort [\#4613](https://github.com/apache/arrow-rs/pull/4613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add rank function \(\#4606\) [\#4609](https://github.com/apache/arrow-rs/pull/4609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add more docs and examples for ListArray and OffsetsBuffer [\#4607](https://github.com/apache/arrow-rs/pull/4607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Simplify dictionary sort [\#4605](https://github.com/apache/arrow-rs/pull/4605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Consolidate sort benchmarks [\#4604](https://github.com/apache/arrow-rs/pull/4604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Don't Reorder Nulls in sort\_to\_indices \(\#4545\) [\#4603](https://github.com/apache/arrow-rs/pull/4603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- fix\(data\): create child arrays of correct length when building a sparse union null array [\#4601](https://github.com/apache/arrow-rs/pull/4601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Use u32 metadata\_len when parsing footer of parquet. [\#4599](https://github.com/apache/arrow-rs/pull/4599) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Berrysoft](https://github.com/Berrysoft)) +- fix\(data\): map type ID to child index before indexing a union child array [\#4598](https://github.com/apache/arrow-rs/pull/4598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) +- Remove deprecated arithmetic kernels \(\#4481\) [\#4594](https://github.com/apache/arrow-rs/pull/4594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Test Disabled Page Statistics \(\#4587\) [\#4589](https://github.com/apache/arrow-rs/pull/4589) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cleanup ArrayData::buffers [\#4583](https://github.com/apache/arrow-rs/pull/4583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use contains\_nulls in ArrayData equality of byte arrays [\#4582](https://github.com/apache/arrow-rs/pull/4582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Vectorized lexicographical\_partition\_ranges \(~80% faster\) [\#4575](https://github.com/apache/arrow-rs/pull/4575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- chore: add datatype new\_list [\#4561](https://github.com/apache/arrow-rs/pull/4561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) ## [45.0.0](https://github.com/apache/arrow-rs/tree/45.0.0) (2023-07-30) [Full Changelog](https://github.com/apache/arrow-rs/compare/44.0.0...45.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74f74bc3ef13..ba27d6679ffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,116 +19,54 @@ # Changelog -## [46.0.0](https://github.com/apache/arrow-rs/tree/46.0.0) (2023-08-21) 
+## [49.0.0](https://github.com/apache/arrow-rs/tree/49.0.0) (2023-11-07) -[Full Changelog](https://github.com/apache/arrow-rs/compare/45.0.0...46.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/48.0.0...49.0.0) **Breaking changes:** -- API improvement: `batches_to_flight_data` forces clone [\#4656](https://github.com/apache/arrow-rs/issues/4656) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add AnyDictionary Abstraction and Take ArrayRef in DictionaryArray::with\_values [\#4707](https://github.com/apache/arrow-rs/pull/4707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Cleanup parquet type builders [\#4706](https://github.com/apache/arrow-rs/pull/4706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Take kernel dyn Array [\#4705](https://github.com/apache/arrow-rs/pull/4705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Improve ergonomics of Scalar [\#4704](https://github.com/apache/arrow-rs/pull/4704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Datum based comparison kernels \(\#4596\) [\#4701](https://github.com/apache/arrow-rs/pull/4701) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([tustvold](https://github.com/tustvold)) -- Improve `Array` Logical Nullability [\#4691](https://github.com/apache/arrow-rs/pull/4691) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Validate ArrayData Buffer Alignment and Automatically Align IPC buffers \(\#4255\) [\#4681](https://github.com/apache/arrow-rs/pull/4681) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- More intuitive bool-to-string casting [\#4666](https://github.com/apache/arrow-rs/pull/4666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fsdvh](https://github.com/fsdvh)) -- enhancement: batches\_to\_flight\_data use a schema ref as param. [\#4665](https://github.com/apache/arrow-rs/pull/4665) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([jackwener](https://github.com/jackwener)) -- fix: from\_thrift avoid panic when stats in invalid. 
[\#4642](https://github.com/apache/arrow-rs/pull/4642) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jackwener](https://github.com/jackwener)) -- bug: Add some missing field in row group metadata: ordinal, total co… [\#4636](https://github.com/apache/arrow-rs/pull/4636) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liurenjie1024](https://github.com/liurenjie1024)) -- Remove deprecated limit kernel [\#4597](https://github.com/apache/arrow-rs/pull/4597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Return row count when inferring schema from JSON [\#5008](https://github.com/apache/arrow-rs/pull/5008) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([asayers](https://github.com/asayers)) +- Update object\_store 0.8.0 [\#5043](https://github.com/apache/arrow-rs/pull/5043) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) **Implemented enhancements:** -- parquet: support setting the field\_id with an ArrowWriter [\#4702](https://github.com/apache/arrow-rs/issues/4702) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support references in i256 arithmetic ops [\#4694](https://github.com/apache/arrow-rs/issues/4694) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Precision-Loss Decimal Arithmetic [\#4664](https://github.com/apache/arrow-rs/issues/4664) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Faster i256 Division [\#4663](https://github.com/apache/arrow-rs/issues/4663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `concat_batches` for 0 columns [\#4661](https://github.com/apache/arrow-rs/issues/4661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `filter_record_batch` should support filtering record batch without columns [\#4647](https://github.com/apache/arrow-rs/issues/4647) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Improve speed of `lexicographical_partition_ranges` [\#4614](https://github.com/apache/arrow-rs/issues/4614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- object\_store: multipart ranges for HTTP [\#4612](https://github.com/apache/arrow-rs/issues/4612) -- Add Rank Function [\#4606](https://github.com/apache/arrow-rs/issues/4606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Datum Based Comparison Kernels [\#4596](https://github.com/apache/arrow-rs/issues/4596) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Convenience method to create `DataType::List` correctly [\#4544](https://github.com/apache/arrow-rs/issues/4544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Remove Deprecated Arithmetic Kernels [\#4481](https://github.com/apache/arrow-rs/issues/4481) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Equality kernel where null==null gives true [\#4438](https://github.com/apache/arrow-rs/issues/4438) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast from integer/timestamp to timestamp/integer [\#5039](https://github.com/apache/arrow-rs/issues/5039) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support casting from integer to binary [\#5014](https://github.com/apache/arrow-rs/issues/5014) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Return row count when 
inferring schema from JSON [\#5007](https://github.com/apache/arrow-rs/issues/5007) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] Allow custom commands in get-flight-info [\#4996](https://github.com/apache/arrow-rs/issues/4996) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support `RecordBatch::remove_column()` and `Schema::remove_field()` [\#4952](https://github.com/apache/arrow-rs/issues/4952) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow_json`: support `binary` deserialization [\#4945](https://github.com/apache/arrow-rs/issues/4945) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support StructArray in Cast Kernel [\#4908](https://github.com/apache/arrow-rs/issues/4908) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- There exists a `ParquetRecordWriter` proc macro in `parquet_derive`, but `ParquetRecordReader` is missing [\#4772](https://github.com/apache/arrow-rs/issues/4772) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Parquet ArrowWriter Ignores Nulls in Dictionary Values [\#4690](https://github.com/apache/arrow-rs/issues/4690) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Schema Nullability Validation Fails to Account for Dictionary Nulls [\#4689](https://github.com/apache/arrow-rs/issues/4689) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Comparison Kernels Ignore Nulls in Dictionary Values [\#4688](https://github.com/apache/arrow-rs/issues/4688) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Casting List to String Ignores Format Options [\#4669](https://github.com/apache/arrow-rs/issues/4669) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Double free in C Stream Interface [\#4659](https://github.com/apache/arrow-rs/issues/4659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- CI Failing On Packed SIMD [\#4651](https://github.com/apache/arrow-rs/issues/4651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `RowInterner::size()` much too low for high cardinality dictionary columns [\#4645](https://github.com/apache/arrow-rs/issues/4645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Decimal PrimitiveArray change datatype after try\_unary [\#4644](https://github.com/apache/arrow-rs/issues/4644) -- Better explanation in docs for Dictionary field encoding using RowConverter [\#4639](https://github.com/apache/arrow-rs/issues/4639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `List(FixedSizeBinary)` array equality check may return wrong result [\#4637](https://github.com/apache/arrow-rs/issues/4637) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `arrow::compute::nullif` panics if `NullArray` is provided [\#4634](https://github.com/apache/arrow-rs/issues/4634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Empty lists in FixedSizeListArray::try\_new is not handled [\#4623](https://github.com/apache/arrow-rs/issues/4623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Bounds checking in `MutableBuffer::set_null_bits` can be bypassed [\#4620](https://github.com/apache/arrow-rs/issues/4620) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- TypedDictionaryArray Misleading Null Behaviour [\#4616](https://github.com/apache/arrow-rs/issues/4616) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- bug: Parquet writer missing row group metadata fields such as `compressed_size`, `file offset`. [\#4610](https://github.com/apache/arrow-rs/issues/4610) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `new_null_array` generates an invalid union array [\#4600](https://github.com/apache/arrow-rs/issues/4600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Footer parsing fails for very large parquet file. [\#4592](https://github.com/apache/arrow-rs/issues/4592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- bug\(parquet\): Disabling global statistics but enabling for particular column breaks reading [\#4587](https://github.com/apache/arrow-rs/issues/4587) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `arrow::compute::concat` panics for dense union arrays with non-trivial type IDs [\#4578](https://github.com/apache/arrow-rs/issues/4578) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] - -**Closed issues:** - -- \[object\_store\] when Create a AmazonS3 instance work with MinIO without set endpoint got error MissingRegion [\#4617](https://github.com/apache/arrow-rs/issues/4617) +- Regression when serializing large json numbers [\#5038](https://github.com/apache/arrow-rs/issues/5038) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- RowSelection::intersection Produces Invalid RowSelection [\#5036](https://github.com/apache/arrow-rs/issues/5036) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Incorrect comment on arrow::compute::kernels::sort::sort\_to\_indices [\#5029](https://github.com/apache/arrow-rs/issues/5029) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- chore: Update docs to refer to non deprecated function \(`partition`\) [\#5027](https://github.com/apache/arrow-rs/pull/5027) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) **Merged pull requests:** -- Add distinct kernels \(\#960\) \(\#4438\) [\#4716](https://github.com/apache/arrow-rs/pull/4716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update parquet object\_store 0.7 [\#4715](https://github.com/apache/arrow-rs/pull/4715) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Support Field ID in ArrowWriter \(\#4702\) [\#4710](https://github.com/apache/arrow-rs/pull/4710) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Remove rank kernels [\#4703](https://github.com/apache/arrow-rs/pull/4703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support references in i256 arithmetic ops [\#4692](https://github.com/apache/arrow-rs/pull/4692) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Cleanup DynComparator \(\#2654\) [\#4687](https://github.com/apache/arrow-rs/pull/4687) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Separate metadata fetch from `ArrowReaderBuilder` construction \(\#4674\) 
[\#4676](https://github.com/apache/arrow-rs/pull/4676) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- cleanup some assert\(\) with error propagation [\#4673](https://github.com/apache/arrow-rs/pull/4673) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Faster i256 Division \(2-100x\) \(\#4663\) [\#4672](https://github.com/apache/arrow-rs/pull/4672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Fix MSRV CI [\#4671](https://github.com/apache/arrow-rs/pull/4671) ([tustvold](https://github.com/tustvold)) -- Fix equality of nested nullable FixedSizeBinary \(\#4637\) [\#4670](https://github.com/apache/arrow-rs/pull/4670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use ArrayFormatter in cast kernel [\#4668](https://github.com/apache/arrow-rs/pull/4668) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Minor: Improve API docs for FlightSQL metadata builders [\#4667](https://github.com/apache/arrow-rs/pull/4667) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([alamb](https://github.com/alamb)) -- Support `concat_batches` for 0 columns [\#4662](https://github.com/apache/arrow-rs/pull/4662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- fix ownership of c stream error [\#4660](https://github.com/apache/arrow-rs/pull/4660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([wjones127](https://github.com/wjones127)) -- Minor: Fix illustration for dict encoding [\#4657](https://github.com/apache/arrow-rs/pull/4657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JayjeetAtGithub](https://github.com/JayjeetAtGithub)) -- minor: move comment to the correct location [\#4655](https://github.com/apache/arrow-rs/pull/4655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- Update packed\_simd and run miri tests on simd code [\#4654](https://github.com/apache/arrow-rs/pull/4654) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann)) -- impl `From>` for `BufferBuilder` and `MutableBuffer` [\#4650](https://github.com/apache/arrow-rs/pull/4650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Filter record batch with 0 columns [\#4648](https://github.com/apache/arrow-rs/pull/4648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Account for child `Bucket` size in OrderPreservingInterner [\#4646](https://github.com/apache/arrow-rs/pull/4646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Implement `Default`,`Extend` and `FromIterator` for `BufferBuilder` [\#4638](https://github.com/apache/arrow-rs/pull/4638) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- fix\(select\): handle `NullArray` in `nullif` [\#4635](https://github.com/apache/arrow-rs/pull/4635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- Move `BufferBuilder` to `arrow-buffer` [\#4630](https://github.com/apache/arrow-rs/pull/4630) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- allow zero sized empty fixed [\#4626](https://github.com/apache/arrow-rs/pull/4626) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) -- fix: compute\_dictionary\_mapping use wrong offsetSize [\#4625](https://github.com/apache/arrow-rs/pull/4625) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jackwener](https://github.com/jackwener)) -- impl `FromIterator` for `MutableBuffer` [\#4624](https://github.com/apache/arrow-rs/pull/4624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- expand docs for FixedSizeListArray [\#4622](https://github.com/apache/arrow-rs/pull/4622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([smiklos](https://github.com/smiklos)) -- fix\(buffer\): panic on end index overflow in `MutableBuffer::set_null_bits` [\#4621](https://github.com/apache/arrow-rs/pull/4621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- impl `Default` for `arrow_buffer::buffer::MutableBuffer` [\#4619](https://github.com/apache/arrow-rs/pull/4619) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([mbrobbel](https://github.com/mbrobbel)) -- Minor: improve docs and add example for lexicographical\_partition\_ranges [\#4615](https://github.com/apache/arrow-rs/pull/4615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Cleanup sort [\#4613](https://github.com/apache/arrow-rs/pull/4613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add rank function \(\#4606\) [\#4609](https://github.com/apache/arrow-rs/pull/4609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add more docs and examples for ListArray and OffsetsBuffer [\#4607](https://github.com/apache/arrow-rs/pull/4607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Simplify dictionary sort [\#4605](https://github.com/apache/arrow-rs/pull/4605) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Consolidate sort benchmarks [\#4604](https://github.com/apache/arrow-rs/pull/4604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Don't Reorder Nulls in sort\_to\_indices \(\#4545\) [\#4603](https://github.com/apache/arrow-rs/pull/4603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- fix\(data\): create child arrays of correct length when building a sparse union null array [\#4601](https://github.com/apache/arrow-rs/pull/4601) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- Use u32 metadata\_len when parsing footer of parquet. 
[\#4599](https://github.com/apache/arrow-rs/pull/4599) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Berrysoft](https://github.com/Berrysoft)) -- fix\(data\): map type ID to child index before indexing a union child array [\#4598](https://github.com/apache/arrow-rs/pull/4598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kawadakk](https://github.com/kawadakk)) -- Remove deprecated arithmetic kernels \(\#4481\) [\#4594](https://github.com/apache/arrow-rs/pull/4594) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Test Disabled Page Statistics \(\#4587\) [\#4589](https://github.com/apache/arrow-rs/pull/4589) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Cleanup ArrayData::buffers [\#4583](https://github.com/apache/arrow-rs/pull/4583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Use contains\_nulls in ArrayData equality of byte arrays [\#4582](https://github.com/apache/arrow-rs/pull/4582) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Vectorized lexicographical\_partition\_ranges \(~80% faster\) [\#4575](https://github.com/apache/arrow-rs/pull/4575) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- chore: add datatype new\_list [\#4561](https://github.com/apache/arrow-rs/pull/4561) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- Parquet f32/f64 handle signed zeros in statistics [\#5048](https://github.com/apache/arrow-rs/pull/5048) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Jefffrey](https://github.com/Jefffrey)) +- Fix serialization of large integers in JSON \(\#5038\) [\#5042](https://github.com/apache/arrow-rs/pull/5042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix RowSelection::intersection \(\#5036\) [\#5041](https://github.com/apache/arrow-rs/pull/5041) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Cast from integer/timestamp to timestamp/integer [\#5040](https://github.com/apache/arrow-rs/pull/5040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- doc: update comment on sort\_to\_indices to reflect correct ordering [\#5033](https://github.com/apache/arrow-rs/pull/5033) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([westonpace](https://github.com/westonpace)) +- Support casting from integer to binary [\#5015](https://github.com/apache/arrow-rs/pull/5015) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update tracing-log requirement from 0.1 to 0.2 [\#4998](https://github.com/apache/arrow-rs/pull/4998) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- feat\(flight-sql\): Allow custom commands in get-flight-info [\#4997](https://github.com/apache/arrow-rs/pull/4997) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([amartins23](https://github.com/amartins23)) +- \[MINOR\] No need to jump to web pages 
[\#4994](https://github.com/apache/arrow-rs/pull/4994) ([smallzhongfeng](https://github.com/smallzhongfeng)) +- Support metadata in SchemaBuilder [\#4987](https://github.com/apache/arrow-rs/pull/4987) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- feat: support schema change by idx and reverse [\#4985](https://github.com/apache/arrow-rs/pull/4985) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([fansehep](https://github.com/fansehep)) +- Bump actions/setup-node from 3 to 4 [\#4982](https://github.com/apache/arrow-rs/pull/4982) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add arrow\_cast::base64 and document usage in arrow\_json [\#4975](https://github.com/apache/arrow-rs/pull/4975) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add SchemaBuilder::remove \(\#4952\) [\#4964](https://github.com/apache/arrow-rs/pull/4964) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add `Field::remove()`, `Schema::remove()`, and `RecordBatch::remove_column()` APIs [\#4959](https://github.com/apache/arrow-rs/pull/4959) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Folyd](https://github.com/Folyd)) +- Add `RecordReader` trait and proc macro to implement it for a struct [\#4773](https://github.com/apache/arrow-rs/pull/4773) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Joseph-Rance](https://github.com/Joseph-Rance)) diff --git a/Cargo.toml b/Cargo.toml index b118c937ca36..d5e834316b91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "arrow", "arrow-arith", "arrow-array", + "arrow-avro", "arrow-buffer", "arrow-cast", "arrow-csv", @@ -61,7 +62,7 @@ exclude = [ ] [workspace.package] -version = "46.0.0" +version = "49.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow <dev@arrow.apache.org>"] @@ -76,18 +77,20 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "46.0.0", path = "./arrow", default-features = false } -arrow-arith = { version = "46.0.0", path = "./arrow-arith" } -arrow-array = { version = "46.0.0", path = "./arrow-array" } -arrow-buffer = { version = "46.0.0", path = "./arrow-buffer" } -arrow-cast = { version = "46.0.0", path = "./arrow-cast" } -arrow-csv = { version = "46.0.0", path = "./arrow-csv" } -arrow-data = { version = "46.0.0", path = "./arrow-data" } -arrow-ipc = { version = "46.0.0", path = "./arrow-ipc" } -arrow-json = { version = "46.0.0", path = "./arrow-json" } -arrow-ord = { version = "46.0.0", path = "./arrow-ord" } -arrow-row = { version = "46.0.0", path = "./arrow-row" } -arrow-schema = { version = "46.0.0", path = "./arrow-schema" } -arrow-select = { version = "46.0.0", path = "./arrow-select" } -arrow-string = { version = "46.0.0", path = "./arrow-string" } -parquet = { version = "46.0.0", path = "./parquet", default-features = false } +arrow = { version = "49.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "49.0.0", path = "./arrow-arith" } +arrow-array = { version = "49.0.0", path = "./arrow-array" } +arrow-buffer = { version = "49.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "49.0.0", path = "./arrow-cast" } +arrow-csv = { version = "49.0.0", path = "./arrow-csv" } +arrow-data = { version = "49.0.0", path = "./arrow-data" } +arrow-ipc = { version = "49.0.0", path = "./arrow-ipc" } +arrow-json = {
version = "49.0.0", path = "./arrow-json" } +arrow-ord = { version = "49.0.0", path = "./arrow-ord" } +arrow-row = { version = "49.0.0", path = "./arrow-row" } +arrow-schema = { version = "49.0.0", path = "./arrow-schema" } +arrow-select = { version = "49.0.0", path = "./arrow-select" } +arrow-string = { version = "49.0.0", path = "./arrow-string" } +parquet = { version = "49.0.0", path = "./parquet", default-features = false } + +chrono = { version = "0.4.31", default-features = false, features = ["clock"] } diff --git a/README.md b/README.md index c3108917e87a..8cd3ec970b53 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,6 @@ There is more information in the [contributing] guide. [flight-readme]: arrow-flight/README.md [datafusion-readme]: https://github.com/apache/arrow-datafusion/blob/master/README.md [ballista-readme]: https://github.com/apache/arrow-ballista/blob/master/README.md -[objectstore-readme]: https://github.com/apache/arrow-rs/blob/master/object_store/README.md +[objectstore-readme]: object_store/README.md [issues]: https://github.com/apache/arrow-rs/issues [discussions]: https://github.com/apache/arrow-rs/discussions diff --git a/arrow-arith/Cargo.toml b/arrow-arith/Cargo.toml index b5ea2e3c4354..57dc033e9645 100644 --- a/arrow-arith/Cargo.toml +++ b/arrow-arith/Cargo.toml @@ -38,7 +38,7 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } -chrono = { version = "0.4.23", default-features = false } +chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/arrow-arith/src/aggregate.rs b/arrow-arith/src/aggregate.rs index 04417c666c85..0dabaa50f5f6 100644 --- a/arrow-arith/src/aggregate.rs +++ b/arrow-arith/src/aggregate.rs @@ -207,15 +207,15 @@ where } let iter = ArrayIter::new(array); - let sum = - iter.into_iter() - .try_fold(T::default_value(), |accumulator, value| { - if let Some(value) = value { - accumulator.add_checked(value) - } else { - Ok(accumulator) - } - })?; + let sum = iter + .into_iter() + .try_fold(T::default_value(), |accumulator, value| { + if let Some(value) = value { + accumulator.add_checked(value) + } else { + Ok(accumulator) + } + })?; Ok(Some(sum)) } @@ -230,11 +230,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_array_helper::( - array, - |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, - min, - ) + min_max_array_helper::(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b, min) } /// Returns the max of values in the array of `ArrowNumericType` type, or dictionary @@ -244,11 +240,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeType, { - min_max_array_helper::( - array, - |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, - max, - ) + min_max_array_helper::(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b, max) } fn min_max_array_helper, F, M>( @@ -501,10 +493,7 @@ mod simd { fn init_accumulator_chunk() -> Self::SimdAccumulator; /// Updates the accumulator with the values of one chunk - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ); + fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd); /// Updates the accumulator with the values of one chunk according to the given vector mask fn accumulate_chunk_nullable( @@ -602,10 +591,7 @@ mod simd { (T::init(T::default_value()), T::mask_init(false)) } - fn accumulate_chunk_non_null( - accumulator: &mut 
Self::SimdAccumulator, - chunk: T::Simd, - ) { + fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd) { let acc_is_nan = !T::eq(accumulator.0, accumulator.0); let is_lt = acc_is_nan | T::lt(chunk, accumulator.0); let first_or_lt = !accumulator.1 | is_lt; @@ -627,10 +613,7 @@ mod simd { accumulator.1 |= vecmask; } - fn accumulate_scalar( - accumulator: &mut Self::ScalarAccumulator, - value: T::Native, - ) { + fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native) { if !accumulator.1 { accumulator.0 = value; } else { @@ -690,10 +673,7 @@ mod simd { (T::init(T::default_value()), T::mask_init(false)) } - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ) { + fn accumulate_chunk_non_null(accumulator: &mut Self::SimdAccumulator, chunk: T::Simd) { let chunk_is_nan = !T::eq(chunk, chunk); let is_gt = chunk_is_nan | T::gt(chunk, accumulator.0); let first_or_gt = !accumulator.1 | is_gt; @@ -715,10 +695,7 @@ mod simd { accumulator.1 |= vecmask; } - fn accumulate_scalar( - accumulator: &mut Self::ScalarAccumulator, - value: T::Native, - ) { + fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native) { if !accumulator.1 { accumulator.0 = value; } else { @@ -1009,8 +986,7 @@ mod tests { #[test] fn test_primitive_array_bool_or_with_nulls() { - let a = - BooleanArray::from(vec![None, Some(false), Some(false), None, Some(false)]); + let a = BooleanArray::from(vec![None, Some(false), Some(false), None, Some(false)]); assert!(!bool_or(&a).unwrap()); } @@ -1297,8 +1273,7 @@ mod tests { assert_eq!(Some(false), min_boolean(&a)); assert_eq!(Some(true), max_boolean(&a)); - let a = - BooleanArray::from(vec![Some(false), Some(true), None, Some(false), None]); + let a = BooleanArray::from(vec![Some(false), Some(true), None, Some(false), None]); assert_eq!(Some(false), min_boolean(&a)); assert_eq!(Some(true), max_boolean(&a)); } diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 8635ce0ddd80..124614d77f97 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -48,8 +48,7 @@ fn get_fixed_point_info( ))); } - let divisor = - i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); + let divisor = i256::from_i128(10).pow_wrapping((product_scale - required_scale) as u32); Ok((precision, product_scale, divisor)) } @@ -78,8 +77,7 @@ pub fn multiply_fixed_point_dyn( let left = left.as_any().downcast_ref::<Decimal128Array>().unwrap(); let right = right.as_any().downcast_ref::<Decimal128Array>().unwrap(); - multiply_fixed_point(left, right, required_scale) - .map(|a| Arc::new(a) as ArrayRef) + multiply_fixed_point(left, right, required_scale).map(|a| Arc::new(a) as ArrayRef) } (_, _) => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -113,10 +111,8 @@ pub fn multiply_fixed_point_checked( )?; if required_scale == product_scale { - return try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { - a.mul_checked(b) - })? - .with_precision_and_scale(precision, required_scale); + return try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| a.mul_checked(b))?
+ .with_precision_and_scale(precision, required_scale); } try_binary::<_, _, _, Decimal128Type>(left, right, |a, b| { @@ -213,17 +209,16 @@ mod tests { .unwrap(); let err = mul(&a, &b).unwrap_err(); - assert!(err.to_string().contains( - "Overflow happened on: 123456789000000000000000000 * 10000000000000000000" - )); + assert!(err + .to_string() + .contains("Overflow happened on: 123456789000000000000000000 * 10000000000000000000")); // Allow precision loss. let result = multiply_fixed_point_checked(&a, &b, 28).unwrap(); // [1234567890] - let expected = - Decimal128Array::from(vec![12345678900000000000000000000000000000]) - .with_precision_and_scale(38, 28) - .unwrap(); + let expected = Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); assert_eq!(&expected, &result); assert_eq!( @@ -233,13 +228,9 @@ mod tests { // Rounding case // [0.000000000000000001, 123456789.555555555555555555, 1.555555555555555555] - let a = Decimal128Array::from(vec![ - 1, - 123456789555555555555555555, - 1555555555555555555, - ]) - .with_precision_and_scale(38, 18) - .unwrap(); + let a = Decimal128Array::from(vec![1, 123456789555555555555555555, 1555555555555555555]) + .with_precision_and_scale(38, 18) + .unwrap(); // [1.555555555555555555, 11.222222222222222222, 0.000000000000000001] let b = Decimal128Array::from(vec![1555555555555555555, 11222222222222222222, 1]) @@ -311,10 +302,9 @@ mod tests { )); let result = multiply_fixed_point(&a, &b, 28).unwrap(); - let expected = - Decimal128Array::from(vec![62946009661555981610246871926660136960]) - .with_precision_and_scale(38, 28) - .unwrap(); + let expected = Decimal128Array::from(vec![62946009661555981610246871926660136960]) + .with_precision_and_scale(38, 28) + .unwrap(); assert_eq!(&expected, &result); } @@ -338,10 +328,9 @@ mod tests { // Avoid overflow by reducing the scale. let result = multiply_fixed_point(&a, &b, 28).unwrap(); // [1234567890] - let expected = - Decimal128Array::from(vec![12345678900000000000000000000000000000]) - .with_precision_and_scale(38, 28) - .unwrap(); + let expected = Decimal128Array::from(vec![12345678900000000000000000000000000000]) + .with_precision_and_scale(38, 28) + .unwrap(); assert_eq!(&expected, &result); assert_eq!( diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs index f3118d104536..ff8b82a5d943 100644 --- a/arrow-arith/src/arity.rs +++ b/arrow-arith/src/arity.rs @@ -49,10 +49,7 @@ where } /// See [`PrimitiveArray::try_unary`] -pub fn try_unary<I, F, O>( - array: &PrimitiveArray<I>, - op: F, -) -> Result<PrimitiveArray<O>, ArrowError> +pub fn try_unary<I, F, O>(array: &PrimitiveArray<I>, op: F) -> Result<PrimitiveArray<O>, ArrowError> where I: ArrowPrimitiveType, O: ArrowPrimitiveType, @@ -86,10 +83,7 @@ where } /// A helper function that applies a fallible unary function to a dictionary array with primitive value type. -fn try_unary_dict<K, F, T>( - array: &DictionaryArray<K>, - op: F, -) -> Result<ArrayRef, ArrowError> +fn try_unary_dict<K, F, T>(array: &DictionaryArray<K>, op: F) -> Result<ArrayRef, ArrowError> where K: ArrowDictionaryKeyType + ArrowNumericType, T: ArrowPrimitiveType, @@ -299,8 +293,7 @@ where try_binary_no_nulls(len, a, b, op) } else { let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) - .unwrap(); + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); let mut buffer = BufferBuilder::<O::Native>::new(len); buffer.append_n_zeroed(len); @@ -308,8 +301,7 @@ where nulls.try_for_each_valid_idx(|idx| { unsafe { - *slice.get_unchecked_mut(idx) = - op(a.value_unchecked(idx), b.value_unchecked(idx))?
+ *slice.get_unchecked_mut(idx) = op(a.value_unchecked(idx), b.value_unchecked(idx))? }; Ok::<_, ArrowError>(()) })?; @@ -360,8 +352,7 @@ where try_binary_no_nulls_mut(len, a, b, op) } else { let nulls = - NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()) - .unwrap(); + NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap(); let mut builder = a.into_builder()?; @@ -440,8 +431,7 @@ mod tests { #[test] #[allow(deprecated)] fn test_unary_f64_slice() { - let input = - Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); + let input = Float64Array::from(vec![Some(5.1f64), None, Some(6.8), None, Some(7.2)]); let input_slice = input.slice(1, 4); let result = unary(&input_slice, |n| n.round()); assert_eq!( diff --git a/arrow-arith/src/bitwise.rs b/arrow-arith/src/bitwise.rs index a5dec4638703..c7885952f8ba 100644 --- a/arrow-arith/src/bitwise.rs +++ b/arrow-arith/src/bitwise.rs @@ -212,10 +212,8 @@ mod tests { #[test] fn test_bitwise_shift_left() { let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]); - let right = - UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(u64::MAX)]); - let expected = - UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(0)]); + let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(u64::MAX)]); + let expected = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(0)]); let result = bitwise_shift_left(&left, &right).unwrap(); assert_eq!(expected, result); } @@ -224,18 +222,15 @@ fn test_bitwise_shift_left_scalar() { let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(8)]); let scalar = 2; - let expected = - UInt64Array::from(vec![Some(4), Some(8), None, Some(16), Some(32)]); + let expected = UInt64Array::from(vec![Some(4), Some(8), None, Some(16), Some(32)]); let result = bitwise_shift_left_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); } #[test] fn test_bitwise_shift_right() { - let left = - UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); - let right = - UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(65)]); + let left = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); + let right = UInt64Array::from(vec![Some(5), Some(10), Some(8), Some(12), Some(65)]); let expected = UInt64Array::from(vec![Some(1), Some(2), None, Some(4), Some(1)]); let result = bitwise_shift_right(&left, &right).unwrap(); assert_eq!(expected, result); } #[test] fn test_bitwise_shift_right_scalar() { - let left = - UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); + let left = UInt64Array::from(vec![Some(32), Some(2048), None, Some(16384), Some(3)]); let scalar = 2; - let expected = - UInt64Array::from(vec![Some(8), Some(512), None, Some(4096), Some(0)]); + let expected = UInt64Array::from(vec![Some(8), Some(512), None, Some(4096), Some(0)]); let result = bitwise_shift_right_scalar(&left, scalar).unwrap(); assert_eq!(expected, result); } diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index 46e5998208f1..269a36d66c2b 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -57,10 +57,7 @@ use arrow_schema::ArrowError; /// # Fails /// /// If the operands have different lengths -pub fn and_kleene( - left: &BooleanArray, - right: &BooleanArray, -) -> Result<BooleanArray, ArrowError> { +pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) ->
Result<BooleanArray, ArrowError> { if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform bitwise operation on arrays of different length".to_string(), @@ -155,10 +152,7 @@ pub fn and_kleene( /// # Fails /// /// If the operands have different lengths -pub fn or_kleene( - left: &BooleanArray, - right: &BooleanArray, -) -> Result<BooleanArray, ArrowError> { +pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> { if left.len() != right.len() { return Err(ArrowError::ComputeError( "Cannot perform bitwise operation on arrays of different length".to_string(), @@ -257,10 +251,7 @@ where /// let and_ab = and(&a, &b).unwrap(); /// assert_eq!(and_ab, BooleanArray::from(vec![Some(false), Some(true), None])); /// ``` -pub fn and( - left: &BooleanArray, - right: &BooleanArray, -) -> Result<BooleanArray, ArrowError> { +pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArray, ArrowError> { binary_boolean_kernel(left, right, |a, b| a & b) } @@ -581,8 +572,7 @@ mod tests { let a = a.as_any().downcast_ref::<BooleanArray>().unwrap(); let c = not(a).unwrap(); - let expected = - BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); + let expected = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); assert_eq!(c, expected); } @@ -631,12 +621,10 @@ mod tests { #[test] fn test_bool_array_and_sliced_same_offset() { let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, + false, false, false, false, false, false, false, false, false, false, true, true, ]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + false, false, false, false, false, false, false, false, false, true, false, true, ]); let a = a.slice(8, 4); @@ -654,12 +642,10 @@ mod tests { #[test] fn test_bool_array_and_sliced_same_offset_mod8() { let a = BooleanArray::from(vec![ - false, false, true, true, false, false, false, false, false, false, false, - false, + false, false, true, true, false, false, false, false, false, false, false, false, ]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + false, false, false, false, false, false, false, false, false, true, false, true, ]); let a = a.slice(0, 4); @@ -677,8 +663,7 @@ mod tests { #[test] fn test_bool_array_and_sliced_offset1() { let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, + false, false, false, false, false, false, false, false, false, false, true, true, ]); let b = BooleanArray::from(vec![false, true, false, true]); @@ -696,8 +681,7 @@ mod tests { fn test_bool_array_and_sliced_offset2() { let a = BooleanArray::from(vec![false, false, true, true]); let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, + false, false, false, false, false, false, false, false, false, true, false, true, ]); let b = b.slice(8, 4); @@ -730,8 +714,7 @@ mod tests { let c = and(a, b).unwrap(); - let expected = - BooleanArray::from(vec![Some(false), Some(false), None, Some(true)]); + let expected = BooleanArray::from(vec![Some(false), Some(false), None, Some(true)]); assert_eq!(expected, c); } diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index c47731ed5125..b2c87bba5143 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -144,13 +144,13 @@ pub fn neg(array: &dyn Array) -> Result<ArrayRef, ArrowError> { let a = array .as_primitive::<IntervalMonthDayNanoType>() .try_unary::<_, IntervalMonthDayNanoType,
 ArrowError>(|x| { - let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(x); - Ok(IntervalMonthDayNanoType::make_value( - months.neg_checked()?, - days.neg_checked()?, - nanos.neg_checked()?, - )) - })?; + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(x); + Ok(IntervalMonthDayNanoType::make_value( + months.neg_checked()?, + days.neg_checked()?, + nanos.neg_checked()?, + )) + })?; Ok(Arc::new(a)) } t => Err(ArrowError::InvalidArgumentError(format!( @@ -201,11 +201,7 @@ impl Op { } /// Dispatch the given `op` to the appropriate specialized kernel -fn arithmetic_op( - op: Op, - lhs: &dyn Datum, - rhs: &dyn Datum, -) -> Result<ArrayRef, ArrowError> { +fn arithmetic_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result<ArrayRef, ArrowError> { use DataType::*; use IntervalUnit::*; use TimeUnit::*; @@ -675,8 +671,7 @@ fn date_op( (Date64, Op::Sub | Op::SubWrapping, Date64) => { let l = l.as_primitive::<Date64Type>(); let r = r.as_primitive::<Date64Type>(); - let result = - try_op_ref!(DurationMillisecondType, l, l_s, r, r_s, l.sub_checked(r)); + let result = try_op_ref!(DurationMillisecondType, l, l_s, r, r_s, l.sub_checked(r)); return Ok(result); } _ => {} @@ -800,8 +795,7 @@ fn decimal_op( let mul_pow = result_scale - s1 + s2; // p1 - s1 + s2 + result_scale - let result_precision = - (mul_pow.saturating_add(*p1 as i8) as u8).min(T::MAX_PRECISION); + let result_precision = (mul_pow.saturating_add(*p1 as i8) as u8).min(T::MAX_PRECISION); let (l_mul, r_mul) = match mul_pow.cmp(&0) { Ordering::Greater => ( @@ -1158,7 +1152,10 @@ mod tests { .with_precision_and_scale(3, -1) .unwrap(); let err = add(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 10 * 100000000000000000000000000000000000000"); + assert_eq!( + err, + "Compute error: Overflow happened on: 10 * 100000000000000000000000000000000000000" + ); let b = Decimal128Array::from(vec![0]) .with_precision_and_scale(1, 1) @@ -1199,9 +1196,7 @@ mod tests { "1960-01-30T04:23:20Z", ] .into_iter() - .map(|x| { - T::make_value(DateTime::parse_from_rfc3339(x).unwrap().naive_utc()).unwrap() - }) + .map(|x| T::make_value(DateTime::parse_from_rfc3339(x).unwrap().naive_utc()).unwrap()) .collect(); let a = PrimitiveArray::<T>::new(values, None); diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index 7855b6fc6e46..a9c3de5401c1 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -23,9 +23,7 @@ use chrono::{DateTime, Datelike, NaiveDateTime, NaiveTime, Offset, Timelike}; use arrow_array::builder::*; use arrow_array::iterator::ArrayIter; -use arrow_array::temporal_conversions::{ - as_datetime, as_datetime_with_timezone, as_time, -}; +use arrow_array::temporal_conversions::{as_datetime, as_datetime_with_timezone, as_time}; use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; @@ -209,12 +207,9 @@ where } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<&PrimitiveArray<T>, T, _>( - iter, - b, - tz, - |t| t.hour() as i32, - ) + extract_component_from_datetime_array::<&PrimitiveArray<T>, T, _>(iter, b, tz, |t| { + t.hour() as i32 + }) } _ => return_compute_error_with!("hour does not support", array.data_type()), } @@ -289,9 +284,7 @@ pub fn num_days_from_monday_dyn(array: &dyn Array) -> Result<ArrayRef, ArrowError> { -pub fn num_days_from_monday<T>( - array: &PrimitiveArray<T>, -) -> Result<Int32Array, ArrowError> +pub fn num_days_from_monday<T>(array: &PrimitiveArray<T>) -> Result<Int32Array, ArrowError> where T: ArrowTemporalType + ArrowNumericType, i64: From<T::Native>, @@ -318,9 +311,7 @@ pub fn num_days_from_sunday_dyn(array: &dyn Array) -> Result<ArrayRef, ArrowError> { -pub fn num_days_from_sunday<T>( - array:
&PrimitiveArray<T>, -) -> Result<Int32Array, ArrowError> +pub fn num_days_from_sunday<T>(array: &PrimitiveArray<T>) -> Result<Int32Array, ArrowError> where T: ArrowTemporalType + ArrowNumericType, i64: From<T::Native>, @@ -449,11 +440,7 @@ pub fn millisecond_dyn(array: &dyn Array) -> Result<ArrayRef, ArrowError> { } /// Extracts the time fraction of a given temporal array as an array of integers -fn time_fraction_dyn<F>( - array: &dyn Array, - name: &str, - op: F, -) -> Result<ArrayRef, ArrowError> +fn time_fraction_dyn<F>(array: &dyn Array, name: &str, op: F) -> Result<ArrayRef, ArrowError> where F: Fn(NaiveDateTime) -> i32, { @@ -498,14 +485,9 @@ where } DataType::Timestamp(_, Some(tz)) => { let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| { - op(t.naive_local()) - }) + extract_component_from_datetime_array::<_, T, _>(iter, b, tz, |t| op(t.naive_local())) } - _ => return_compute_error_with!( - format!("{name} does not support"), - array.data_type() - ), + _ => return_compute_error_with!(format!("{name} does not support"), array.data_type()), } } @@ -559,8 +541,7 @@ mod tests { #[test] fn test_temporal_array_time64_micro_hour() { - let a: PrimitiveArray<Time64MicrosecondType> = - vec![37800000000, 86339000000].into(); + let a: PrimitiveArray<Time64MicrosecondType> = vec![37800000000, 86339000000].into(); let b = hour(&a).unwrap(); assert_eq!(10, b.value(0)); @@ -623,12 +604,10 @@ mod tests { #[test] fn test_temporal_array_timestamp_quarter_with_timezone() { // 24 * 60 * 60 = 86400 - let a = TimestampSecondArray::from(vec![86400 * 90]) - .with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 90]).with_timezone("+00:00".to_string()); let b = quarter(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = TimestampSecondArray::from(vec![86400 * 90]) - .with_timezone("-10:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 90]).with_timezone("-10:00".to_string()); let b = quarter(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -659,12 +638,10 @@ mod tests { #[test] fn test_temporal_array_timestamp_month_with_timezone() { // 24 * 60 * 60 = 86400 - let a = TimestampSecondArray::from(vec![86400 * 31]) - .with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 31]).with_timezone("+00:00".to_string()); let b = month(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = TimestampSecondArray::from(vec![86400 * 31]) - .with_timezone("-10:00".to_string()); + let a = TimestampSecondArray::from(vec![86400 * 31]).with_timezone("-10:00".to_string()); let b = month(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -672,12 +649,10 @@ mod tests { #[test] fn test_temporal_array_timestamp_day_with_timezone() { // 24 * 60 * 60 = 86400 - let a = - TimestampSecondArray::from(vec![86400]).with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![86400]).with_timezone("+00:00".to_string()); let b = day(&a).unwrap(); assert_eq!(2, b.value(0)); - let a = - TimestampSecondArray::from(vec![86400]).with_timezone("-10:00".to_string()); + let a = TimestampSecondArray::from(vec![86400]).with_timezone("-10:00".to_string()); let b = day(&a).unwrap(); assert_eq!(1, b.value(0)); } @@ -857,8 +832,7 @@ mod tests { #[test] fn test_temporal_array_timestamp_second_with_timezone() { - let a = - TimestampSecondArray::from(vec![10, 20]).with_timezone("+00:00".to_string()); + let a = TimestampSecondArray::from(vec![10, 20]).with_timezone("+00:00".to_string()); let b = second(&a).unwrap(); assert_eq!(10, b.value(0)); assert_eq!(20, b.value(1)); } #[test] fn test_temporal_array_timestamp_minute_with_timezone() { - let a = -
TimestampSecondArray::from(vec![0, 60]).with_timezone("+00:50".to_string()); + let a = TimestampSecondArray::from(vec![0, 60]).with_timezone("+00:50".to_string()); let b = minute(&a).unwrap(); assert_eq!(50, b.value(0)); assert_eq!(51, b.value(1)); @@ -875,48 +848,42 @@ mod tests { #[test] fn test_temporal_array_timestamp_minute_with_negative_timezone() { - let a = - TimestampSecondArray::from(vec![60 * 55]).with_timezone("-00:50".to_string()); + let a = TimestampSecondArray::from(vec![60 * 55]).with_timezone("-00:50".to_string()); let b = minute(&a).unwrap(); assert_eq!(5, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("+01:00".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("+01:00".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_colon() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("+0100".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("+0100".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_minutes() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("+01".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("+01".to_string()); let b = hour(&a).unwrap(); assert_eq!(11, b.value(0)); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_without_initial_sign() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("0100".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("0100".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } #[test] fn test_temporal_array_timestamp_hour_with_timezone_with_only_colon() { - let a = TimestampSecondArray::from(vec![60 * 60 * 10]) - .with_timezone("01:00".to_string()); + let a = TimestampSecondArray::from(vec![60 * 60 * 10]).with_timezone("01:00".to_string()); let err = hour(&a).unwrap_err().to_string(); assert!(err.contains("Invalid timezone"), "{}", err); } @@ -960,10 +927,8 @@ mod tests { let b = hour_dyn(&dict).unwrap(); - let expected_dict = DictionaryArray::new( - keys.clone(), - Arc::new(Int32Array::from(vec![11, 21, 7])), - ); + let expected_dict = + DictionaryArray::new(keys.clone(), Arc::new(Int32Array::from(vec![11, 21, 7]))); let expected = Arc::new(expected_dict) as ArrayRef; assert_eq!(&expected, &b); @@ -987,8 +952,7 @@ mod tests { assert_eq!(&expected, &b); assert_eq!(&expected, &b_old); - let b = - time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); + let b = time_fraction_dyn(&dict, "nanosecond", |t| t.nanosecond() as i32).unwrap(); let expected_dict = DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![0, 0, 0, 0, 0]))); @@ -998,8 +962,7 @@ mod tests { #[test] fn test_year_dictionary_array() { - let a: PrimitiveArray<Date64Type> = - vec![Some(1514764800000), Some(1550636625000)].into(); + let a: PrimitiveArray<Date64Type> = vec![Some(1514764800000), Some(1550636625000)].into(); let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); @@ -1018,24 +981,20 @@ mod tests { fn test_quarter_month_dictionary_array() { //1514764800000 -> 2018-01-01 //1566275025000 -> 2019-08-20 - let a: PrimitiveArray<Date64Type> = -
vec![Some(1514764800000), Some(1566275025000)].into(); + let a: PrimitiveArray<Date64Type> = vec![Some(1514764800000), Some(1566275025000)].into(); let keys = Int8Array::from_iter_values([0_i8, 1, 1, 0]); let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); let b = quarter_dyn(&dict).unwrap(); - let expected = DictionaryArray::new( - keys.clone(), - Arc::new(Int32Array::from(vec![1, 3, 3, 1])), - ); + let expected = + DictionaryArray::new(keys.clone(), Arc::new(Int32Array::from(vec![1, 3, 3, 1]))); assert_eq!(b.as_ref(), &expected); let b = month_dyn(&dict).unwrap(); - let expected = - DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![1, 8, 8, 1]))); + let expected = DictionaryArray::new(keys, Arc::new(Int32Array::from(vec![1, 8, 8, 1]))); assert_eq!(b.as_ref(), &expected); } @@ -1043,8 +1002,7 @@ mod tests { fn test_num_days_from_monday_sunday_day_doy_week_dictionary_array() { //1514764800000 -> 2018-01-01 (Monday) //1550636625000 -> 2019-02-20 (Wednesday) - let a: PrimitiveArray<Date64Type> = - vec![Some(1514764800000), Some(1550636625000)].into(); + let a: PrimitiveArray<Date64Type> = vec![Some(1514764800000), Some(1550636625000)].into(); let keys = Int8Array::from(vec![Some(0_i8), Some(1), Some(1), Some(0), None]); let dict = DictionaryArray::new(keys.clone(), Arc::new(a)); diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 80a6eb3f541e..4f7ab24f9708 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -44,7 +44,7 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrow-buffer = { workspace = true } arrow-schema = { workspace = true } arrow-data = { workspace = true } -chrono = { version = "0.4.24", default-features = false, features = ["clock"] } +chrono = { workspace = true } chrono-tz = { version = "0.8", optional = true } num = { version = "0.4.1", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index b0ecef70ee19..c9be39d44144 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -229,10 +229,7 @@ macro_rules! native_type_op { #[inline] fn pow_checked(self, exp: u32) -> Result<Self, ArrowError> { self.checked_pow(exp).ok_or_else(|| { - ArrowError::ComputeError(format!( - "Overflow happened on: {:?} ^ {exp:?}", - self - )) + ArrowError::ComputeError(format!("Overflow happened on: {:?} ^ {exp:?}", self)) }) } diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 75880bec30ce..6b18cbc2d9f7 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -16,9 +16,7 @@ // under the License.
use crate::types::{ByteArrayType, GenericBinaryType}; -use crate::{ - Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait, -}; +use crate::{Array, GenericByteArray, GenericListArray, GenericStringArray, OffsetSizeTrait}; use arrow_data::ArrayData; use arrow_schema::DataType; @@ -102,9 +100,7 @@ impl GenericBinaryArray { } } -impl From>> - for GenericBinaryArray -{ +impl From>> for GenericBinaryArray { fn from(v: Vec>) -> Self { Self::from_opt_vec(v) } @@ -376,9 +372,11 @@ mod tests { .unwrap(); let binary_array1 = GenericBinaryArray::::from(array_data1); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, false), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + false, + ))); let array_data2 = ArrayData::builder(data_type) .len(3) @@ -423,9 +421,11 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, false), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + false, + ))); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -456,9 +456,7 @@ mod tests { _test_generic_binary_array_from_list_array_with_offset::(); } - fn _test_generic_binary_array_from_list_array_with_child_nulls_failed< - O: OffsetSizeTrait, - >() { + fn _test_generic_binary_array_from_list_array_with_child_nulls_failed() { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt8) .len(10) @@ -468,9 +466,11 @@ mod tests { .unwrap(); let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, true), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + true, + ))); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -558,8 +558,7 @@ mod tests { .unwrap(); let offsets: [i32; 4] = [0, 5, 5, 12]; - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::UInt32, false))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::UInt32, false))); let array_data = ArrayData::builder(data_type) .len(3) .add_buffer(Buffer::from_slice_ref(offsets)) @@ -575,8 +574,7 @@ mod tests { expected = "Trying to access an element at index 4 from a BinaryArray of length 3" )] fn test_binary_array_get_value_index_out_of_bound() { - let values: [u8; 12] = - [104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116]; + let values: [u8; 12] = [104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116]; let offsets: [i32; 4] = [0, 5, 5, 12]; let array_data = ArrayData::builder(DataType::Binary) .len(3) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 4d19babe3e4b..a778dc92ea35 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -238,11 +238,7 @@ impl BooleanArray { /// /// This function panics if left and right are not the same length /// - pub fn from_binary( - left: T, - right: S, - mut op: F, - ) -> Self + pub fn from_binary(left: T, right: S, mut op: F) -> Self where F: FnMut(T::Item, S::Item) -> bool, { @@ -362,8 +358,7 @@ impl From for BooleanArray { 1, "BooleanArray data should 
contain a single buffer only (values buffer)" ); - let values = - BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); Self { values, @@ -591,9 +586,7 @@ mod tests { } #[test] - #[should_panic( - expected = "BooleanArray expected ArrayData with type Boolean got Int32" - )] + #[should_panic(expected = "BooleanArray expected ArrayData with type Boolean got Int32")] fn test_from_array_data_validation() { let _ = BooleanArray::from(ArrayData::new_empty(&DataType::Int32)); } diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 37d8de931e99..db825bbea97d 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -197,8 +197,7 @@ impl GenericByteArray { let (_, data_len) = iter.size_hint(); let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); + let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::()); offsets.push(T::Offset::usize_as(0)); let mut values = MutableBuffer::new(0); @@ -335,8 +334,7 @@ impl GenericByteArray { /// offset and data buffers are not shared by others. pub fn into_builder(self) -> Result, Self> { let len = self.len(); - let value_len = - T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); + let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]); let data = self.into_data(); let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); @@ -578,17 +576,14 @@ mod tests { let nulls = NullBuffer::new_null(3); let err = - StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())) - .unwrap_err(); + StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone())).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3"); - let err = - BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err(); + let err = BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3"); let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld"); - let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None) - .unwrap_err(); + let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2"); BinaryArray::new(offsets, non_utf8_data, None); @@ -611,8 +606,7 @@ mod tests { BinaryArray::new(offsets, non_ascii_data.clone(), None); let offsets = OffsetBuffer::new(vec![0, 3, 10].into()); - let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None) - .unwrap_err(); + let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None).unwrap_err(); assert_eq!( err.to_string(), "Invalid argument error: Split UTF-8 codepoint at offset 3" diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 5896cf02dfaa..1f4d83b1c5d0 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -286,10 +286,7 @@ impl DictionaryArray { /// # Errors /// /// Returns an error if any `keys[i] >= values.len() || keys[i] < 
0` - pub fn try_new( - keys: PrimitiveArray, - values: ArrayRef, - ) -> Result { + pub fn try_new(keys: PrimitiveArray, values: ArrayRef) -> Result { let data_type = DataType::Dictionary( Box::new(keys.data_type().clone()), Box::new(values.data_type().clone()), @@ -298,9 +295,11 @@ impl DictionaryArray { let zero = K::Native::usize_as(0); let values_len = values.len(); - if let Some((idx, v)) = keys.values().iter().enumerate().find(|(idx, v)| { - (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx) - }) { + if let Some((idx, v)) = + keys.values().iter().enumerate().find(|(idx, v)| { + (v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx) + }) + { return Err(ArrowError::InvalidArgumentError(format!( "Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}", ))); @@ -349,8 +348,7 @@ impl DictionaryArray { /// /// Panics if `values` is not a [`StringArray`]. pub fn lookup_key(&self, value: &str) -> Option { - let rd_buf: &StringArray = - self.values.as_any().downcast_ref::().unwrap(); + let rd_buf: &StringArray = self.values.as_any().downcast_ref::().unwrap(); (0..rd_buf.len()) .position(|i| rd_buf.value(i) == value) @@ -463,10 +461,8 @@ impl DictionaryArray { /// pub fn with_values(&self, values: ArrayRef) -> Self { assert!(values.len() >= self.values.len()); - let data_type = DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - ); + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); Self { data_type, keys: self.keys.clone(), @@ -477,9 +473,7 @@ impl DictionaryArray { /// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating /// its keys and values if the underlying data buffer is not shared by others. 
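// A hedged sketch of the key-bounds rule enforced by the `try_new` validation
// above: every non-null key must satisfy 0 <= key < values.len(), while null
// keys may hold arbitrary masked values. The helper name is illustrative.
use std::sync::Arc;
use arrow_array::types::Int32Type;
use arrow_array::{Array, ArrayRef, DictionaryArray, Int32Array, StringArray};

fn dictionary_key_bounds_sketch() {
    let values: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"]));

    // Valid: all non-null keys index into the two values
    let keys = Int32Array::from(vec![Some(0), Some(1), None]);
    let dict = DictionaryArray::<Int32Type>::try_new(keys, values.clone()).unwrap();
    assert_eq!(dict.len(), 3);

    // Invalid: key 3 is out of bounds for a 2-element values array
    let keys = Int32Array::from(vec![Some(3)]);
    assert!(DictionaryArray::<Int32Type>::try_new(keys, values).is_err());
}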
- pub fn into_primitive_dict_builder( - self, - ) -> Result, Self> + pub fn into_primitive_dict_builder(self) -> Result, Self> where V: ArrowPrimitiveType, { @@ -540,8 +534,7 @@ impl DictionaryArray { V: ArrowPrimitiveType, F: Fn(V::Native) -> V::Native, { - let mut builder: PrimitiveDictionaryBuilder = - self.into_primitive_dict_builder()?; + let mut builder: PrimitiveDictionaryBuilder = self.into_primitive_dict_builder()?; builder .values_slice_mut() .iter_mut() @@ -806,9 +799,7 @@ impl<'a, K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'a, K, V> impl<'a, K: ArrowDictionaryKeyType, V> Copy for TypedDictionaryArray<'a, K, V> {} -impl<'a, K: ArrowDictionaryKeyType, V> std::fmt::Debug - for TypedDictionaryArray<'a, K, V> -{ +impl<'a, K: ArrowDictionaryKeyType, V> std::fmt::Debug for TypedDictionaryArray<'a, K, V> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { writeln!(f, "TypedDictionaryArray({:?})", self.dictionary) } @@ -1005,7 +996,7 @@ impl AnyDictionaryArray for DictionaryArray { let v_len = self.values().len(); assert_ne!(v_len, 0); let iter = self.keys().values().iter(); - iter.map(|x| x.as_usize().min(v_len)).collect() + iter.map(|x| x.as_usize().min(v_len - 1)).collect() } fn with_values(&self, values: ArrayRef) -> ArrayRef { @@ -1040,8 +1031,7 @@ mod tests { // Construct a dictionary array from the above two let key_type = DataType::Int16; let value_type = DataType::Int8; - let dict_data_type = - DataType::Dictionary(Box::new(key_type), Box::new(value_type)); + let dict_data_type = DataType::Dictionary(Box::new(key_type), Box::new(value_type)); let dict_data = ArrayData::builder(dict_data_type.clone()) .len(3) .add_buffer(keys.clone()) @@ -1079,8 +1069,7 @@ mod tests { #[test] fn test_dictionary_array_fmt_debug() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); @@ -1090,8 +1079,7 @@ mod tests { format!("{array:?}") ); - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(20, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(20, 2); for _ in 0..20 { builder.append(1).unwrap(); } @@ -1267,9 +1255,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2" - )] + #[should_panic(expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2")] fn test_try_new_index_too_large() { let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); // dictionary only has 2 values, so offset 3 is out of bounds @@ -1278,9 +1264,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2" - )] + #[should_panic(expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2")] fn test_try_new_index_too_small() { let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); let keys: Int32Array = [Some(-100)].into_iter().collect(); @@ -1288,9 +1272,7 @@ mod tests { } #[test] - #[should_panic( - expected = "DictionaryArray's data type must match, expected Int64 got Int32" - )] + #[should_panic(expected = "DictionaryArray's data type must match, expected Int64 got Int32")] fn test_from_array_data_validation() { let a = DictionaryArray::::from_iter(["32"]); let _ = DictionaryArray::::from(a.into_data()); @@ -1335,8 +1317,7 @@ mod tests { let boxed: ArrayRef = Arc::new(dict_array); - let 
col: DictionaryArray = - DictionaryArray::::from(boxed.to_data()); + let col: DictionaryArray = DictionaryArray::::from(boxed.to_data()); let err = col.into_primitive_dict_builder::(); let returned = err.unwrap_err(); @@ -1385,4 +1366,13 @@ mod tests { .collect(); assert_eq!(values, &[Some(50), None, None, Some(2)]) } + + #[test] + fn test_normalized_keys() { + let values = vec![132, 0, 1].into(); + let nulls = NullBuffer::from(vec![false, true, true]); + let keys = Int32Array::new(values, Some(nulls)); + let dictionary = DictionaryArray::new(keys, Arc::new(Int32Array::new_null(2))); + assert_eq!(&dictionary.normalized_keys(), &[1, 0, 1]) + } } diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 74a7c4c7a84a..d89bbd5ad084 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -81,10 +81,7 @@ impl FixedSizeBinaryArray { ) -> Result { let data_type = DataType::FixedSizeBinary(size); let s = size.to_usize().ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Size cannot be negative, got {}", - size - )) + ArrowError::InvalidArgumentError(format!("Size cannot be negative, got {}", size)) })?; let len = values.len() / s; @@ -179,9 +176,18 @@ impl FixedSizeBinaryArray { self.value_length } - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.value_data.clone() + /// Returns the values of this array. + /// + /// Unlike [`Self::value_data`] this returns the [`Buffer`] + /// allowing for zero-copy cloning. + #[inline] + pub fn values(&self) -> &Buffer { + &self.value_data + } + + /// Returns the raw value data. + pub fn value_data(&self) -> &[u8] { + self.value_data.as_slice() } /// Returns a zero-copy slice of this array with the indicated offset and length. @@ -324,10 +330,7 @@ impl FixedSizeBinaryArray { /// # Errors /// /// Returns error if argument has length zero, or sizes of nested slices don't match. 
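// A small sketch of the fallible constructor documented above (and whose
// signature follows): because the element width is passed explicitly, an
// input containing Nones still produces a well-typed FixedSizeBinary(2)
// array. The helper name is illustrative.
use arrow_array::{Array, FixedSizeBinaryArray};

fn sparse_iter_with_size_sketch() {
    let input = vec![Some(vec![7u8, 8]), None, Some(vec![9, 10])];
    let arr =
        FixedSizeBinaryArray::try_from_sparse_iter_with_size(input.into_iter(), 2).unwrap();
    assert_eq!(arr.value_length(), 2);
    assert_eq!(arr.len(), 3);
    assert!(arr.is_null(1));
}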
- pub fn try_from_sparse_iter_with_size( - mut iter: T, - size: i32, - ) -> Result + pub fn try_from_sparse_iter_with_size(mut iter: T, size: i32) -> Result where T: Iterator>, U: AsRef<[u8]>, @@ -803,8 +806,7 @@ mod tests { let none_option: Option<[u8; 32]> = None; let input_arg = vec![none_option, none_option, none_option]; #[allow(deprecated)] - let arr = - FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); + let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); assert_eq!(0, arr.value_length()); assert_eq!(3, arr.len()) } @@ -819,16 +821,12 @@ mod tests { Some(vec![13, 14]), ]; #[allow(deprecated)] - let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()) - .unwrap(); + let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.iter().cloned()).unwrap(); assert_eq!(2, arr.value_length()); assert_eq!(5, arr.len()); - let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( - input_arg.into_iter(), - 2, - ) - .unwrap(); + let arr = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 2).unwrap(); assert_eq!(2, arr.value_length()); assert_eq!(5, arr.len()); } @@ -837,11 +835,8 @@ mod tests { fn test_fixed_size_binary_array_from_sparse_iter_with_size_all_none() { let input_arg = vec![None, None, None, None, None] as Vec>>; - let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size( - input_arg.into_iter(), - 16, - ) - .unwrap(); + let arr = FixedSizeBinaryArray::try_from_sparse_iter_with_size(input_arg.into_iter(), 16) + .unwrap(); assert_eq!(16, arr.value_length()); assert_eq!(5, arr.len()) } @@ -908,8 +903,7 @@ mod tests { fn fixed_size_binary_array_all_null() { let data = vec![None] as Vec>; let array = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0) - .unwrap(); + FixedSizeBinaryArray::try_from_sparse_iter_with_size(data.into_iter(), 0).unwrap(); array .into_data() .validate_full() @@ -919,8 +913,7 @@ mod tests { #[test] // Test for https://github.com/apache/arrow-rs/issues/1390 fn fixed_size_binary_array_all_null_in_batch_with_schema() { - let schema = - Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); + let schema = Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); let none_option: Option<[u8; 2]> = None; let item = FixedSizeBinaryArray::try_from_sparse_iter_with_size( diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index db3ccbe0617b..f8f01516e3d4 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -130,12 +130,7 @@ impl FixedSizeListArray { /// # Panics /// /// Panics if [`Self::try_new`] returns an error - pub fn new( - field: FieldRef, - size: i32, - values: ArrayRef, - nulls: Option, - ) -> Self { + pub fn new(field: FieldRef, size: i32, values: ArrayRef, nulls: Option) -> Self { Self::try_new(field, size, values, nulls).unwrap() } @@ -154,10 +149,7 @@ impl FixedSizeListArray { nulls: Option, ) -> Result { let s = size.to_usize().ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Size cannot be negative, got {}", - size - )) + ArrowError::InvalidArgumentError(format!("Size cannot be negative, got {}", size)) })?; let len = values.len() / s.max(1); @@ -350,9 +342,8 @@ impl From for FixedSizeListArray { }; let size = value_length as usize; - let values = make_array( - data.child_data()[0].slice(data.offset() * size, data.len() * size), - ); + let values 
= + make_array(data.child_data()[0].slice(data.offset() * size, data.len() * size)); Self { data_type: data.data_type().clone(), values, @@ -483,10 +474,8 @@ mod tests { .unwrap(); // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_child_data(value_data.clone()) @@ -538,10 +527,8 @@ mod tests { .unwrap(); // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 3, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -569,10 +556,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 4); // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 2, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data.clone()) @@ -611,9 +596,7 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_fixed_size_list_array_index_out_of_bound() { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -631,10 +614,8 @@ mod tests { bit_util::set_bit(&mut null_bits, 4); // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, false)), - 2, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 2); let list_data = ArrayData::builder(list_data_type) .len(5) .add_child_data(value_data) @@ -668,8 +649,7 @@ mod tests { let list = FixedSizeListArray::new(field.clone(), 4, values.clone(), None); assert_eq!(list.len(), 1); - let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None) - .unwrap_err(); + let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None).unwrap_err(); assert_eq!( err.to_string(), "Invalid argument error: Size cannot be negative, got -1" @@ -679,13 +659,11 @@ mod tests { assert_eq!(list.len(), 6); let nulls = NullBuffer::new_null(2); - let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)) - .unwrap_err(); + let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls)).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2"); let field = Arc::new(Field::new("item", DataType::Int32, false)); - let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None) - .unwrap_err(); + let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None).unwrap_err(); assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\""); // Valid as nulls in child masked by parent diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 
e36d0ac4434f..9758c112a1ef 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -372,9 +372,8 @@ impl GenericListArray { impl From for GenericListArray { fn from(data: ArrayData) -> Self { - Self::try_new_from_array_data(data).expect( - "Expected infallible creation of GenericListArray from ArrayDataRef failed", - ) + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListArray from ArrayDataRef failed") } } @@ -391,17 +390,14 @@ impl From> for ArrayDa } } -impl From - for GenericListArray -{ +impl From for GenericListArray { fn from(value: FixedSizeListArray) -> Self { let (field, size) = match value.data_type() { DataType::FixedSizeList(f, size) => (f, *size as usize), _ => unreachable!(), }; - let offsets = - OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len())); Self { data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), @@ -415,9 +411,10 @@ impl From impl GenericListArray { fn try_new_from_array_data(data: ArrayData) -> Result { if data.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - format!("ListArray data should contain a single buffer only (value offsets), had {}", - data.buffers().len()))); + return Err(ArrowError::InvalidArgumentError(format!( + "ListArray data should contain a single buffer only (value offsets), had {}", + data.buffers().len() + ))); } if data.child_data().len() != 1 { @@ -593,8 +590,7 @@ mod tests { let value_offsets = Buffer::from([]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(0) .add_buffer(value_offsets) @@ -620,8 +616,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type.clone()) .len(3) .add_buffer(value_offsets.clone()) @@ -807,8 +802,7 @@ mod tests { bit_util::set_bit(&mut null_bits, 8); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(9) .add_buffer(value_offsets) @@ -839,8 +833,7 @@ mod tests { } // Check offset and length for each non-null value. 
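// A hedged sketch of the offsets/length bookkeeping these list tests
// exercise, using the from_iter_primitive convenience constructor rather
// than raw ArrayData: a list of len n stores n + 1 offsets, and
// value_length(i) is the delta between consecutive offsets. The helper
// name is illustrative.
use arrow_array::types::Int32Type;
use arrow_array::{Array, ListArray};

fn list_offsets_sketch() {
    let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
        Some(vec![Some(0), Some(1)]),
        None,
        Some(vec![Some(2), Some(3), Some(4)]),
    ]);
    assert_eq!(list.value_offsets(), &[0, 2, 2, 5]);
    assert_eq!(list.value_length(2), 3);
    assert!(list.is_null(1));
}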
- let sliced_list_array = - sliced_array.as_any().downcast_ref::().unwrap(); + let sliced_list_array = sliced_array.as_any().downcast_ref::().unwrap(); assert_eq!(2, sliced_list_array.value_offsets()[2]); assert_eq!(2, sliced_list_array.value_length(2)); assert_eq!(4, sliced_list_array.value_offsets()[3]); @@ -951,9 +944,7 @@ mod tests { list_array.value(10); } #[test] - #[should_panic( - expected = "ListArray data should contain a single buffer only (value offsets)" - )] + #[should_panic(expected = "ListArray data should contain a single buffer only (value offsets)")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -964,8 +955,7 @@ mod tests { .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) .build_unchecked() }; - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -976,16 +966,13 @@ mod tests { } #[test] - #[should_panic( - expected = "ListArray should contain a single child array (values array)" - )] + #[should_panic(expected = "ListArray should contain a single child array (values array)")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .len(3) @@ -996,9 +983,7 @@ mod tests { } #[test] - #[should_panic( - expected = "[Large]ListArray's datatype must be [Large]ListArray(). It is List" - )] + #[should_panic(expected = "[Large]ListArray's datatype must be [Large]ListArray(). 
It is List")] fn test_from_array_data_validation() { let mut builder = ListBuilder::new(Int32Builder::new()); builder.values().append_value(1); @@ -1017,8 +1002,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -1033,9 +1017,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Memory pointer is not aligned with the specified scalar type" - )] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -1051,9 +1033,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Memory pointer is not aligned with the specified scalar type" - )] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] // Different error messages, so skip for now // https://github.com/apache/arrow-rs/issues/1545 #[cfg(not(feature = "force_validate"))] @@ -1068,8 +1048,7 @@ mod tests { .build_unchecked() }; - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let list_data = unsafe { ArrayData::builder(list_data_type) .add_buffer(buf2) @@ -1187,9 +1166,8 @@ mod tests { let nulls = NullBuffer::new_null(3); let offsets = OffsetBuffer::new(vec![0, 1, 2, 4, 5].into()); - let err = - LargeListArray::try_new(field, offsets.clone(), values.clone(), Some(nulls)) - .unwrap_err(); + let err = LargeListArray::try_new(field, offsets.clone(), values.clone(), Some(nulls)) + .unwrap_err(); assert_eq!( err.to_string(), @@ -1197,9 +1175,8 @@ mod tests { ); let field = Arc::new(Field::new("element", DataType::Int64, false)); - let err = - LargeListArray::try_new(field.clone(), offsets.clone(), values.clone(), None) - .unwrap_err(); + let err = LargeListArray::try_new(field.clone(), offsets.clone(), values.clone(), None) + .unwrap_err(); assert_eq!( err.to_string(), @@ -1210,8 +1187,8 @@ mod tests { let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); let values = Arc::new(values); - let err = LargeListArray::try_new(field, offsets.clone(), values.clone(), None) - .unwrap_err(); + let err = + LargeListArray::try_new(field, offsets.clone(), values.clone(), None).unwrap_err(); assert_eq!( err.to_string(), @@ -1222,8 +1199,7 @@ mod tests { LargeListArray::new(field.clone(), offsets.clone(), values, None); let values = Int64Array::new(vec![0; 2].into(), None); - let err = - LargeListArray::try_new(field, offsets, Arc::new(values), None).unwrap_err(); + let err = LargeListArray::try_new(field, offsets, Arc::new(values), None).unwrap_err(); assert_eq!( err.to_string(), diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index fca49cd7836f..bde7fdd5a953 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -17,9 +17,7 @@ use crate::array::{get_offsets, print_long_array}; use crate::iterator::MapArrayIter; -use crate::{ - make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray, -}; +use crate::{make_array, Array, ArrayAccessor, ArrayRef, ListArray, StringArray, StructArray}; use 
arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType, Field, FieldRef}; @@ -264,9 +262,10 @@ impl MapArray { } if data.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - format!("MapArray data should contain a single buffer only (value offsets), had {}", - data.len()))); + return Err(ArrowError::InvalidArgumentError(format!( + "MapArray data should contain a single buffer only (value offsets), had {}", + data.len() + ))); } if data.child_data().len() != 1 { @@ -281,9 +280,9 @@ impl MapArray { if let DataType::Struct(fields) = entries.data_type() { if fields.len() != 2 { return Err(ArrowError::InvalidArgumentError(format!( - "MapArray should contain a struct array with 2 fields, have {} fields", - fields.len() - ))); + "MapArray should contain a struct array with 2 fields, have {} fields", + fields.len() + ))); } } else { return Err(ArrowError::InvalidArgumentError(format!( @@ -330,7 +329,7 @@ impl MapArray { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -477,7 +476,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -523,7 +522,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -576,8 +575,7 @@ mod tests { assert_eq!(2, map_array.value_length(1)); let key_array = Arc::new(Int32Array::from(vec![3, 4, 5])) as ArrayRef; - let value_array = - Arc::new(UInt32Array::from(vec![None, Some(40), None])) as ArrayRef; + let value_array = Arc::new(UInt32Array::from(vec![None, Some(40), None])) as ArrayRef; let struct_array = StructArray::from(vec![(keys_field, key_array), (values_field, value_array)]); assert_eq!( @@ -645,7 +643,7 @@ mod tests { Arc::new(Field::new( "entries", entry_struct.data_type().clone(), - true, + false, )), false, ); @@ -669,9 +667,7 @@ mod tests { } #[test] - #[should_panic( - expected = "MapArray expected ArrayData with DataType::Map got Dictionary" - )] + #[should_panic(expected = "MapArray expected ArrayData with DataType::Map got Dictionary")] fn test_from_array_data_validation() { // A DictionaryArray has similar buffer layout to a MapArray // but the meaning of the values differs @@ -692,12 +688,9 @@ mod tests { // [[a, b, c], [d, e, f], [g, h]] let entry_offsets = [0, 3, 6, 8]; - let map_array = MapArray::new_from_strings( - keys.clone().into_iter(), - &values_data, - &entry_offsets, - ) - .unwrap(); + let map_array = + MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets) + .unwrap(); assert_eq!( &values_data, @@ -768,9 +761,8 @@ mod tests { "Invalid argument error: Incorrect length of null buffer for MapArray, expected 4 got 3" ); - let err = - MapArray::try_new(field, offsets.clone(), entries.slice(0, 2), None, false) - .unwrap_err(); + let err = MapArray::try_new(field, offsets.clone(), entries.slice(0, 2), None, false) + .unwrap_err(); assert_eq!( err.to_string(), @@ -783,9 +775,7 @@ mod tests { .to_string(); assert!( - err.starts_with( - "Invalid argument error: MapArray expected data type Int64 got Struct" - ), + err.starts_with("Invalid argument error: MapArray expected data type Int64 got Struct"), "{err}" ); diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 905ec1e5431b..f19406c1610b 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -173,20 +173,22 @@ pub trait 
Array: std::fmt::Debug + Send + Sync { /// ``` fn offset(&self) -> usize; - /// Returns the null buffer of this array if any + /// Returns the null buffer of this array if any. /// - /// Note: some arrays can encode their nullability in their children, for example, + /// The null buffer encodes the "physical" nulls of an array. + /// However, some arrays can also encode nullability in their children, for example, /// [`DictionaryArray::values`] values or [`RunArray::values`], or without a null buffer, - /// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a computed mask encoding this + /// such as [`NullArray`]. To determine if each element of such an array is logically null, + /// you can use the slower [`Array::logical_nulls`] to obtain a computed mask. fn nulls(&self) -> Option<&NullBuffer>; - /// Returns the logical null buffer of this array if any + /// Returns a potentially computed [`NullBuffer`] that represents the logical null values of this array, if any. /// /// In most cases this will be the same as [`Array::nulls`], except for: /// - /// * DictionaryArray where [`DictionaryArray::values`] contains nulls - /// * RunArray where [`RunArray::values`] contains nulls - /// * NullArray where all indices are nulls + /// * [`DictionaryArray`] where [`DictionaryArray::values`] contains nulls + /// * [`RunArray`] where [`RunArray::values`] contains nulls + /// * [`NullArray`] where all indices are nulls /// /// In these cases a logical [`NullBuffer`] will be computed, encoding the logical nullability /// of these arrays, beyond what is encoded in [`Array::nulls`] @@ -194,31 +196,33 @@ pub trait Array: std::fmt::Debug + Send + Sync { self.nulls().cloned() } - /// Returns whether the element at `index` is null. - /// When using this function on a slice, the index is relative to the slice. + /// Returns whether the element at `index` is null according to [`Array::nulls`] /// - /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] - /// see [`Array::logical_nulls`] for logical nullability + /// Note: For performance reasons, this method returns nullability solely as determined by the + /// null buffer. This difference can lead to surprising results, for example, [`NullArray::is_null`] always + /// returns `false` as the array lacks a null buffer. Similarly [`DictionaryArray`] and [`RunArray`] may + /// encode nullability in their children. See [`Self::logical_nulls`] for more information. /// /// # Example: /// /// ``` - /// use arrow_array::{Array, Int32Array}; + /// use arrow_array::{Array, Int32Array, NullArray}; /// /// let array = Int32Array::from(vec![Some(1), None]); - /// /// assert_eq!(array.is_null(0), false); /// assert_eq!(array.is_null(1), true); + /// + /// // NullArrays do not have a null buffer, and therefore always + /// // return false for is_null. + /// let array = NullArray::new(1); + /// assert_eq!(array.is_null(0), false); /// ``` fn is_null(&self, index: usize) -> bool { self.nulls().map(|n| n.is_null(index)).unwrap_or_default() } - /// Returns whether the element at `index` is not null. - /// When using this function on a slice, the index is relative to the slice. - /// - /// Note: this method returns the physical nullability, i.e. that encoded in [`Array::nulls`] - /// see [`Array::logical_nulls`] for logical nullability + /// Returns whether the element at `index` is *not* null, the + /// opposite of [`Self::is_null`].
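// A hedged illustration of the physical-vs-logical distinction documented
// above: a dictionary whose values contain a null has no null buffer of its
// own, so is_null() reports false while logical_nulls() exposes the computed
// mask. The helper name is illustrative.
use std::sync::Arc;
use arrow_array::types::Int32Type;
use arrow_array::{Array, ArrayRef, DictionaryArray, Int32Array};

fn logical_nulls_sketch() {
    let values: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None]));
    let keys = Int32Array::from(vec![0, 1, 0]);
    let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();

    assert!(dict.nulls().is_none()); // no physical null buffer on the keys
    assert!(!dict.is_null(1));       // the physical check therefore sees no null

    let logical = dict.logical_nulls().expect("computed mask");
    assert!(logical.is_null(1));     // key 1 points at a null value
    assert_eq!(logical.null_count(), 1);
}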
/// /// # Example: /// @@ -532,9 +536,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Float64 => Arc::new(Float64Array::from(data)) as ArrayRef, DataType::Date32 => Arc::new(Date32Array::from(data)) as ArrayRef, DataType::Date64 => Arc::new(Date64Array::from(data)) as ArrayRef, - DataType::Time32(TimeUnit::Second) => { - Arc::new(Time32SecondArray::from(data)) as ArrayRef - } + DataType::Time32(TimeUnit::Second) => Arc::new(Time32SecondArray::from(data)) as ArrayRef, DataType::Time32(TimeUnit::Millisecond) => { Arc::new(Time32MillisecondArray::from(data)) as ArrayRef } @@ -579,9 +581,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { } DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef, DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as ArrayRef, - DataType::FixedSizeBinary(_) => { - Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef - } + DataType::FixedSizeBinary(_) => Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef, DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef, DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, @@ -589,50 +589,24 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, - DataType::FixedSizeList(_, _) => { - Arc::new(FixedSizeListArray::from(data)) as ArrayRef - } + DataType::FixedSizeList(_, _) => Arc::new(FixedSizeListArray::from(data)) as ArrayRef, DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { - DataType::Int8 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int16 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int32 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int64 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt8 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt16 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt32 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt64 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } + DataType::Int8 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::Int16 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::Int32 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::Int64 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt8 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt16 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt32 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, + DataType::UInt64 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, dt => panic!("Unexpected dictionary key type {dt:?}"), }, - DataType::RunEndEncoded(ref run_ends_type, _) => { - match run_ends_type.data_type() { - DataType::Int16 => { - Arc::new(RunArray::::from(data)) as ArrayRef - } - DataType::Int32 => { - Arc::new(RunArray::::from(data)) as ArrayRef - } - DataType::Int64 => { - Arc::new(RunArray::::from(data)) as ArrayRef - } - dt => panic!("Unexpected data type for run_ends array {dt:?}"), - } - } + DataType::RunEndEncoded(ref run_ends_type, _) => match run_ends_type.data_type() { + DataType::Int16 => 
Arc::new(RunArray::::from(data)) as ArrayRef, + DataType::Int32 => Arc::new(RunArray::::from(data)) as ArrayRef, + DataType::Int64 => Arc::new(RunArray::::from(data)) as ArrayRef, + dt => panic!("Unexpected data type for run_ends array {dt:?}"), + }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, @@ -683,11 +657,8 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { match data.is_empty() && data.buffers()[0].is_empty() { true => OffsetBuffer::new_empty(), false => { - let buffer = ScalarBuffer::new( - data.buffers()[0].clone(), - data.offset(), - data.len() + 1, - ); + let buffer = + ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1); // Safety: // ArrayData is valid unsafe { OffsetBuffer::new_unchecked(buffer) } @@ -696,11 +667,7 @@ unsafe fn get_offsets(data: &ArrayData) -> OffsetBuffer { } /// Helper function for printing potentially long arrays. -fn print_long_array( - array: &A, - f: &mut std::fmt::Formatter, - print_item: F, -) -> std::fmt::Result +fn print_long_array(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result where A: Array, F: Fn(&A, usize, &mut std::fmt::Formatter) -> std::fmt::Result, @@ -763,8 +730,7 @@ mod tests { #[test] fn test_empty_list_primitive() { - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, false))); let array = new_empty_array(&data_type); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 0); @@ -795,8 +761,7 @@ mod tests { fn test_null_struct() { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details - let struct_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); + let struct_type = DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); let array = new_null_array(&struct_type, 9); let a = array.as_any().downcast_ref::().unwrap(); @@ -823,8 +788,7 @@ mod tests { #[test] fn test_null_list_primitive() { - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let array = new_null_array(&data_type, 9); let a = array.as_any().downcast_ref::().unwrap(); assert_eq!(a.len(), 9); @@ -858,8 +822,8 @@ mod tests { #[test] fn test_null_dictionary() { - let values = vec![None, None, None, None, None, None, None, None, None] - as Vec>; + let values = + vec![None, None, None, None, None, None, None, None, None] as Vec>; let array: DictionaryArray = values.into_iter().collect(); let array = Arc::new(array) as ArrayRef; @@ -961,8 +925,7 @@ mod tests { #[test] fn test_memory_size_primitive() { let arr = PrimitiveArray::::from_iter_values(0..128); - let empty = - PrimitiveArray::::from(ArrayData::new_empty(arr.data_type())); + let empty = PrimitiveArray::::from(ArrayData::new_empty(arr.data_type())); // subtract empty array to avoid magic numbers for the size of additional fields assert_eq!( diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 4c07e81468aa..1112acacfcd9 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -917,8 +917,8 @@ impl PrimitiveArray { 
let null_bit_buffer = data.nulls().map(|b| b.inner().sliced()); let element_len = std::mem::size_of::(); - let buffer = data.buffers()[0] - .slice_with_length(data.offset() * element_len, len * element_len); + let buffer = + data.buffers()[0].slice_with_length(data.offset() * element_len, len * element_len); drop(data); @@ -1116,10 +1116,9 @@ impl std::fmt::Debug for PrimitiveArray { }, // if the time zone is invalid, shows NaiveDateTime with an error message Err(_) => match as_datetime::(v) { - Some(datetime) => write!( - f, - "{datetime:?} (Unknown Time Zone '{tz_string}')" - ), + Some(datetime) => { + write!(f, "{datetime:?} (Unknown Time Zone '{tz_string}')") + } None => write!(f, "null"), }, } @@ -1191,25 +1190,19 @@ def_from_for_primitive!(Float64Type, f64); def_from_for_primitive!(Decimal128Type, i128); def_from_for_primitive!(Decimal256Type, i256); -impl From::Native>> - for NativeAdapter -{ +impl From::Native>> for NativeAdapter { fn from(value: Option<::Native>) -> Self { NativeAdapter { native: value } } } -impl From<&Option<::Native>> - for NativeAdapter -{ +impl From<&Option<::Native>> for NativeAdapter { fn from(value: &Option<::Native>) -> Self { NativeAdapter { native: *value } } } -impl>> FromIterator - for PrimitiveArray -{ +impl>> FromIterator for PrimitiveArray { fn from_iter>(iter: I) -> Self { let iter = iter.into_iter(); let (lower, _) = iter.size_hint(); @@ -1265,15 +1258,8 @@ impl PrimitiveArray { let (null, buffer) = trusted_len_unzip(iterator); - let data = ArrayData::new_unchecked( - T::DATA_TYPE, - len, - None, - Some(null), - 0, - vec![buffer], - vec![], - ); + let data = + ArrayData::new_unchecked(T::DATA_TYPE, len, None, Some(null), 0, vec![buffer], vec![]); PrimitiveArray::from(data) } } @@ -1294,9 +1280,7 @@ macro_rules! def_numeric_from_vec { } // Constructs a primitive array from a vector. Should only be used for testing. - impl From::Native>>> - for PrimitiveArray<$ty> - { + impl From::Native>>> for PrimitiveArray<$ty> { fn from(data: Vec::Native>>) -> Self { PrimitiveArray::from_iter(data.iter()) } @@ -1392,8 +1376,7 @@ impl From for PrimitiveArray { "PrimitiveArray data should contain a single buffer only (values buffer)" ); - let values = - ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); Self { data_type: data.data_type().clone(), values, @@ -1407,11 +1390,7 @@ impl PrimitiveArray { /// specified precision and scale. 
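// A minimal sketch of the precision/scale validation referenced above:
// Decimal128 supports at most precision 38, so an out-of-range request is
// rejected, while a valid one re-types the same stored values. The helper
// name is illustrative.
use arrow_array::{Array, Decimal128Array};
use arrow_schema::DataType;

fn precision_and_scale_sketch() {
    let arr = Decimal128Array::from(vec![Some(12345), None])
        .with_precision_and_scale(10, 2)
        .unwrap();
    assert_eq!(arr.data_type(), &DataType::Decimal128(10, 2));

    // precision 76 is only valid for Decimal256, not Decimal128
    let err = Decimal128Array::from(vec![Some(1)]).with_precision_and_scale(76, 2);
    assert!(err.is_err());
}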
/// /// See [`validate_decimal_precision_and_scale`] - pub fn with_precision_and_scale( - self, - precision: u8, - scale: i8, - ) -> Result { + pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result { validate_decimal_precision_and_scale::(precision, scale)?; Ok(Self { data_type: T::TYPE_CONSTRUCTOR(precision, scale), @@ -1575,8 +1554,7 @@ mod tests { // 1: 00:00:00.001 // 37800005: 10:30:00.005 // 86399210: 23:59:59.210 - let arr: PrimitiveArray = - vec![1, 37_800_005, 86_399_210].into(); + let arr: PrimitiveArray = vec![1, 37_800_005, 86_399_210].into(); assert_eq!(3, arr.len()); assert_eq!(0, arr.offset()); assert_eq!(0, arr.null_count()); @@ -1858,11 +1836,7 @@ mod tests { #[test] fn test_timestamp_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", format!("{arr:?}") @@ -1872,12 +1846,8 @@ mod tests { #[test] fn test_timestamp_utc_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone_utc(); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone_utc(); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00+00:00,\n 2018-12-31T00:00:00+00:00,\n 1921-01-02T00:00:00+00:00,\n]", format!("{arr:?}") @@ -1888,12 +1858,8 @@ mod tests { #[cfg(feature = "chrono-tz")] fn test_timestamp_with_named_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("Asia/Taipei".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("Asia/Taipei".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", format!("{:?}", arr) @@ -1904,12 +1870,8 @@ mod tests { #[cfg(not(feature = "chrono-tz"))] fn test_timestamp_with_named_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("Asia/Taipei".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("Asia/Taipei".to_string()); println!("{arr:?}"); @@ -1922,12 +1884,8 @@ mod tests { #[test] fn test_timestamp_with_fixed_offset_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("+08:00".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("+08:00".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T08:00:00+08:00,\n 2018-12-31T08:00:00+08:00,\n 1921-01-02T08:00:00+08:00,\n]", format!("{arr:?}") @@ -1937,12 +1895,8 @@ mod tests { #[test] fn test_timestamp_with_incorrect_tz_fmt_debug() { let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1546214400000, - 1546214400000, - -1546214400000, - ]) - .with_timezone("xxx".to_string()); + TimestampMillisecondArray::from(vec![1546214400000, 1546214400000, -1546214400000]) + .with_timezone("xxx".to_string()); assert_eq!( "PrimitiveArray\n[\n 2018-12-31T00:00:00 (Unknown Time Zone 'xxx'),\n 2018-12-31T00:00:00 
(Unknown Time Zone 'xxx'),\n 1921-01-02T00:00:00 (Unknown Time Zone 'xxx'),\n]", format!("{arr:?}") @@ -1952,14 +1906,13 @@ mod tests { #[test] #[cfg(feature = "chrono-tz")] fn test_timestamp_with_tz_with_daylight_saving_fmt_debug() { - let arr: PrimitiveArray = - TimestampMillisecondArray::from(vec![ - 1647161999000, - 1647162000000, - 1667717999000, - 1667718000000, - ]) - .with_timezone("America/Denver".to_string()); + let arr: PrimitiveArray = TimestampMillisecondArray::from(vec![ + 1647161999000, + 1647162000000, + 1667717999000, + 1667718000000, + ]) + .with_timezone("America/Denver".to_string()); assert_eq!( "PrimitiveArray\n[\n 2022-03-13T01:59:59-07:00,\n 2022-03-13T03:00:00-06:00,\n 2022-11-06T00:59:59-06:00,\n 2022-11-06T01:00:00-06:00,\n]", format!("{:?}", arr) @@ -1997,8 +1950,7 @@ mod tests { #[test] fn test_timestamp_micros_out_of_range() { // replicate the issue from https://github.com/apache/arrow-datafusion/issues/3832 - let arr: PrimitiveArray = - vec![9065525203050843594].into(); + let arr: PrimitiveArray = vec![9065525203050843594].into(); assert_eq!( "PrimitiveArray\n[\n null,\n]", format!("{arr:?}") @@ -2143,8 +2095,7 @@ mod tests { #[test] fn test_decimal256() { - let values: Vec<_> = - vec![i256::ZERO, i256::ONE, i256::MINUS_ONE, i256::MIN, i256::MAX]; + let values: Vec<_> = vec![i256::ZERO, i256::ONE, i256::MINUS_ONE, i256::MIN, i256::MAX]; let array: PrimitiveArray = PrimitiveArray::from_iter(values.iter().copied()); @@ -2166,8 +2117,8 @@ mod tests { // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; // let val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; let values: [u8; 32] = [ - 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; let array_data = ArrayData::builder(DataType::Decimal128(38, 6)) .len(2) @@ -2232,8 +2183,7 @@ mod tests { #[test] fn test_decimal_from_iter() { - let array: Decimal128Array = - vec![Some(-100), None, Some(101)].into_iter().collect(); + let array: Decimal128Array = vec![Some(-100), None, Some(101)].into_iter().collect(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); assert_eq!(-100_i128, array.value(0)); @@ -2343,8 +2293,7 @@ mod tests { #[test] fn test_decimal_array_set_null_if_overflow_with_precision() { - let array = - Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]); + let array = Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]); let result = array.null_if_overflow_precision(5); let expected = Decimal128Array::from(vec![None, Some(123), None, None]); assert_eq!(result, expected); @@ -2361,8 +2310,7 @@ mod tests { let decimal2 = i256::from_i128(56789); builder.append_value(decimal2); - let array: Decimal256Array = - builder.finish().with_precision_and_scale(76, 6).unwrap(); + let array: Decimal256Array = builder.finish().with_precision_and_scale(76, 6).unwrap(); let collected: Vec<_> = array.iter().collect(); assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); @@ -2387,8 +2335,7 @@ mod tests { #[test] fn test_from_iter_decimal128array() { - let mut array: Decimal128Array = - vec![Some(-100), None, Some(101)].into_iter().collect(); + let mut array: Decimal128Array = vec![Some(-100), None, Some(101)].into_iter().collect(); 
array = array.with_precision_and_scale(38, 10).unwrap(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); @@ -2404,13 +2351,11 @@ mod tests { let array = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]); let r = array.unary_opt::<_, Int32Type>(|x| (x % 2 != 0).then_some(x)); - let expected = - Int32Array::from(vec![Some(1), None, Some(3), None, Some(5), None, Some(7)]); + let expected = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5), None, Some(7)]); assert_eq!(r, expected); let r = expected.unary_opt::<_, Int32Type>(|x| (x % 3 != 0).then_some(x)); - let expected = - Int32Array::from(vec![Some(1), None, None, None, Some(5), None, Some(7)]); + let expected = Int32Array::from(vec![Some(1), None, None, None, Some(5), None, Some(7)]); assert_eq!(r, expected); } @@ -2513,9 +2458,8 @@ mod tests { Int32Array::new(vec![1, 2, 3, 4].into(), None); Int32Array::new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(4))); - let err = - Int32Array::try_new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(3))) - .unwrap_err(); + let err = Int32Array::try_new(vec![1, 2, 3, 4].into(), Some(NullBuffer::new_null(3))) + .unwrap_err(); assert_eq!( err.to_string(), diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index ba6986c28463..4877f9f850a3 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -91,10 +91,7 @@ impl RunArray { /// Attempts to create RunArray using given run_ends (index where a run ends) /// and the values (value of the run). Returns an error if the given data is not compatible /// with RunEndEncoded specification. - pub fn try_new( - run_ends: &PrimitiveArray, - values: &dyn Array, - ) -> Result { + pub fn try_new(run_ends: &PrimitiveArray, values: &dyn Array) -> Result { let run_ends_type = run_ends.data_type().clone(); let values_type = values.data_type().clone(); let ree_array_type = DataType::RunEndEncoded( @@ -182,10 +179,7 @@ impl RunArray { /// scaled well for larger inputs. /// See for more details. #[inline] - pub fn get_physical_indices( - &self, - logical_indices: &[I], - ) -> Result, ArrowError> + pub fn get_physical_indices(&self, logical_indices: &[I]) -> Result, ArrowError> where I: ArrowNativeType, { @@ -211,8 +205,7 @@ impl RunArray { }); // Return early if all the logical indices cannot be converted to physical indices. - let largest_logical_index = - logical_indices[*ordered_indices.last().unwrap()].as_usize(); + let largest_logical_index = logical_indices[*ordered_indices.last().unwrap()].as_usize(); if largest_logical_index >= len { return Err(ArrowError::InvalidArgumentError(format!( "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {largest_logical_index}.", @@ -225,8 +218,7 @@ impl RunArray { let mut physical_indices = vec![0; indices_len]; let mut ordered_index = 0_usize; - for (physical_index, run_end) in - self.run_ends.values().iter().enumerate().skip(skip_value) + for (physical_index, run_end) in self.run_ends.values().iter().enumerate().skip(skip_value) { // Get the run end index (relative to offset) of current physical index let run_end_value = run_end.as_usize() - offset; @@ -234,8 +226,7 @@ impl RunArray { // All the `logical_indices` that are less than current run end index // belongs to current physical index. 
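// A hedged sketch of the mapping this loop computes: each logical index
// belongs to the first run whose run_end exceeds it. The helper name is
// illustrative.
use arrow_array::types::Int32Type;
use arrow_array::{Array, Int32Array, RunArray, StringArray};

fn physical_indices_sketch() {
    let run_ends = Int32Array::from(vec![2, 5]); // runs cover [0, 2) and [2, 5)
    let values = StringArray::from(vec!["a", "b"]);
    let ree = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();

    assert_eq!(ree.len(), 5);
    let physical = ree.get_physical_indices(&[0_u32, 1, 2, 4]).unwrap();
    assert_eq!(physical, vec![0, 0, 1, 1]);
}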
while ordered_index < indices_len - && logical_indices[ordered_indices[ordered_index]].as_usize() - < run_end_value + && logical_indices[ordered_indices[ordered_index]].as_usize() < run_end_value { physical_indices[ordered_indices[ordered_index]] = physical_index; ordered_index += 1; @@ -245,8 +236,7 @@ impl RunArray { // If there are input values >= run_ends.last_value then we'll not be able to convert // all logical indices to physical indices. if ordered_index < logical_indices.len() { - let logical_index = - logical_indices[ordered_indices[ordered_index]].as_usize(); + let logical_index = logical_indices[ordered_indices[ordered_index]].as_usize(); return Err(ArrowError::InvalidArgumentError(format!( "Cannot convert all logical indices to physical indices. The logical index cannot be converted is {logical_index}.", ))); @@ -704,8 +694,7 @@ mod tests { seed.shuffle(&mut rng); } // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays - let num = - max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); + let num = max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); for _ in 0..num { result.push(seed[ix]); } @@ -749,19 +738,16 @@ mod tests { #[test] fn test_run_array() { // Construct a value array - let value_data = PrimitiveArray::::from_iter_values([ - 10_i8, 11, 12, 13, 14, 15, 16, 17, - ]); + let value_data = + PrimitiveArray::::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]); // Construct a run_ends array: let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22]; - let run_ends_data = PrimitiveArray::::from_iter_values( - run_ends_values.iter().copied(), - ); + let run_ends_data = + PrimitiveArray::::from_iter_values(run_ends_values.iter().copied()); // Construct a run ends encoded array from the above two - let ree_array = - RunArray::::try_new(&run_ends_data, &value_data).unwrap(); + let ree_array = RunArray::::try_new(&run_ends_data, &value_data).unwrap(); assert_eq!(ree_array.len(), 22); assert_eq!(ree_array.null_count(), 0); @@ -872,8 +858,7 @@ mod tests { let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")] .into_iter() .collect(); - let run_ends: Int32Array = - [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); let array = RunArray::::try_new(&run_ends, &values).unwrap(); assert_eq!(array.values().data_type(), &DataType::Utf8); @@ -924,7 +909,10 @@ mod tests { let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect(); let actual = RunArray::::try_new(&run_ends, &values); - let expected = ArrowError::InvalidArgumentError("Found null values in run_ends array. The run_ends array should not have null values.".to_string()); + let expected = ArrowError::InvalidArgumentError( + "Found null values in run_ends array. The run_ends array should not have null values." 
+ .to_string(), + ); assert_eq!(expected.to_string(), actual.err().unwrap().to_string()); } @@ -1003,8 +991,7 @@ mod tests { let mut rng = thread_rng(); logical_indices.shuffle(&mut rng); - let physical_indices = - run_array.get_physical_indices(&logical_indices).unwrap(); + let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap(); assert_eq!(logical_indices.len(), physical_indices.len()); diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index cac4651f4496..9d266e0ca4b8 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -59,9 +59,7 @@ impl GenericStringArray { /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data - pub fn try_from_binary( - v: GenericBinaryArray, - ) -> Result { + pub fn try_from_binary(v: GenericBinaryArray) -> Result { let (offsets, values, nulls) = v.into_parts(); Self::try_new(offsets, values, nulls) } @@ -83,9 +81,7 @@ impl From> } } -impl From>> - for GenericStringArray -{ +impl From>> for GenericStringArray { fn from(v: Vec>) -> Self { v.into_iter().collect() } @@ -97,9 +93,7 @@ impl From> for GenericStringArray From>> - for GenericStringArray -{ +impl From>> for GenericStringArray { fn from(v: Vec>) -> Self { v.into_iter().collect() } @@ -438,13 +432,11 @@ mod tests { let expected: LargeStringArray = data.clone().into_iter().map(Some).collect(); // Iterator reports too many items - let arr = - LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); + let arr = LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone())); assert_eq!(expected, arr); // Iterator reports too few items - let arr = - LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); + let arr = LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone())); assert_eq!(expected, arr); } @@ -460,9 +452,11 @@ mod tests { let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); let null_buffer = Buffer::from_slice_ref([0b101]); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, false), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + false, + ))); // [None, Some("Parquet")] let array_data = ArrayData::builder(data_type) @@ -493,9 +487,7 @@ mod tests { _test_generic_string_array_from_list_array::(); } - fn _test_generic_string_array_from_list_array_with_child_nulls_failed< - O: OffsetSizeTrait, - >() { + fn _test_generic_string_array_from_list_array_with_child_nulls_failed() { let values = b"HelloArrow"; let child_data = ArrayData::builder(DataType::UInt8) .len(10) @@ -508,9 +500,11 @@ mod tests { // It is possible to create a null struct containing a non-nullable child // see https://github.com/apache/arrow-rs/pull/3244 for details - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt8, true), - )); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt8, + true, + ))); // [None, Some(b"Parquet")] let array_data = ArrayData::builder(data_type) @@ -544,9 +538,11 @@ mod tests { .unwrap(); let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap()); - let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new( - Field::new("item", DataType::UInt16, false), - )); + let data_type = 
GenericListArray::::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new( + "item", + DataType::UInt16, + false, + ))); let array_data = ArrayData::builder(data_type) .len(2) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 284c3b26a946..699da28cf7a3 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -197,6 +197,23 @@ impl StructArray { } } + /// Create a new [`StructArray`] containing no fields + /// + /// # Panics + /// + /// If `len != nulls.len()` + pub fn new_empty_fields(len: usize, nulls: Option) -> Self { + if let Some(n) = &nulls { + assert_eq!(len, n.len()) + } + Self { + len, + data_type: DataType::Struct(Fields::empty()), + fields: vec![], + nulls, + } + } + /// Deconstruct this array into its constituent parts pub fn into_parts(self) -> (Fields, Vec, Option) { let f = match self.data_type { @@ -445,9 +462,7 @@ impl Index<&str> for StructArray { mod tests { use super::*; - use crate::{ - BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray, - }; + use crate::{BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, StringArray}; use arrow_buffer::ToByteSlice; use std::sync::Arc; @@ -523,12 +538,10 @@ mod tests { None, Some("mark"), ])); - let ints: ArrayRef = - Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); let arr = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); + StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]).unwrap(); let struct_data = arr.into_data(); assert_eq!(4, struct_data.len()); @@ -561,13 +574,11 @@ mod tests { None, // 3 elements, not 4 ])); - let ints: ArrayRef = - Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); + let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); - let err = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap_err() - .to_string(); + let err = StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) + .unwrap_err() + .to_string(); assert_eq!( err, @@ -582,8 +593,7 @@ mod tests { fn test_struct_array_from_mismatched_types_single() { drop(StructArray::from(vec![( Arc::new(Field::new("b", DataType::Int16, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, )])); } @@ -595,8 +605,7 @@ mod tests { drop(StructArray::from(vec![ ( Arc::new(Field::new("b", DataType::Int16, false)), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, + Arc::new(BooleanArray::from(vec![false, false, true, true])) as Arc, ), ( Arc::new(Field::new("c", DataType::Utf8, false)), @@ -716,9 +725,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"" - )] + #[should_panic(expected = "Found unmasked nulls for non-nullable StructArray field \\\"c\\\"")] fn test_struct_array_from_mismatched_nullability() { drop(StructArray::from(vec![( Arc::new(Field::new("c", DataType::Int32, false)), diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 74a5f1efa767..94ac0bc879e4 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -179,8 +179,7 @@ impl UnionArray { if let Some(b) = &value_offsets { if ((type_ids.len()) * 4) != 
b.len() { return Err(ArrowError::InvalidArgumentError( - "Type Ids and Offsets represent a different number of array slots." - .to_string(), + "Type Ids and Offsets represent a different number of array slots.".to_string(), )); } } @@ -216,9 +215,8 @@ impl UnionArray { // Unsafe Justification: arguments were validated above (and // re-revalidated as part of data().validate() below) - let new_self = unsafe { - Self::new_unchecked(field_type_ids, type_ids, value_offsets, child_arrays) - }; + let new_self = + unsafe { Self::new_unchecked(field_type_ids, type_ids, value_offsets, child_arrays) }; new_self.to_data().validate()?; Ok(new_self) @@ -1059,7 +1057,13 @@ mod tests { let mut builder = UnionBuilder::new_sparse(); builder.append::("a", 1.0).unwrap(); let err = builder.append::("a", 1).unwrap_err().to_string(); - assert!(err.contains("Attempt to write col \"a\" with type Int32 doesn't match existing type Float32"), "{}", err); + assert!( + err.contains( + "Attempt to write col \"a\" with type Int32 doesn't match existing type Float32" + ), + "{}", + err + ); } #[test] diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 5f0013269677..7e59d940a50e 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -127,11 +127,7 @@ impl BooleanBuilder { /// /// Returns an error if the slices are of different lengths #[inline] - pub fn append_values( - &mut self, - values: &[bool], - is_valid: &[bool], - ) -> Result<(), ArrowError> { + pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<(), ArrowError> { if values.len() != is_valid.len() { Err(ArrowError::InvalidArgumentError( "Value and validity lengths must be equal".to_string(), @@ -250,8 +246,7 @@ mod tests { #[test] fn test_boolean_array_builder_append_slice() { - let arr1 = - BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); + let arr1 = BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); let mut builder = BooleanArray::builder(0); builder.append_slice(&[true, false]); diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index 01e4c1d4e217..2b66a8187fa9 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -45,11 +45,9 @@ pub type Float32BufferBuilder = BufferBuilder; pub type Float64BufferBuilder = BufferBuilder; /// Buffer builder for 128-bit decimal type. -pub type Decimal128BufferBuilder = - BufferBuilder<::Native>; +pub type Decimal128BufferBuilder = BufferBuilder<::Native>; /// Buffer builder for 256-bit decimal type. -pub type Decimal256BufferBuilder = - BufferBuilder<::Native>; +pub type Decimal256BufferBuilder = BufferBuilder<::Native>; /// Buffer builder for timestamp type of second unit. 
pub type TimestampSecondBufferBuilder = @@ -107,9 +105,7 @@ pub type DurationNanosecondBufferBuilder = #[cfg(test)] mod tests { - use crate::builder::{ - ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder, - }; + use crate::builder::{ArrayBuilder, Int32BufferBuilder, Int8Builder, UInt8BufferBuilder}; use crate::Array; #[test] diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 180150e988f3..0a50eb8a50e9 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -75,7 +75,8 @@ impl FixedSizeBinaryBuilder { pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<(), ArrowError> { if self.value_length != value.as_ref().len() as i32 { Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string() + "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" + .to_string(), )) } else { self.values_builder.append_slice(value.as_ref()); @@ -95,11 +96,10 @@ impl FixedSizeBinaryBuilder { /// Builds the [`FixedSizeBinaryArray`] and reset this builder. pub fn finish(&mut self) -> FixedSizeBinaryArray { let array_length = self.len(); - let array_data_builder = - ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) - .add_buffer(self.values_builder.finish()) - .nulls(self.null_buffer_builder.finish()) - .len(array_length); + let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(self.values_builder.finish()) + .nulls(self.null_buffer_builder.finish()) + .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) } @@ -108,11 +108,10 @@ impl FixedSizeBinaryBuilder { pub fn finish_cloned(&self) -> FixedSizeBinaryArray { let array_length = self.len(); let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice()); - let array_data_builder = - ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) - .add_buffer(values_buffer) - .nulls(self.null_buffer_builder.finish_cloned()) - .len(array_length); + let array_data_builder = ArrayData::builder(DataType::FixedSizeBinary(self.value_length)) + .add_buffer(values_buffer) + .nulls(self.null_buffer_builder.finish_cloned()) + .len(array_length); let array_data = unsafe { array_data_builder.build_unchecked() }; FixedSizeBinaryArray::from(array_data) } diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 41165208de55..3cde76c4a039 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -19,10 +19,7 @@ use crate::types::bytes::ByteArrayNativeType; use std::{any::Any, sync::Arc}; use crate::{ - types::{ - BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, - Utf8Type, - }, + types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type}, ArrayRef, ArrowPrimitiveType, RunArray, }; @@ -112,10 +109,7 @@ where pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self { Self { run_ends_builder: PrimitiveBuilder::with_capacity(capacity), - values_builder: GenericByteBuilder::::with_capacity( - capacity, - data_capacity, - ), + values_builder: GenericByteBuilder::::with_capacity(capacity, data_capacity), current_value: Vec::new(), has_current_value: false, 
current_run_end_index: 0, @@ -282,12 +276,13 @@ where } fn run_end_index_as_native(&self) -> R::Native { - R::Native::from_usize(self.current_run_end_index) - .unwrap_or_else(|| panic!( + R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| { + panic!( "Cannot convert the value {} from `usize` to native form of arrow datatype {}", self.current_run_end_index, R::DATA_TYPE - )) + ) + }) } } @@ -413,8 +408,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(*ava.value(0), *values[0]); assert!(ava.is_null(1)); @@ -459,8 +453,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(ava.value(0), values[0]); assert!(ava.is_null(1)); diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index d84be8c2fca6..2c7ee7a3e448 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -68,12 +68,8 @@ impl GenericByteBuilder { let value_builder = BufferBuilder::::new_from_buffer(value_buffer); let null_buffer_builder = null_buffer - .map(|buffer| { - NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1) - }) - .unwrap_or_else(|| { - NullBufferBuilder::new_with_len(offsets_builder.len() - 1) - }); + .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1)) + .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1)); Self { offsets_builder, @@ -84,8 +80,7 @@ impl GenericByteBuilder { #[inline] fn next_offset(&self) -> T::Offset { - T::Offset::from_usize(self.value_builder.len()) - .expect("byte array offset overflow") + T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow") } /// Appends a value into the builder. diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 282f423fa6d1..b0c722ae7cda 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -16,9 +16,7 @@ // under the License. 
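The run-end builder reformatted above collapses consecutive equal values into runs, materializing a run end each time the incoming value changes. A short sketch of the `StringRunBuilder` alias in action (illustrative only):

```rust
use arrow_array::builder::StringRunBuilder;
use arrow_array::cast::AsArray;
use arrow_array::types::Int16Type;
use arrow_array::Array;

fn main() {
    let mut builder = StringRunBuilder::<Int16Type>::new();
    builder.append_value("a");
    builder.append_value("a");
    builder.append_null();
    builder.append_value("b");
    let array = builder.finish();

    // Four logical values compress into three runs: "a" x2, null, "b".
    assert_eq!(array.len(), 4);
    assert_eq!(array.values().as_string::<i32>().len(), 3);
}
```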
use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; -use crate::types::{ - ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType, -}; +use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType}; use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray}; use arrow_buffer::ArrowNativeType; use arrow_schema::{ArrowError, DataType}; @@ -91,10 +89,7 @@ where state: Default::default(), dedup: Default::default(), keys_builder: PrimitiveBuilder::with_capacity(keys_capacity), - values_builder: GenericByteBuilder::::with_capacity( - value_capacity, - data_capacity, - ), + values_builder: GenericByteBuilder::::with_capacity(value_capacity, data_capacity), } } @@ -131,8 +126,7 @@ where let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ()); let values_len = dictionary_values.value_data().len(); - let mut values_builder = - GenericByteBuilder::::with_capacity(dict_len, values_len); + let mut values_builder = GenericByteBuilder::::with_capacity(dict_len, values_len); K::Native::from_usize(dictionary_values.len()) .ok_or(ArrowError::DictionaryKeyOverflowError)?; @@ -214,10 +208,7 @@ where /// value is appended to the values array. /// /// Returns an error if the new index would overflow the key type. - pub fn append( - &mut self, - value: impl AsRef, - ) -> Result { + pub fn append(&mut self, value: impl AsRef) -> Result { let value_native: &T::Native = value.as_ref(); let value_bytes: &[u8] = value_native.as_ref(); @@ -240,8 +231,7 @@ where state.hash_one(get_bytes(storage, *idx)) }); - K::Native::from_usize(idx) - .ok_or(ArrowError::DictionaryKeyOverflowError)? + K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)? } }; self.keys_builder.append_value(key); @@ -283,8 +273,7 @@ where let values = self.values_builder.finish(); let keys = self.keys_builder.finish(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); let builder = keys .into_data() @@ -300,8 +289,7 @@ where let values = self.values_builder.finish_cloned(); let keys = self.keys_builder.finish_cloned(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE)); let builder = keys .into_data() @@ -367,12 +355,10 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ /// assert_eq!(ava.value(1), "def"); /// /// ``` -pub type StringDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type StringDictionaryBuilder = GenericByteDictionaryBuilder>; /// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray) -pub type LargeStringDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type LargeStringDictionaryBuilder = GenericByteDictionaryBuilder>; /// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray) /// @@ -407,12 +393,10 @@ pub type LargeStringDictionaryBuilder = /// assert_eq!(ava.value(1), b"def"); /// /// ``` -pub type BinaryDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type BinaryDictionaryBuilder = GenericByteDictionaryBuilder>; /// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) -pub type LargeBinaryDictionaryBuilder = - GenericByteDictionaryBuilder>; +pub type LargeBinaryDictionaryBuilder = GenericByteDictionaryBuilder>; #[cfg(test)] mod tests { @@ -444,8 
+428,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(*ava.value(0), *values[0]); assert_eq!(*ava.value(1), *values[1]); @@ -483,8 +466,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert_eq!(ava.value(0), values[0]); assert_eq!(ava.value(1), values[1]); @@ -542,11 +524,8 @@ mod tests { ::Native: AsRef<::Native>, { let mut builder = - GenericByteDictionaryBuilder::::new_with_dictionary( - 6, - &dictionary, - ) - .unwrap(); + GenericByteDictionaryBuilder::::new_with_dictionary(6, &dictionary) + .unwrap(); builder.append(values[0]).unwrap(); builder.append_null(); builder.append(values[1]).unwrap(); @@ -562,8 +541,7 @@ mod tests { // Values are polymorphic and so require a downcast. let av = array.values(); - let ava: &GenericByteArray = - av.as_any().downcast_ref::>().unwrap(); + let ava: &GenericByteArray = av.as_any().downcast_ref::>().unwrap(); assert!(!ava.is_valid(0)); assert_eq!(ava.value(1), values[1]); @@ -597,11 +575,8 @@ mod tests { ::Native: AsRef<::Native>, { let mut builder = - GenericByteDictionaryBuilder::::new_with_dictionary( - 4, - &dictionary, - ) - .unwrap(); + GenericByteDictionaryBuilder::::new_with_dictionary(4, &dictionary) + .unwrap(); builder.append(values[0]).unwrap(); builder.append_null(); builder.append(values[1]).unwrap(); diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 5cc7f7b04e0a..21eaadd5208a 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -353,7 +353,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::builder::{Int32Builder, ListBuilder}; + use crate::builder::{make_builder, Int32Builder, ListBuilder}; use crate::cast::AsArray; use crate::types::Int32Type; use crate::{Array, Int32Array}; @@ -548,4 +548,204 @@ mod tests { assert_eq!(elements.null_count(), 1); assert!(elements.is_null(3)); } + + #[test] + fn test_boxed_primitive_aray_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_list_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. 
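Before the downcast-heavy boxed test below, the pattern in miniature: `make_builder` returns a `Box<dyn ArrayBuilder>`, and the new blanket `ArrayBuilder` impl for boxes (added further down, in `arrow-array/src/builder/mod.rs`) lets `ListBuilder` wrap it directly. A sketch assuming that impl:

```rust
use arrow_array::builder::{make_builder, ArrayBuilder, Int32Builder, ListBuilder};
use arrow_array::Array;
use arrow_schema::DataType;

fn main() {
    // Pick the child builder dynamically from a DataType.
    let child = make_builder(&DataType::Int32, 0);
    let mut list = ListBuilder::new(child);

    // The concrete child type is recovered by downcasting through Any.
    list.values()
        .as_any_mut()
        .downcast_mut::<Int32Builder>()
        .expect("should be an Int32Builder")
        .append_slice(&[1, 2, 3]);
    list.append(true);

    let array = list.finish();
    assert_eq!(array.len(), 1);
}
```

The repeated `as_any_mut().downcast_mut::<...>()` chains in the test below are the cost of this dynamism: the boxed builder erases the child type, so every access must re-assert it.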
+ let values_builder = make_builder( + &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + let mut builder = ListBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(false); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder.append(true); + + builder.append(false); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an ListBuilder") + .append(true); + builder.append(true); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5, 6]); + let l2 = l1.values().as_list::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8, 10]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 
10]); + } } diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 4e3ec4a7944d..3a5244ed81a0 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -86,11 +86,7 @@ impl Default for MapFieldNames { impl MapBuilder { /// Creates a new `MapBuilder` - pub fn new( - field_names: Option, - key_builder: K, - value_builder: V, - ) -> Self { + pub fn new(field_names: Option, key_builder: K, value_builder: V) -> Self { let capacity = key_builder.len(); Self::with_capacity(field_names, key_builder, value_builder, capacity) } @@ -243,12 +239,9 @@ mod tests { use super::*; #[test] - #[should_panic( - expected = "Keys array must have no null values, found 1 null value(s)" - )] + #[should_panic(expected = "Keys array must have no null values, found 1 null value(s)")] fn test_map_builder_with_null_keys_panics() { - let mut builder = - MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); + let mut builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new()); builder.keys().append_null(); builder.values().append_value(42); builder.append(true).unwrap(); diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 38a7500dd55f..8382f7af87b0 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -265,6 +265,36 @@ pub trait ArrayBuilder: Any + Send { fn into_box_any(self: Box) -> Box; } +impl ArrayBuilder for Box { + fn len(&self) -> usize { + (**self).len() + } + + fn is_empty(&self) -> bool { + (**self).is_empty() + } + + fn finish(&mut self) -> ArrayRef { + (**self).finish() + } + + fn finish_cloned(&self) -> ArrayRef { + (**self).finish_cloned() + } + + fn as_any(&self) -> &dyn Any { + (**self).as_any() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + (**self).as_any_mut() + } + + fn into_box_any(self: Box) -> Box { + self + } +} + /// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index b23d6bba36c4..0aad2dbfce0e 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -161,9 +161,7 @@ impl PrimitiveBuilder { let values_builder = BufferBuilder::::new_from_buffer(values_buffer); let null_buffer_builder = null_buffer - .map(|buffer| { - NullBufferBuilder::new_from_buffer(buffer, values_builder.len()) - }) + .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, values_builder.len())) .unwrap_or_else(|| NullBufferBuilder::new_with_len(values_builder.len())); Self { @@ -256,10 +254,7 @@ impl PrimitiveBuilder { /// This requires the iterator be a trusted length. This could instead require /// the iterator implement `TrustedLen` once that is stabilized. #[inline] - pub unsafe fn append_trusted_len_iter( - &mut self, - iter: impl IntoIterator, - ) { + pub unsafe fn append_trusted_len_iter(&mut self, iter: impl IntoIterator) { let iter = iter.into_iter(); let len = iter .size_hint() @@ -328,11 +323,7 @@ impl PrimitiveBuilder { impl PrimitiveBuilder
<P: DecimalType> { /// Sets the precision and scale - pub fn with_precision_and_scale( - self, - precision: u8, - scale: i8, - ) -> Result<Self, ArrowError> { + pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result<Self, ArrowError> { validate_decimal_precision_and_scale::<P>
<P: DecimalType> { /// Sets the precision and scale - pub fn with_precision_and_scale( - self, - precision: u8, - scale: i8, - ) -> Result<Self, ArrowError> { + pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result<Self, ArrowError> { validate_decimal_precision_and_scale::<P>
(precision, scale)?; Ok(Self { data_type: P::TYPE_CONSTRUCTOR(precision, scale), @@ -592,25 +583,21 @@ mod tests { #[test] fn test_primitive_array_builder_with_data_type() { - let mut builder = - Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + let mut builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); builder.append_value(1); let array = builder.finish(); assert_eq!(array.precision(), 1); assert_eq!(array.scale(), 2); let data_type = DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())); - let mut builder = - TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); + let mut builder = TimestampNanosecondBuilder::new().with_data_type(data_type.clone()); builder.append_value(1); let array = builder.finish(); assert_eq!(array.data_type(), &data_type); } #[test] - #[should_panic( - expected = "incompatible data type for builder, expected Int32 got Int64" - )] + #[should_panic(expected = "incompatible data type for builder, expected Int32 got Int64")] fn test_invalid_with_data_type() { Int32Builder::new().with_data_type(DataType::Int64); } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 7323ee57627d..a47b2d30d4f3 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -221,8 +221,7 @@ where let key = self.values_builder.len(); self.values_builder.append_value(value); vacant.insert(key); - K::Native::from_usize(key) - .ok_or(ArrowError::DictionaryKeyOverflowError)? + K::Native::from_usize(key).ok_or(ArrowError::DictionaryKeyOverflowError)? } Entry::Occupied(o) => K::Native::usize_as(*o.get()), }; @@ -266,10 +265,8 @@ where let values = self.values_builder.finish(); let keys = self.keys_builder.finish(); - let data_type = DataType::Dictionary( - Box::new(K::DATA_TYPE), - Box::new(values.data_type().clone()), - ); + let data_type = + DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone())); let builder = keys .into_data() @@ -285,8 +282,7 @@ where let values = self.values_builder.finish_cloned(); let keys = self.keys_builder.finish_cloned(); - let data_type = - DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); + let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(V::DATA_TYPE)); let builder = keys .into_data() @@ -331,8 +327,7 @@ mod tests { #[test] fn test_primitive_dictionary_builder() { - let mut builder = - PrimitiveDictionaryBuilder::::with_capacity(3, 2); + let mut builder = PrimitiveDictionaryBuilder::::with_capacity(3, 2); builder.append(12345678).unwrap(); builder.append_null(); builder.append(22345678).unwrap(); @@ -384,8 +379,7 @@ mod tests { #[test] fn test_primitive_dictionary_with_builders() { let keys_builder = PrimitiveBuilder::::new(); - let values_builder = - Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); + let values_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(1, 2)); let mut builder = PrimitiveDictionaryBuilder::::new_from_empty_builders( keys_builder, diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 0c878e621056..06b8385b3164 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -106,24 +106,18 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box Box::new(Float32Builder::with_capacity(capacity)), 
DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), DataType::Binary => Box::new(BinaryBuilder::with_capacity(capacity, 1024)), - DataType::LargeBinary => { - Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)) - } + DataType::LargeBinary => Box::new(LargeBinaryBuilder::with_capacity(capacity, 1024)), DataType::FixedSizeBinary(len) => { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } DataType::Decimal128(p, s) => Box::new( - Decimal128Builder::with_capacity(capacity) - .with_data_type(DataType::Decimal128(*p, *s)), + Decimal128Builder::with_capacity(capacity).with_data_type(DataType::Decimal128(*p, *s)), ), DataType::Decimal256(p, s) => Box::new( - Decimal256Builder::with_capacity(capacity) - .with_data_type(DataType::Decimal256(*p, *s)), + Decimal256Builder::with_capacity(capacity).with_data_type(DataType::Decimal256(*p, *s)), ), DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, 1024)), - DataType::LargeUtf8 => { - Box::new(LargeStringBuilder::with_capacity(capacity, 1024)) - } + DataType::LargeUtf8 => Box::new(LargeStringBuilder::with_capacity(capacity, 1024)), DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), DataType::Time32(TimeUnit::Second) => { @@ -175,19 +169,18 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(DurationNanosecondBuilder::with_capacity(capacity)) } - DataType::Struct(fields) => { - Box::new(StructBuilder::from_fields(fields.clone(), capacity)) + DataType::List(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListBuilder::with_capacity(builder, capacity)) } + DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), t => panic!("Data type {t:?} is not currently supported"), } } impl StructBuilder { /// Creates a new `StructBuilder` - pub fn new( - fields: impl Into, - field_builders: Vec>, - ) -> Self { + pub fn new(fields: impl Into, field_builders: Vec>) -> Self { Self { field_builders, fields: fields.into(), @@ -233,6 +226,9 @@ impl StructBuilder { /// Builds the `StructArray` and reset this builder. 
pub fn finish(&mut self) -> StructArray { self.validate_content(); + if self.fields.is_empty() { + return StructArray::new_empty_fields(self.len(), self.null_buffer_builder.finish()); + } let arrays = self.field_builders.iter_mut().map(|f| f.finish()).collect(); let nulls = self.null_buffer_builder.finish(); @@ -243,6 +239,13 @@ impl StructBuilder { pub fn finish_cloned(&self) -> StructArray { self.validate_content(); + if self.fields.is_empty() { + return StructArray::new_empty_fields( + self.len(), + self.null_buffer_builder.finish_cloned(), + ); + } + let arrays = self .field_builders .iter() @@ -508,14 +511,18 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) is not currently supported" + expected = "Data type Map(Field { name: \"entries\", data_type: Struct([Field { name: \"keys\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"values\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { - let list_type = - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); + let keys = Arc::new(Field::new("keys", DataType::Int32, false)); + let values = Arc::new(Field::new("values", DataType::UInt32, false)); + let struct_type = DataType::Struct(Fields::from(vec![keys, values])); + let map_data_type = + DataType::Map(Arc::new(Field::new("entries", struct_type, false)), false); + let fields = vec![ Field::new("f1", DataType::Int16, false), - Field::new("f2", list_type, false), + Field::new("f2", map_data_type, false), ]; let _ = StructBuilder::from_fields(fields, 5); @@ -558,9 +565,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Number of fields is not equal to the number of field_builders." - )] + #[should_panic(expected = "Number of fields is not equal to the number of field_builders.")] fn test_struct_array_builder_unequal_field_field_builders() { let int_builder = Int32Builder::with_capacity(10); @@ -591,4 +596,19 @@ mod tests { let mut sa = StructBuilder::new(fields, field_builders); sa.finish(); } + + #[test] + fn test_empty() { + let mut builder = StructBuilder::new(Fields::empty(), vec![]); + builder.append(true); + builder.append(false); + + let a1 = builder.finish_cloned(); + let a2 = builder.finish(); + assert_eq!(a1, a2); + assert_eq!(a1.len(), 2); + assert_eq!(a1.null_count(), 1); + assert!(a1.is_valid(0)); + assert!(a1.is_null(1)); + } } diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index f74afb2aa9aa..4f88c9d41b9a 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -65,11 +65,7 @@ impl FieldDataValues for BufferBuilder { impl FieldData { /// Creates a new `FieldData`. 
- fn new( - type_id: i8, - data_type: DataType, - capacity: usize, - ) -> Self { + fn new(type_id: i8, data_type: DataType, capacity: usize) -> Self { Self { type_id, data_type, @@ -222,7 +218,12 @@ impl UnionBuilder { let mut field_data = match self.fields.remove(&type_name) { Some(data) => { if data.data_type != T::DATA_TYPE { - return Err(ArrowError::InvalidArgumentError(format!("Attempt to write col \"{}\" with type {} doesn't match existing type {}", type_name, T::DATA_TYPE, data.data_type))); + return Err(ArrowError::InvalidArgumentError(format!( + "Attempt to write col \"{}\" with type {} doesn't match existing type {}", + type_name, + T::DATA_TYPE, + data.data_type + ))); } data } diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index b6cda44e8973..2e21f3e7e640 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -578,9 +578,7 @@ macro_rules! downcast_run_array { /// Force downcast of an [`Array`], such as an [`ArrayRef`] to /// [`GenericListArray`], panicking on failure. -pub fn as_generic_list_array( - arr: &dyn Array, -) -> &GenericListArray { +pub fn as_generic_list_array(arr: &dyn Array) -> &GenericListArray { arr.as_any() .downcast_ref::>() .expect("Unable to downcast to list array") @@ -612,9 +610,7 @@ pub fn as_large_list_array(arr: &dyn Array) -> &LargeListArray { /// Force downcast of an [`Array`], such as an [`ArrayRef`] to /// [`GenericBinaryArray`], panicking on failure. #[inline] -pub fn as_generic_binary_array( - arr: &dyn Array, -) -> &GenericBinaryArray { +pub fn as_generic_binary_array(arr: &dyn Array) -> &GenericBinaryArray { arr.as_any() .downcast_ref::>() .expect("Unable to downcast to binary array") @@ -826,8 +822,7 @@ pub trait AsArray: private::Sealed { } /// Downcast this to a [`DictionaryArray`] returning `None` if not possible - fn as_dictionary_opt(&self) - -> Option<&DictionaryArray>; + fn as_dictionary_opt(&self) -> Option<&DictionaryArray>; /// Downcast this to a [`DictionaryArray`] panicking if not possible fn as_dictionary(&self) -> &DictionaryArray { @@ -877,9 +872,7 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } - fn as_dictionary_opt( - &self, - ) -> Option<&DictionaryArray> { + fn as_dictionary_opt(&self) -> Option<&DictionaryArray> { self.as_any().downcast_ref() } @@ -926,9 +919,7 @@ impl AsArray for ArrayRef { self.as_any().downcast_ref() } - fn as_dictionary_opt( - &self, - ) -> Option<&DictionaryArray> { + fn as_dictionary_opt(&self) -> Option<&DictionaryArray> { self.as_ref().as_dictionary_opt() } @@ -972,9 +963,7 @@ mod tests { #[test] fn test_decimal256array() { - let a = Decimal256Array::from_iter_values( - [1, 2, 4, 5].into_iter().map(i256::from_i128), - ); + let a = Decimal256Array::from_iter_values([1, 2, 4, 5].into_iter().map(i256::from_i128)); assert!(!as_primitive_array::(&a).is_empty()); } } diff --git a/arrow-array/src/delta.rs b/arrow-array/src/delta.rs index bf9ee5ca685f..d9aa4aa6de5d 100644 --- a/arrow-array/src/delta.rs +++ b/arrow-array/src/delta.rs @@ -55,10 +55,7 @@ pub(crate) fn add_months_datetime( /// Add the given number of days to the given datetime. /// /// Returns `None` when it will result in overflow. 
-pub(crate) fn add_days_datetime( - dt: DateTime, - days: i32, -) -> Option> { +pub(crate) fn add_days_datetime(dt: DateTime, days: i32) -> Option> { match days.cmp(&0) { Ordering::Equal => Some(dt), Ordering::Greater => dt.checked_add_days(Days::new(days as u64)), @@ -83,10 +80,7 @@ pub(crate) fn sub_months_datetime( /// Substract the given number of days to the given datetime. /// /// Returns `None` when it will result in overflow. -pub(crate) fn sub_days_datetime( - dt: DateTime, - days: i32, -) -> Option> { +pub(crate) fn sub_days_datetime(dt: DateTime, days: i32) -> Option> { match days.cmp(&0) { Ordering::Equal => Some(dt), Ordering::Greater => dt.checked_sub_days(Days::new(days as u64)), diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index a198332ca5b5..3f9cc0d525c1 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -18,8 +18,8 @@ //! Idiomatic iterators for [`Array`](crate::Array) use crate::array::{ - ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, - GenericListArray, GenericStringArray, PrimitiveArray, + ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, + GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; use arrow_buffer::NullBuffer; @@ -187,8 +187,7 @@ mod tests { #[test] fn test_string_array_iter_round_trip() { - let array = - StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]); + let array = StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]); let array = Arc::new(array) as ArrayRef; let array = array.as_any().downcast_ref::().unwrap(); @@ -211,8 +210,7 @@ mod tests { // check if DoubleEndedIterator is implemented let result: StringArray = array.iter().rev().collect(); - let rev_array = - StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]); + let rev_array = StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]); assert_eq!(result, rev_array); // check if ExactSizeIterator is implemented let _ = array.iter().rposition(|opt_b| opt_b == Some("a")); diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index afb7ec5e6e44..ef98c5efefb0 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -182,8 +182,7 @@ pub use array::*; mod record_batch; pub use record_batch::{ - RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, - RecordBatchWriter, + RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter, }; mod arithmetic; diff --git a/arrow-array/src/numeric.rs b/arrow-array/src/numeric.rs index afc0e2c33010..b5e474ba696a 100644 --- a/arrow-array/src/numeric.rs +++ b/arrow-array/src/numeric.rs @@ -179,8 +179,8 @@ macro_rules! make_numeric_type { 16 => { // same general logic as for 8 lanes, extended to 16 bits let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + 32768, ); let vecmask = i32x16::splat((mask & 0xFFFF) as i32); @@ -194,21 +194,19 @@ macro_rules! 
make_numeric_type { let tmp = &mut [0_i16; 32]; let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + 32768, ); let vecmask = i32x16::splat((mask & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i16x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[0..16]); + i16x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[0..16]); let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i16x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[16..32]); + i16x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[16..32]); unsafe { std::mem::transmute(i16x32::from_slice_unaligned(tmp)) } } @@ -218,33 +216,29 @@ macro_rules! make_numeric_type { let tmp = &mut [0_i8; 64]; let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, + 32768, ); let vecmask = i32x16::splat((mask & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[0..16]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[0..16]); let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[16..32]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[16..32]); let vecmask = i32x16::splat(((mask >> 32) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[32..48]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[32..48]); let vecmask = i32x16::splat(((mask >> 48) & 0xFFFF) as i32); let vecmask = (vecidx & vecmask).eq(vecidx); - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[48..64]); + i8x16::from_cast(vecmask).write_to_slice_unaligned(&mut tmp[48..64]); unsafe { std::mem::transmute(i8x64::from_slice_unaligned(tmp)) } } @@ -269,11 +263,7 @@ macro_rules! make_numeric_type { /// Selects elements of `a` and `b` using `mask` #[inline] - fn mask_select( - mask: Self::SimdMask, - a: Self::Simd, - b: Self::Simd, - ) -> Self::Simd { + fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd { mask.select(a, b) } @@ -327,10 +317,7 @@ macro_rules! 
make_numeric_type { } #[inline] - fn unary_op Self::Simd>( - a: Self::Simd, - op: F, - ) -> Self::Simd { + fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd { op(a) } } @@ -581,8 +568,7 @@ mod tests { let mask = 0b1101; let actual = IntervalMonthDayNanoType::mask_from_u64(mask); let expected = expected_mask!(i128, mask); - let expected = - m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice())); + let expected = m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } @@ -612,8 +598,7 @@ mod tests { let mask = 0b10101010_10101010; let actual = Float32Type::mask_from_u64(mask); let expected = expected_mask!(i32, mask); - let expected = - m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); + let expected = m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } @@ -623,8 +608,7 @@ mod tests { let mask = 0b01010101_01010101; let actual = Int32Type::mask_from_u64(mask); let expected = expected_mask!(i32, mask); - let expected = - m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); + let expected = m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } @@ -634,17 +618,14 @@ mod tests { let mask = 0b01010101_01010101_10101010_10101010; let actual = UInt16Type::mask_from_u64(mask); let expected = expected_mask!(i16, mask); - dbg!(&expected); - let expected = - m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); + let expected = m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); assert_eq!(expected, actual); } #[test] fn test_mask_i8() { - let mask = - 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010; + let mask = 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010; let actual = Int8Type::mask_from_u64(mask); let expected = expected_mask!(i8, mask); let expected = m8x64::from_cast(i8x64::from_slice_unaligned(expected.as_slice())); diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 27804447fba6..4e859fdfe7ea 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -107,10 +107,7 @@ impl RecordBatch { /// vec![Arc::new(id_array)] /// ).unwrap(); /// ``` - pub fn try_new( - schema: SchemaRef, - columns: Vec, - ) -> Result { + pub fn try_new(schema: SchemaRef, columns: Vec) -> Result { let options = RecordBatchOptions::new(); Self::try_new_impl(schema, columns, &options) } @@ -179,9 +176,7 @@ impl RecordBatch { // check that all columns have the same row count if columns.iter().any(|c| c.len() != row_count) { let err = match options.row_count { - Some(_) => { - "all columns in a record batch must have the specified row count" - } + Some(_) => "all columns in a record batch must have the specified row count", None => "all columns in a record batch must have the same length", }; return Err(ArrowError::InvalidArgumentError(err.to_string())); @@ -190,9 +185,7 @@ impl RecordBatch { // function for comparing column type and field type // return true if 2 types are not matched let type_not_match = if options.match_field_names { - |(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| { - col_type != field_type - } + |(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| col_type != field_type } else { |(_, (col_type, field_type)): &(usize, (&DataType, &DataType))| { !col_type.equals_datatype(field_type) @@ -334,6 +327,40 @@ impl RecordBatch { 
&self.columns[..] } + /// Remove column by index and return it. + /// + /// Return the `ArrayRef` if the column is removed. + /// + /// # Panics + /// + /// Panics if `index`` out of bounds. + /// + /// # Example + /// + /// ``` + /// use std::sync::Arc; + /// use arrow_array::{BooleanArray, Int32Array, RecordBatch}; + /// use arrow_schema::{DataType, Field, Schema}; + /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); + /// let bool_array = BooleanArray::from(vec![true, false, false, true, true]); + /// let schema = Schema::new(vec![ + /// Field::new("id", DataType::Int32, false), + /// Field::new("bool", DataType::Boolean, false), + /// ]); + /// + /// let mut batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array), Arc::new(bool_array)]).unwrap(); + /// + /// let removed_column = batch.remove_column(0); + /// assert_eq!(removed_column.as_any().downcast_ref::().unwrap(), &Int32Array::from(vec![1, 2, 3, 4, 5])); + /// assert_eq!(batch.num_columns(), 1); + /// ``` + pub fn remove_column(&mut self, index: usize) -> ArrayRef { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(index); + self.schema = Arc::new(builder.finish()); + self.columns.remove(index) + } + /// Return a new RecordBatch where each column is sliced /// according to `offset` and `length` /// @@ -484,7 +511,11 @@ impl From for RecordBatch { fn from(value: StructArray) -> Self { let row_count = value.len(); let (fields, columns, nulls) = value.into_parts(); - assert_eq!(nulls.map(|n| n.null_count()).unwrap_or_default(), 0, "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation"); + assert_eq!( + nulls.map(|n| n.null_count()).unwrap_or_default(), + 0, + "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation" + ); RecordBatch { schema: Arc::new(Schema::new(fields)), @@ -588,9 +619,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::{ - BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray, - }; + use crate::{BooleanArray, Int32Array, Int64Array, Int8Array, ListArray, StringArray}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::Fields; @@ -606,8 +635,7 @@ mod tests { let b = StringArray::from(vec!["a", "b", "c", "d", "e"]); let record_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); check_batch(record_batch, 5) } @@ -622,8 +650,7 @@ mod tests { let b = StringArray::from(vec!["a", "b", "c", "d", "e"]); let record_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); assert_eq!(record_batch.get_array_memory_size(), 364); } @@ -649,8 +676,7 @@ mod tests { let b = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "h", "i"]); let record_batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]).unwrap(); let offset = 2; let length = 5; @@ -699,8 +725,8 @@ mod tests { ])); let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])); - let record_batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]) - .expect("valid conversion"); + let record_batch = + RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).expect("valid conversion"); let expected_schema = 
Schema::new(vec![ Field::new("a", DataType::Int32, true), @@ -716,11 +742,9 @@ mod tests { let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])); // Note there are no nulls in a or b, but we specify that b is nullable - let record_batch = RecordBatch::try_from_iter_with_nullable(vec![ - ("a", a, false), - ("b", b, true), - ]) - .expect("valid conversion"); + let record_batch = + RecordBatch::try_from_iter_with_nullable(vec![("a", a, false), ("b", b, true)]) + .expect("valid conversion"); let expected_schema = Schema::new(vec![ Field::new("a", DataType::Int32, false), @@ -792,8 +816,7 @@ mod tests { let a = Int32Array::from(vec![1, 2, 3, 4, 5]); let b = Int32Array::from(vec![1, 2, 3, 4, 5]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]); assert!(batch.is_err()); } @@ -863,11 +886,8 @@ mod tests { Field::new("id", DataType::Int32, false), Field::new("val", DataType::Int32, false), ]); - let record_batch = RecordBatch::try_new( - Arc::new(schema1), - vec![id_arr.clone(), val_arr.clone()], - ) - .unwrap(); + let record_batch = + RecordBatch::try_new(Arc::new(schema1), vec![id_arr.clone(), val_arr.clone()]).unwrap(); assert_eq!(record_batch["id"].as_ref(), id_arr.as_ref()); assert_eq!(record_batch["val"].as_ref(), val_arr.as_ref()); @@ -1005,15 +1025,12 @@ mod tests { let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"])); let c: ArrayRef = Arc::new(StringArray::from(vec!["d", "e", "f"])); - let record_batch = RecordBatch::try_from_iter(vec![ - ("a", a.clone()), - ("b", b.clone()), - ("c", c.clone()), - ]) - .expect("valid conversion"); + let record_batch = + RecordBatch::try_from_iter(vec![("a", a.clone()), ("b", b.clone()), ("c", c.clone())]) + .expect("valid conversion"); - let expected = RecordBatch::try_from_iter(vec![("a", a), ("c", c)]) - .expect("valid conversion"); + let expected = + RecordBatch::try_from_iter(vec![("a", a), ("c", c)]).expect("valid conversion"); assert_eq!(expected, record_batch.project(&[0, 2]).unwrap()); } @@ -1049,8 +1066,7 @@ mod tests { let options = RecordBatchOptions::new().with_row_count(Some(10)); - let ok = - RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); + let ok = RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); assert_eq!(ok.num_rows(), 10); let a = ok.slice(2, 5); diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs index 489aabf4756a..7a98fccb73b5 100644 --- a/arrow-array/src/run_iterator.rs +++ b/arrow-array/src/run_iterator.rs @@ -86,8 +86,7 @@ where // If current logical index is greater than current run end index then increment // the physical index. let run_ends = self.array.run_ends().values(); - if self.current_front_logical >= run_ends[self.current_front_physical].as_usize() - { + if self.current_front_logical >= run_ends[self.current_front_physical].as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this // reason the next value can be accessed by incrementing physical index once. 
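The run-iterator hunks above advance a physical index through `run_ends` as the logical index crosses each run boundary. As a minimal, self-contained sketch of the run-end encoding idea (illustrative helper names, not the actual `RunArray` API):

```rust
// Run-end encoding stores each run once: the logical sequence
// [a, a, a, b, b, c, c, c, c] becomes values = [a, b, c] and
// run_ends = [3, 5, 9], where run_ends[i] is the exclusive end of run i
// and is strictly increasing.
fn logical_to_physical(run_ends: &[usize], logical: usize) -> usize {
    // Advance while the logical index has passed the current run's end,
    // mirroring the incremental advance in the iterator above.
    let mut physical = 0;
    while logical >= run_ends[physical] {
        physical += 1;
    }
    physical
}

fn main() {
    let values = ['a', 'b', 'c'];
    let run_ends = [3, 5, 9];
    assert_eq!(values[logical_to_physical(&run_ends, 0)], 'a');
    assert_eq!(values[logical_to_physical(&run_ends, 3)], 'b'); // first index past run 0
    assert_eq!(values[logical_to_physical(&run_ends, 8)], 'c'); // last valid logical index
}
```

Because `run_ends` is strictly increasing, every physical entry covers at least one logical index, so a sequential iterator only ever increments the physical cursor by one per boundary, which is exactly what the `current_front_physical` / `current_back_physical` bookkeeping above relies on.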
@@ -136,8 +135,7 @@ where let run_ends = self.array.run_ends().values(); if self.current_back_physical > 0 - && self.current_back_logical - < run_ends[self.current_back_physical - 1].as_usize() + && self.current_back_logical < run_ends[self.current_back_physical - 1].as_usize() { // As the run_ends is expected to be strictly increasing, there // should be at least one logical entry in one physical entry. Because of this @@ -211,8 +209,7 @@ mod tests { seed.shuffle(&mut rng); } // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays - let num = - max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); + let num = max_run_length.min(rand::thread_rng().gen_range(1..=max_run_length)); for _ in 0..num { result.push(seed[ix]); } @@ -285,8 +282,7 @@ mod tests { for logical_len in logical_lengths { let input_array = build_input_array(logical_len); - let mut run_array_builder = - PrimitiveRunBuilder::::new(); + let mut run_array_builder = PrimitiveRunBuilder::::new(); run_array_builder.extend(input_array.iter().copied()); let run_array = run_array_builder.finish(); let typed_array = run_array.downcast::().unwrap(); @@ -327,8 +323,7 @@ mod tests { }) .collect(); - let result_asref: Vec> = - result.iter().map(|f| f.as_deref()).collect(); + let result_asref: Vec> = result.iter().map(|f| f.as_deref()).collect(); let expected_vec = vec![ Some("abb"), @@ -364,8 +359,7 @@ mod tests { // Iterate on sliced typed run array let actual: Vec> = sliced_typed_run_array.into_iter().collect(); - let expected: Vec> = - input_array.iter().take(slice_len).copied().collect(); + let expected: Vec> = input_array.iter().take(slice_len).copied().collect(); assert_eq!(expected, actual); // test for offset = total_len - slice_len, length = slice_len diff --git a/arrow-array/src/scalar.rs b/arrow-array/src/scalar.rs index 7dfdbddd964a..f2a696a8f329 100644 --- a/arrow-array/src/scalar.rs +++ b/arrow-array/src/scalar.rs @@ -98,9 +98,32 @@ impl Datum for &dyn Array { } } -/// A wrapper around a single value [`Array`] indicating kernels should treat it as a scalar value +/// A wrapper around a single value [`Array`] that implements +/// [`Datum`] and indicates [compute] kernels should treat this array +/// as a scalar value (a single value). /// -/// See [`Datum`] for more information +/// Using a [`Scalar`] is often much more efficient than creating an +/// [`Array`] with the same (repeated) value. +/// +/// See [`Datum`] for more information. 
+/// +/// # Example +/// +/// ```rust +/// # use arrow_array::{Scalar, Int32Array, ArrayRef}; +/// # fn get_array() -> ArrayRef { std::sync::Arc::new(Int32Array::from(vec![42])) } +/// // Create a (typed) scalar for Int32Array for the value 42 +/// let scalar = Scalar::new(Int32Array::from(vec![42])); +/// +/// // Create a scalar using PrimitiveArray::new_scalar +/// let scalar = Int32Array::new_scalar(42); +/// +/// // Create a scalar from an ArrayRef (for dynamically typed Arrays) +/// let array: ArrayRef = get_array(); +/// let scalar = Scalar::new(array); +/// ``` +/// +/// [compute]: https://docs.rs/arrow/latest/arrow/compute/index.html #[derive(Debug, Copy, Clone)] pub struct Scalar<T: Array>(T); diff --git a/arrow-array/src/temporal_conversions.rs b/arrow-array/src/temporal_conversions.rs index f1f3f36d3c61..e0edcc9bc182 100644 --- a/arrow-array/src/temporal_conversions.rs +++ b/arrow-array/src/temporal_conversions.rs @@ -20,9 +20,7 @@ use crate::timezone::Tz; use crate::ArrowPrimitiveType; use arrow_schema::{DataType, TimeUnit}; -use chrono::{ - DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Timelike, Utc, -}; +use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Timelike, Utc}; /// Number of seconds in a day pub const SECONDS_IN_DAY: i64 = 86_400; @@ -221,10 +219,7 @@ pub fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> { } /// Converts an [`ArrowPrimitiveType`] to [`DateTime<Tz>`] -pub fn as_datetime_with_timezone<T: ArrowPrimitiveType>( - v: i64, - tz: Tz, -) -> Option<DateTime<Tz>> { +pub fn as_datetime_with_timezone<T: ArrowPrimitiveType>(v: i64, tz: Tz) -> Option<DateTime<Tz>> { let naive = as_datetime::<T>(v)?; Some(Utc.from_utc_datetime(&naive).with_timezone(&tz)) } @@ -274,8 +269,8 @@ pub fn as_duration<T: ArrowPrimitiveType>(v: i64) -> Option<Duration> { #[cfg(test)] mod tests { use crate::temporal_conversions::{ - date64_to_datetime, split_second, timestamp_ms_to_datetime, - timestamp_ns_to_datetime, timestamp_us_to_datetime, NANOSECONDS, + date64_to_datetime, split_second, timestamp_ms_to_datetime, timestamp_ns_to_datetime, + timestamp_us_to_datetime, NANOSECONDS, }; use chrono::NaiveDateTime; diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index f56189c46512..dc91886f34c5 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -38,8 +38,8 @@ fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> { if values.iter().any(|x| *x > 9) { return None; } - let secs = (values[0] * 10 + values[1]) as i32 * 60 * 60 + (values[2] * 10 + values[3]) as i32 * 60; + let secs = + (values[0] * 10 + values[1]) as i32 * 60 * 60 + (values[2] * 10 + values[3]) as i32 * 60; match bytes[0] { b'+' => FixedOffset::east_opt(secs), @@ -122,10 +122,7 @@ mod private { }) } - fn offset_from_local_datetime( - &self, - local: &NaiveDateTime, - ) -> LocalResult<TzOffset> { + fn offset_from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult<TzOffset> { tz!(self, tz, { tz.offset_from_local_datetime(local).map(|x| TzOffset { tz: *self, @@ -285,10 +282,7 @@ mod private { self.0.offset_from_local_date(local).map(TzOffset) } - fn offset_from_local_datetime( - &self, - local: &NaiveDateTime, - ) -> LocalResult<TzOffset> { + fn offset_from_local_datetime(&self, local: &NaiveDateTime) -> LocalResult<TzOffset> { self.0.offset_from_local_datetime(local).map(TzOffset) } diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index d79b32a991ed..16d0e822d052 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -18,8 +18,7 @@ //!
Zero-sized types used to parameterize generic array implementations use crate::delta::{ - add_days_datetime, add_months_datetime, shift_months, sub_days_datetime, - sub_months_datetime, + add_days_datetime, add_months_datetime, shift_months, sub_days_datetime, sub_months_datetime, }; use crate::temporal_conversions::as_datetime_with_timezone; use crate::timezone::Tz; @@ -27,9 +26,8 @@ use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision}; use arrow_schema::{ - ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, - DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, - DECIMAL_DEFAULT_SCALE, + ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; use chrono::{Duration, NaiveDate, NaiveDateTime}; use half::f16; @@ -875,9 +873,7 @@ impl IntervalDayTimeType { /// /// * `i` - The IntervalDayTimeType to convert #[inline] - pub fn to_parts( - i: ::Native, - ) -> (i32, i32) { + pub fn to_parts(i: ::Native) -> (i32, i32) { let days = (i >> 32) as i32; let ms = i as i32; (days, ms) @@ -1221,10 +1217,7 @@ pub trait DecimalType: fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String; /// Validates that `value` contains no more than `precision` decimal digits - fn validate_decimal_precision( - value: Self::Native, - precision: u8, - ) -> Result<(), ArrowError>; + fn validate_decimal_precision(value: Self::Native, precision: u8) -> Result<(), ArrowError>; } /// Validate that `precision` and `scale` are valid for `T` @@ -1368,12 +1361,14 @@ pub(crate) mod bytes { } impl ByteArrayNativeType for [u8] { + #[inline] unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { b } } impl ByteArrayNativeType for str { + #[inline] unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self { std::str::from_utf8_unchecked(b) } @@ -1398,10 +1393,7 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed { const DATA_TYPE: DataType; /// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values` - fn validate( - offsets: &OffsetBuffer, - values: &Buffer, - ) -> Result<(), ArrowError>; + fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError>; } /// [`ByteArrayType`] for string arrays @@ -1420,10 +1412,7 @@ impl ByteArrayType for GenericStringType { DataType::Utf8 }; - fn validate( - offsets: &OffsetBuffer, - values: &Buffer, - ) -> Result<(), ArrowError> { + fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError> { // Verify that the slice as a whole is valid UTF-8 let validated = std::str::from_utf8(values).map_err(|e| { ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}")) @@ -1469,10 +1458,7 @@ impl ByteArrayType for GenericBinaryType { DataType::Binary }; - fn validate( - offsets: &OffsetBuffer, - values: &Buffer, - ) -> Result<(), ArrowError> { + fn validate(offsets: &OffsetBuffer, values: &Buffer) -> Result<(), ArrowError> { // offsets are guaranteed to be monotonically increasing and non-empty let max_offset = offsets.last().unwrap().as_usize(); if values.len() < max_offset { diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml new file mode 100644 index 000000000000..9575874c41d2 --- /dev/null +++ b/arrow-avro/Cargo.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "arrow-avro" +version = { workspace = true } +description = "Support for parsing Avro format into the Arrow format" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "arrow_avro" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-cast = { workspace = true } +arrow-data = { workspace = true } +arrow-schema = { workspace = true } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +serde = { version = "1.0.188", features = ["derive"] } + +[dev-dependencies] + diff --git a/arrow/src/datatypes/ffi.rs b/arrow-avro/src/compression.rs similarity index 69% rename from arrow/src/datatypes/ffi.rs rename to arrow-avro/src/compression.rs index b248758bc120..a1a44fc22b68 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow-avro/src/compression.rs @@ -14,3 +14,19 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. + +use serde::{Deserialize, Serialize}; + +/// The metadata key used for storing the JSON encoded [`CompressionCodec`] +pub const CODEC_METADATA_KEY: &str = "avro.codec"; + +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum CompressionCodec { + Null, + Deflate, + BZip2, + Snappy, + XZ, + ZStandard, +} diff --git a/arrow-avro/src/lib.rs b/arrow-avro/src/lib.rs new file mode 100644 index 000000000000..c76ecb399a45 --- /dev/null +++ b/arrow-avro/src/lib.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Convert data to / from the [Apache Arrow] memory format and [Apache Avro] +//! +//! [Apache Arrow]: https://arrow.apache.org +//! 
[Apache Avro]: https://avro.apache.org/ + +#![allow(unused)] // Temporary + +pub mod reader; +mod schema; + +mod compression; + +#[cfg(test)] +mod test_util { + pub fn arrow_test_data(path: &str) -> String { + match std::env::var("ARROW_TEST_DATA") { + Ok(dir) => format!("{dir}/{path}"), + Err(_) => format!("../testing/data/{path}"), + } + } +} diff --git a/arrow-avro/src/reader/block.rs b/arrow-avro/src/reader/block.rs new file mode 100644 index 000000000000..479f0ef90909 --- /dev/null +++ b/arrow-avro/src/reader/block.rs @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decoder for [`Block`] + +use crate::reader::vlq::VLQDecoder; +use arrow_schema::ArrowError; + +/// A file data block +/// +/// +#[derive(Debug, Default)] +pub struct Block { + /// The number of objects in this block + pub count: usize, + /// The serialized objects within this block + pub data: Vec, + /// The sync marker + pub sync: [u8; 16], +} + +/// A decoder for [`Block`] +#[derive(Debug)] +pub struct BlockDecoder { + state: BlockDecoderState, + in_progress: Block, + vlq_decoder: VLQDecoder, + bytes_remaining: usize, +} + +#[derive(Debug)] +enum BlockDecoderState { + Count, + Size, + Data, + Sync, + Finished, +} + +impl Default for BlockDecoder { + fn default() -> Self { + Self { + state: BlockDecoderState::Count, + in_progress: Default::default(), + vlq_decoder: Default::default(), + bytes_remaining: 0, + } + } +} + +impl BlockDecoder { + /// Parse [`Block`] from `buf`, returning the number of bytes read + /// + /// This method can be called multiple times with consecutive chunks of data, allowing + /// integration with chunked IO systems like [`BufRead::fill_buf`] + /// + /// All errors should be considered fatal, and decoding aborted + /// + /// Once an entire [`Block`] has been decoded this method will not read any further + /// input bytes, until [`Self::flush`] is called. 
Afterwards [`Self::decode`] + /// can then be used again to read the next block, if any + /// + /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf + pub fn decode(&mut self, mut buf: &[u8]) -> Result { + let max_read = buf.len(); + while !buf.is_empty() { + match self.state { + BlockDecoderState::Count => { + if let Some(c) = self.vlq_decoder.long(&mut buf) { + self.in_progress.count = c.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Block count cannot be negative, got {c}" + )) + })?; + + self.state = BlockDecoderState::Size; + } + } + BlockDecoderState::Size => { + if let Some(c) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = c.try_into().map_err(|_| { + ArrowError::ParseError(format!( + "Block size cannot be negative, got {c}" + )) + })?; + + self.in_progress.data.reserve(self.bytes_remaining); + self.state = BlockDecoderState::Data; + } + } + BlockDecoderState::Data => { + let to_read = self.bytes_remaining.min(buf.len()); + self.in_progress.data.extend_from_slice(&buf[..to_read]); + buf = &buf[to_read..]; + self.bytes_remaining -= to_read; + if self.bytes_remaining == 0 { + self.bytes_remaining = 16; + self.state = BlockDecoderState::Sync; + } + } + BlockDecoderState::Sync => { + let to_decode = buf.len().min(self.bytes_remaining); + let write = &mut self.in_progress.sync[16 - to_decode..]; + write[..to_decode].copy_from_slice(&buf[..to_decode]); + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = BlockDecoderState::Finished; + } + } + BlockDecoderState::Finished => return Ok(max_read - buf.len()), + } + } + Ok(max_read) + } + + /// Flush this decoder returning the parsed [`Block`] if any + pub fn flush(&mut self) -> Option { + match self.state { + BlockDecoderState::Finished => { + self.state = BlockDecoderState::Count; + Some(std::mem::take(&mut self.in_progress)) + } + _ => None, + } + } +} diff --git a/arrow-avro/src/reader/header.rs b/arrow-avro/src/reader/header.rs new file mode 100644 index 000000000000..00e85b39be73 --- /dev/null +++ b/arrow-avro/src/reader/header.rs @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Decoder for [`Header`] + +use crate::reader::vlq::VLQDecoder; +use crate::schema::Schema; +use arrow_schema::ArrowError; + +#[derive(Debug)] +enum HeaderDecoderState { + /// Decoding the [`MAGIC`] prefix + Magic, + /// Decoding a block count + BlockCount, + /// Decoding a block byte length + BlockLen, + /// Decoding a key length + KeyLen, + /// Decoding a key string + Key, + /// Decoding a value length + ValueLen, + /// Decoding a value payload + Value, + /// Decoding sync marker + Sync, + /// Finished decoding + Finished, +} + +/// A decoded header for an [Object Container File](https://avro.apache.org/docs/1.11.1/specification/#object-container-files) +#[derive(Debug, Clone)] +pub struct Header { + meta_offsets: Vec<usize>, + meta_buf: Vec<u8>, + sync: [u8; 16], +} + +impl Header { + /// Returns an iterator over the metadata key/value pairs in this header + pub fn metadata(&self) -> impl Iterator<Item = (&[u8], &[u8])> + '_ { + let mut last = 0; + self.meta_offsets.chunks_exact(2).map(move |w| { + let start = last; + last = w[1]; + (&self.meta_buf[start..w[0]], &self.meta_buf[w[0]..w[1]]) + }) + } + + /// Returns the value for a given metadata key if present + pub fn get(&self, key: impl AsRef<[u8]>) -> Option<&[u8]> { + self.metadata() + .find_map(|(k, v)| (k == key.as_ref()).then_some(v)) + } + + /// Returns the sync token for this file + pub fn sync(&self) -> [u8; 16] { + self.sync + } +} + +/// A decoder for [`Header`] +/// +/// The Avro file format does not encode the length of the header, and so it +/// is necessary to provide a push-based decoder that can be used with streams +#[derive(Debug)] +pub struct HeaderDecoder { + state: HeaderDecoderState, + vlq_decoder: VLQDecoder, + + /// The end offsets of strings in `meta_buf` + meta_offsets: Vec<usize>, + /// The raw binary data of the metadata map + meta_buf: Vec<u8>, + + /// The decoded sync marker + sync_marker: [u8; 16], + + /// The number of remaining tuples in the current block + tuples_remaining: usize, + /// The number of bytes remaining in the current string/bytes payload + bytes_remaining: usize, +} + +impl Default for HeaderDecoder { + fn default() -> Self { + Self { + state: HeaderDecoderState::Magic, + meta_offsets: vec![], + meta_buf: vec![], + sync_marker: [0; 16], + vlq_decoder: Default::default(), + tuples_remaining: 0, + bytes_remaining: MAGIC.len(), + } + } +} + +const MAGIC: &[u8; 4] = b"Obj\x01"; + +impl HeaderDecoder { + /// Parse [`Header`] from `buf`, returning the number of bytes read + /// + /// This method can be called multiple times with consecutive chunks of data, allowing + /// integration with chunked IO systems like [`BufRead::fill_buf`] + /// + /// All errors should be considered fatal, and decoding aborted + /// + /// Once the entire [`Header`] has been decoded this method will not read any further + /// input bytes, and the header can be obtained with [`Self::flush`] + /// + /// [`BufRead::fill_buf`]: std::io::BufRead::fill_buf + pub fn decode(&mut self, mut buf: &[u8]) -> Result<usize, ArrowError> { + let max_read = buf.len(); + while !buf.is_empty() { + match self.state { + HeaderDecoderState::Magic => { + let remaining = &MAGIC[MAGIC.len() - self.bytes_remaining..]; + let to_decode = buf.len().min(remaining.len()); + if !buf.starts_with(&remaining[..to_decode]) { + return Err(ArrowError::ParseError("Incorrect avro magic".to_string())); + } + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = HeaderDecoderState::BlockCount; + } + } + HeaderDecoderState::BlockCount => { + if let Some(block_count) = self.vlq_decoder.long(&mut
buf) { + match block_count.try_into() { + Ok(0) => { + self.state = HeaderDecoderState::Sync; + self.bytes_remaining = 16; + } + Ok(remaining) => { + self.tuples_remaining = remaining; + self.state = HeaderDecoderState::KeyLen; + } + Err(_) => { + self.tuples_remaining = block_count.unsigned_abs() as _; + self.state = HeaderDecoderState::BlockLen; + } + } + } + } + HeaderDecoderState::BlockLen => { + if self.vlq_decoder.long(&mut buf).is_some() { + self.state = HeaderDecoderState::KeyLen + } + } + HeaderDecoderState::Key => { + let to_read = self.bytes_remaining.min(buf.len()); + self.meta_buf.extend_from_slice(&buf[..to_read]); + self.bytes_remaining -= to_read; + buf = &buf[to_read..]; + if self.bytes_remaining == 0 { + self.meta_offsets.push(self.meta_buf.len()); + self.state = HeaderDecoderState::ValueLen; + } + } + HeaderDecoderState::Value => { + let to_read = self.bytes_remaining.min(buf.len()); + self.meta_buf.extend_from_slice(&buf[..to_read]); + self.bytes_remaining -= to_read; + buf = &buf[to_read..]; + if self.bytes_remaining == 0 { + self.meta_offsets.push(self.meta_buf.len()); + + self.tuples_remaining -= 1; + match self.tuples_remaining { + 0 => self.state = HeaderDecoderState::BlockCount, + _ => self.state = HeaderDecoderState::KeyLen, + } + } + } + HeaderDecoderState::KeyLen => { + if let Some(len) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = len as _; + self.state = HeaderDecoderState::Key; + } + } + HeaderDecoderState::ValueLen => { + if let Some(len) = self.vlq_decoder.long(&mut buf) { + self.bytes_remaining = len as _; + self.state = HeaderDecoderState::Value; + } + } + HeaderDecoderState::Sync => { + let to_decode = buf.len().min(self.bytes_remaining); + let write = &mut self.sync_marker[16 - to_decode..]; + write[..to_decode].copy_from_slice(&buf[..to_decode]); + self.bytes_remaining -= to_decode; + buf = &buf[to_decode..]; + if self.bytes_remaining == 0 { + self.state = HeaderDecoderState::Finished; + } + } + HeaderDecoderState::Finished => return Ok(max_read - buf.len()), + } + } + Ok(max_read) + } + + /// Flush this decoder returning the parsed [`Header`] if any + pub fn flush(&mut self) -> Option
<Header>
{ + match self.state { + HeaderDecoderState::Finished => { + self.state = HeaderDecoderState::Magic; + Some(Header { + meta_offsets: std::mem::take(&mut self.meta_offsets), + meta_buf: std::mem::take(&mut self.meta_buf), + sync: self.sync_marker, + }) + } + _ => None, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::read_header; + use crate::schema::SCHEMA_METADATA_KEY; + use crate::test_util::arrow_test_data; + use std::fs::File; + use std::io::{BufRead, BufReader}; + + #[test] + fn test_header_decode() { + let mut decoder = HeaderDecoder::default(); + for m in MAGIC { + decoder.decode(std::slice::from_ref(m)).unwrap(); + } + + let mut decoder = HeaderDecoder::default(); + assert_eq!(decoder.decode(MAGIC).unwrap(), 4); + + let mut decoder = HeaderDecoder::default(); + decoder.decode(b"Ob").unwrap(); + let err = decoder.decode(b"s").unwrap_err().to_string(); + assert_eq!(err, "Parser error: Incorrect avro magic"); + } + + fn decode_file(file: &str) -> Header { + let file = File::open(file).unwrap(); + read_header(BufReader::with_capacity(100, file)).unwrap() + } + + #[test] + fn test_header() { + let header = decode_file(&arrow_test_data("avro/alltypes_plain.avro")); + let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); + let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":["int","null"]},{"name":"bool_col","type":["boolean","null"]},{"name":"tinyint_col","type":["int","null"]},{"name":"smallint_col","type":["int","null"]},{"name":"int_col","type":["int","null"]},{"name":"bigint_col","type":["long","null"]},{"name":"float_col","type":["float","null"]},{"name":"double_col","type":["double","null"]},{"name":"date_string_col","type":["bytes","null"]},{"name":"string_col","type":["bytes","null"]},{"name":"timestamp_col","type":[{"type":"long","logicalType":"timestamp-micros"},"null"]}]}"#; + assert_eq!(schema_json, expected); + let _schema: Schema<'_> = serde_json::from_slice(schema_json).unwrap(); + assert_eq!( + u128::from_le_bytes(header.sync()), + 226966037233754408753420635932530907102 + ); + + let header = decode_file(&arrow_test_data("avro/fixed_length_decimal.avro")); + let schema_json = header.get(SCHEMA_METADATA_KEY).unwrap(); + let expected = br#"{"type":"record","name":"topLevelRecord","fields":[{"name":"value","type":[{"type":"fixed","name":"fixed","namespace":"topLevelRecord.value","size":11,"logicalType":"decimal","precision":25,"scale":2},"null"]}]}"#; + assert_eq!(schema_json, expected); + let _schema: Schema<'_> = serde_json::from_slice(schema_json).unwrap(); + assert_eq!( + u128::from_le_bytes(header.sync()), + 325166208089902833952788552656412487328 + ); + } +} diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs new file mode 100644 index 000000000000..7769bbbc4998 --- /dev/null +++ b/arrow-avro/src/reader/mod.rs @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read Avro data to Arrow + +use crate::reader::block::{Block, BlockDecoder}; +use crate::reader::header::{Header, HeaderDecoder}; +use arrow_schema::ArrowError; +use std::io::BufRead; + +mod header; + +mod block; + +mod vlq; + +/// Read a [`Header`] from the provided [`BufRead`] +fn read_header(mut reader: R) -> Result { + let mut decoder = HeaderDecoder::default(); + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + if decoded != read { + break; + } + } + + decoder + .flush() + .ok_or_else(|| ArrowError::ParseError("Unexpected EOF".to_string())) +} + +/// Return an iterator of [`Block`] from the provided [`BufRead`] +fn read_blocks(mut reader: R) -> impl Iterator> { + let mut decoder = BlockDecoder::default(); + + let mut try_next = move || { + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + let read = buf.len(); + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + if decoded != read { + break; + } + } + Ok(decoder.flush()) + }; + std::iter::from_fn(move || try_next().transpose()) +} + +#[cfg(test)] +mod test { + use crate::reader::{read_blocks, read_header}; + use crate::test_util::arrow_test_data; + use std::fs::File; + use std::io::BufReader; + + #[test] + fn test_mux() { + let file = File::open(arrow_test_data("avro/alltypes_plain.avro")).unwrap(); + let mut reader = BufReader::new(file); + let header = read_header(&mut reader).unwrap(); + for result in read_blocks(reader) { + let block = result.unwrap(); + assert_eq!(block.sync, header.sync()); + } + } +} diff --git a/arrow-avro/src/reader/vlq.rs b/arrow-avro/src/reader/vlq.rs new file mode 100644 index 000000000000..80f1c60eec7d --- /dev/null +++ b/arrow-avro/src/reader/vlq.rs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
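The `vlq.rs` module that follows decodes Avro's variable-length zig-zag integers. As a worked sketch of the encoding itself (standalone helpers for illustration; the names and the encode direction are not part of this PR):

```rust
// Zig-zag maps signed to unsigned so that small magnitudes stay small:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
fn zigzag_encode(n: i64) -> u64 {
    ((n << 1) ^ (n >> 63)) as u64
}

// Inverse transform, matching `(val >> 1) as i64 ^ -((val & 1) as i64)`
// in the decoder below.
fn zigzag_decode(z: u64) -> i64 {
    (z >> 1) as i64 ^ -((z & 1) as i64)
}

// Varint framing: 7 data bits per byte, high bit set on all but the last byte.
fn varint_encode(mut v: u64, out: &mut Vec<u8>) {
    while v >= 0x80 {
        out.push((v & 0x7F) as u8 | 0x80); // continuation bit set
        v >>= 7;
    }
    out.push(v as u8);
}

fn main() {
    let mut buf = Vec::new();
    varint_encode(zigzag_encode(-3), &mut buf);
    assert_eq!(buf, vec![0x05]); // -3 zig-zags to 5, which fits in one byte
    assert_eq!(zigzag_decode(5), -3);

    buf.clear();
    varint_encode(zigzag_encode(300), &mut buf);
    assert_eq!(buf, vec![0xD8, 0x04]); // 300 zig-zags to 600 = 0b100_1011000
}
```

The `VLQDecoder` below is the streaming inverse: it accumulates 7 bits per input byte into `in_progress` until it sees a byte with the continuation bit clear, then applies the zig-zag inverse in a single step, which lets it resume cleanly across the chunk boundaries fed in by `read_header` and `read_blocks` above.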
+ +/// Decoder for zig-zag encoded variable-length (VLQ) integers +/// +/// See also: +/// <https://avro.apache.org/docs/1.11.1/specification/> +/// <https://protobuf.dev/programming-guides/encoding/#varints> +#[derive(Debug, Default)] +pub struct VLQDecoder { + /// Scratch space for decoding VLQ integers + in_progress: u64, + shift: u32, +} + +impl VLQDecoder { + /// Decode a signed long from `buf` + pub fn long(&mut self, buf: &mut &[u8]) -> Option<i64> { + while let Some(byte) = buf.first().copied() { + *buf = &buf[1..]; + self.in_progress |= ((byte & 0x7F) as u64) << self.shift; + self.shift += 7; + if byte & 0x80 == 0 { + let val = self.in_progress; + self.in_progress = 0; + self.shift = 0; + return Some((val >> 1) as i64 ^ -((val & 1) as i64)); + } + } + None + } +} diff --git a/arrow-avro/src/schema.rs b/arrow-avro/src/schema.rs new file mode 100644 index 000000000000..17b82cf861b7 --- /dev/null +++ b/arrow-avro/src/schema.rs @@ -0,0 +1,482 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// The metadata key used for storing the JSON encoded [`Schema`] +pub const SCHEMA_METADATA_KEY: &str = "avro.schema"; + +/// Either a [`PrimitiveType`] or a reference to a previously defined named type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum TypeName<'a> { + Primitive(PrimitiveType), + Ref(&'a str), +} + +/// A primitive type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum PrimitiveType { + Null, + Boolean, + Int, + Long, + Float, + Double, + Bytes, + String, +} + +/// Additional attributes within a [`Schema`] +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Default, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Attributes<'a> { + /// A logical type name + /// + /// + #[serde(default)] + pub logical_type: Option<&'a str>, + + /// Additional JSON attributes + #[serde(flatten)] + pub additional: HashMap<&'a str, serde_json::Value>, +} + +/// A type definition that is not a variant of [`ComplexType`] +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Type<'a> { + #[serde(borrow)] + pub r#type: TypeName<'a>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// An Avro schema +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Schema<'a> { + #[serde(borrow)] + TypeName(TypeName<'a>), + #[serde(borrow)] + Union(Vec<Schema<'a>>), + #[serde(borrow)] + Complex(ComplexType<'a>), + #[serde(borrow)] + Type(Type<'a>), +} + +/// A complex type +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum ComplexType<'a> { + #[serde(borrow)] + Union(Vec<Schema<'a>>), +
#[serde(borrow)] + Record(Record<'a>), + #[serde(borrow)] + Enum(Enum<'a>), + #[serde(borrow)] + Array(Array<'a>), + #[serde(borrow)] + Map(Map<'a>), + #[serde(borrow)] + Fixed(Fixed<'a>), +} + +/// A record +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Record<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + #[serde(borrow)] + pub fields: Vec>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A field within a [`Record`] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Field<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow)] + pub r#type: Schema<'a>, + #[serde(borrow, default)] + pub default: Option<&'a str>, +} + +/// An enumeration +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Enum<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub doc: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + #[serde(borrow)] + pub symbols: Vec<&'a str>, + #[serde(borrow, default)] + pub default: Option<&'a str>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// An array +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Array<'a> { + #[serde(borrow)] + pub items: Box>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A map +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Map<'a> { + #[serde(borrow)] + pub values: Box>, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +/// A fixed length binary array +/// +/// +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Fixed<'a> { + #[serde(borrow)] + pub name: &'a str, + #[serde(borrow, default)] + pub namespace: Option<&'a str>, + #[serde(borrow, default)] + pub aliases: Vec<&'a str>, + pub size: usize, + #[serde(flatten)] + pub attributes: Attributes<'a>, +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + #[test] + fn test_deserialize() { + let t: Schema = serde_json::from_str("\"string\"").unwrap(); + assert_eq!( + t, + Schema::TypeName(TypeName::Primitive(PrimitiveType::String)) + ); + + let t: Schema = serde_json::from_str("[\"int\", \"null\"]").unwrap(); + assert_eq!( + t, + Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]) + ); + + let t: Type = serde_json::from_str( + r#"{ + "type":"long", + "logicalType":"timestamp-micros" + }"#, + ) + .unwrap(); + + let timestamp = Type { + r#type: TypeName::Primitive(PrimitiveType::Long), + attributes: Attributes { + logical_type: Some("timestamp-micros"), + additional: Default::default(), + }, + }; + + assert_eq!(t, timestamp); + + let t: ComplexType = serde_json::from_str( + r#"{ + "type":"fixed", + "name":"fixed", + "namespace":"topLevelRecord.value", + "size":11, + "logicalType":"decimal", + "precision":25, + "scale":2 + }"#, + ) + .unwrap(); + + let decimal = ComplexType::Fixed(Fixed { + name: "fixed", + namespace: Some("topLevelRecord.value"), + aliases: vec![], + size: 11, + attributes: Attributes { + logical_type: Some("decimal"), + additional: vec![("precision", json!(25)), ("scale", 
json!(2))] + .into_iter() + .collect(), + }, + }); + + assert_eq!(t, decimal); + + let schema: Schema = serde_json::from_str( + r#"{ + "type":"record", + "name":"topLevelRecord", + "fields":[ + { + "name":"value", + "type":[ + { + "type":"fixed", + "name":"fixed", + "namespace":"topLevelRecord.value", + "size":11, + "logicalType":"decimal", + "precision":25, + "scale":2 + }, + "null" + ] + } + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "topLevelRecord", + namespace: None, + doc: None, + aliases: vec![], + fields: vec![Field { + name: "value", + doc: None, + r#type: Schema::Union(vec![ + Schema::Complex(decimal), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + },], + attributes: Default::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type": "record", + "name": "LongList", + "aliases": ["LinkedLongs"], + "fields" : [ + {"name": "value", "type": "long"}, + {"name": "next", "type": ["null", "LongList"]} + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "LongList", + namespace: None, + doc: None, + aliases: vec!["LinkedLongs"], + fields: vec![ + Field { + name: "value", + doc: None, + r#type: Schema::TypeName(TypeName::Primitive(PrimitiveType::Long)), + default: None, + }, + Field { + name: "next", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::TypeName(TypeName::Ref("LongList")), + ]), + default: None, + } + ], + attributes: Attributes::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type":"record", + "name":"topLevelRecord", + "fields":[ + { + "name":"id", + "type":[ + "int", + "null" + ] + }, + { + "name":"timestamp_col", + "type":[ + { + "type":"long", + "logicalType":"timestamp-micros" + }, + "null" + ] + } + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "topLevelRecord", + namespace: None, + doc: None, + aliases: vec![], + fields: vec![ + Field { + name: "id", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Int)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + }, + Field { + name: "timestamp_col", + doc: None, + r#type: Schema::Union(vec![ + Schema::Type(timestamp), + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + ]), + default: None, + } + ], + attributes: Default::default(), + })) + ); + + let schema: Schema = serde_json::from_str( + r#"{ + "type": "record", + "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc", + "fields": [ + {"name": "clientHash", + "type": {"type": "fixed", "name": "MD5", "size": 16}}, + {"name": "clientProtocol", "type": ["null", "string"]}, + {"name": "serverHash", "type": "MD5"}, + {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]} + ] + }"#, + ) + .unwrap(); + + assert_eq!( + schema, + Schema::Complex(ComplexType::Record(Record { + name: "HandshakeRequest", + namespace: Some("org.apache.avro.ipc"), + doc: None, + aliases: vec![], + fields: vec![ + Field { + name: "clientHash", + doc: None, + r#type: Schema::Complex(ComplexType::Fixed(Fixed { + name: "MD5", + namespace: None, + aliases: vec![], + size: 16, + attributes: Default::default(), + })), + default: None, + }, + Field { + name: "clientProtocol", + doc: None, + r#type: Schema::Union(vec![ + 
Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::TypeName(TypeName::Primitive(PrimitiveType::String)), + ]), + default: None, + }, + Field { + name: "serverHash", + doc: None, + r#type: Schema::TypeName(TypeName::Ref("MD5")), + default: None, + }, + Field { + name: "meta", + doc: None, + r#type: Schema::Union(vec![ + Schema::TypeName(TypeName::Primitive(PrimitiveType::Null)), + Schema::Complex(ComplexType::Map(Map { + values: Box::new(Schema::TypeName(TypeName::Primitive( + PrimitiveType::Bytes + ))), + attributes: Default::default(), + })), + ]), + default: None, + } + ], + attributes: Default::default(), + })) + ); + } +} diff --git a/arrow-buffer/src/bigint/div.rs b/arrow-buffer/src/bigint/div.rs index ba530ffcc6c8..e1b2ed4f8aa5 100644 --- a/arrow-buffer/src/bigint/div.rs +++ b/arrow-buffer/src/bigint/div.rs @@ -26,10 +26,7 @@ /// # Panics /// /// Panics if divisor is zero -pub fn div_rem( - numerator: &[u64; N], - divisor: &[u64; N], -) -> ([u64; N], [u64; N]) { +pub fn div_rem(numerator: &[u64; N], divisor: &[u64; N]) -> ([u64; N], [u64; N]) { let numerator_bits = bits(numerator); let divisor_bits = bits(divisor); assert_ne!(divisor_bits, 0, "division by zero"); @@ -61,10 +58,7 @@ fn bits(arr: &[u64]) -> usize { } /// Division of numerator by a u64 divisor -fn div_rem_small( - numerator: &[u64; N], - divisor: u64, -) -> ([u64; N], [u64; N]) { +fn div_rem_small(numerator: &[u64; N], divisor: u64) -> ([u64; N], [u64; N]) { let mut rem = 0u64; let mut numerator = *numerator; numerator.iter_mut().rev().for_each(|d| { @@ -227,11 +221,7 @@ fn sub_assign(a: &mut [u64], b: &[u64]) -> bool { } /// Converts an overflowing binary operation on scalars to one on slices -fn binop_slice( - a: &mut [u64], - b: &[u64], - binop: impl Fn(u64, u64) -> (u64, bool) + Copy, -) -> bool { +fn binop_slice(a: &mut [u64], b: &[u64], binop: impl Fn(u64, u64) -> (u64, bool) + Copy) -> bool { let mut c = false; a.iter_mut().zip(b.iter()).for_each(|(x, y)| { let (res1, overflow1) = y.overflowing_add(u64::from(c)); diff --git a/arrow-buffer/src/bigint/mod.rs b/arrow-buffer/src/bigint/mod.rs index d064663bf63a..afbb3a31df12 100644 --- a/arrow-buffer/src/bigint/mod.rs +++ b/arrow-buffer/src/bigint/mod.rs @@ -310,9 +310,7 @@ impl i256 { (Self::from_le_bytes(bytes), false) } Ordering::Equal => (Self::from_le_bytes(v_bytes.try_into().unwrap()), false), - Ordering::Greater => { - (Self::from_le_bytes(v_bytes[..32].try_into().unwrap()), true) - } + Ordering::Greater => (Self::from_le_bytes(v_bytes[..32].try_into().unwrap()), true), } } @@ -357,8 +355,7 @@ impl i256 { #[inline] pub fn checked_add(self, other: Self) -> Option { let r = self.wrapping_add(other); - ((other.is_negative() && r < self) || (!other.is_negative() && r >= self)) - .then_some(r) + ((other.is_negative() && r < self) || (!other.is_negative() && r >= self)).then_some(r) } /// Performs wrapping subtraction @@ -373,8 +370,7 @@ impl i256 { #[inline] pub fn checked_sub(self, other: Self) -> Option { let r = self.wrapping_sub(other); - ((other.is_negative() && r > self) || (!other.is_negative() && r <= self)) - .then_some(r) + ((other.is_negative() && r > self) || (!other.is_negative() && r <= self)).then_some(r) } /// Performs wrapping multiplication @@ -591,9 +587,7 @@ impl i256 { /// Temporary workaround due to lack of stable const array slicing /// See -const fn split_array( - vals: [u8; N], -) -> ([u8; M], [u8; M]) { +const fn split_array(vals: [u8; N]) -> ([u8; M], [u8; M]) { let mut a = [0; M]; let mut b = [0; M]; let mut i = 0; @@ 
-915,8 +909,7 @@ mod tests { // Addition let actual = il.wrapping_add(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() + br.clone()); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone() + br.clone()); assert_eq!(actual, expected); let checked = il.checked_add(ir); @@ -927,8 +920,7 @@ mod tests { // Subtraction let actual = il.wrapping_sub(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() - br.clone()); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone() - br.clone()); assert_eq!(actual.to_string(), expected.to_string()); let checked = il.checked_sub(ir); @@ -939,8 +931,7 @@ mod tests { // Multiplication let actual = il.wrapping_mul(ir); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone() * br.clone()); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone() * br.clone()); assert_eq!(actual.to_string(), expected.to_string()); let checked = il.checked_mul(ir); @@ -996,8 +987,7 @@ mod tests { // Exponentiation for exp in vec![0, 1, 2, 3, 8, 100].into_iter() { let actual = il.wrapping_pow(exp); - let (expected, overflow) = - i256::from_bigint_with_overflow(bl.clone().pow(exp)); + let (expected, overflow) = i256::from_bigint_with_overflow(bl.clone().pow(exp)); assert_eq!(actual.to_string(), expected.to_string()); let checked = il.checked_pow(exp); @@ -1212,7 +1202,10 @@ mod tests { ("000000000000000000000000000000000000000", Some(i256::ZERO)), ("0000000000000000000000000000000000000000-11", None), ("11-1111111111111111111111111111111111111", None), - ("115792089237316195423570985008687907853269984665640564039457584007913129639936", None) + ( + "115792089237316195423570985008687907853269984665640564039457584007913129639936", + None, + ), ]; for (case, expected) in cases { assert_eq!(i256::from_string(case), expected) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 577c716e4bea..c651edcad92e 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -223,13 +223,7 @@ impl BitAnd<&BooleanBuffer> for &BooleanBuffer { fn bitand(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_and( - &self.buffer, - self.offset, - &rhs.buffer, - rhs.offset, - self.len, - ), + buffer: buffer_bin_and(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), offset: 0, len: self.len, } @@ -242,13 +236,7 @@ impl BitOr<&BooleanBuffer> for &BooleanBuffer { fn bitor(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_or( - &self.buffer, - self.offset, - &rhs.buffer, - rhs.offset, - self.len, - ), + buffer: buffer_bin_or(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), offset: 0, len: self.len, } @@ -261,13 +249,7 @@ impl BitXor<&BooleanBuffer> for &BooleanBuffer { fn bitxor(self, rhs: &BooleanBuffer) -> Self::Output { assert_eq!(self.len, rhs.len); BooleanBuffer { - buffer: buffer_bin_xor( - &self.buffer, - self.offset, - &rhs.buffer, - rhs.offset, - self.len, - ), + buffer: buffer_bin_xor(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len), offset: 0, len: self.len, } @@ -428,8 +410,7 @@ mod tests { let buf = Buffer::from(&[0, 1, 1, 0, 0]); let boolean_buf = &BooleanBuffer::new(buf, offset, len); - let expected = - BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 255]), offset, len); + let expected = BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 
255]), offset, len); assert_eq!(!boolean_buf, expected); } } diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 8296d3fbcc31..05530eed9b08 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -323,6 +323,14 @@ impl Buffer { length, }) } + + /// Returns true if this [`Buffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.ptr == other.ptr && self.length == other.length + } } /// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly @@ -515,9 +523,7 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_slice_offset_out_of_bound() { let buf = Buffer::from(&[2, 4, 6, 8, 10]); buf.slice(6); @@ -680,9 +686,7 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn slice_overflow() { let buffer = Buffer::from(MutableBuffer::from_len_zeroed(12)); buffer.slice_with_length(2, usize::MAX); diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 2c56f9a5b270..69c986cc1056 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -334,9 +334,7 @@ impl MutableBuffer { #[inline] pub(super) fn into_buffer(self) -> Buffer { - let bytes = unsafe { - Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) - }; + let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) }; std::mem::forget(self); Buffer::from_bytes(bytes) } @@ -351,8 +349,7 @@ impl MutableBuffer { // SAFETY // ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect // implementation outside this crate, and this method checks alignment - let (prefix, offsets, suffix) = - unsafe { self.as_slice_mut().align_to_mut::() }; + let (prefix, offsets, suffix) = unsafe { self.as_slice_mut().align_to_mut::() }; assert!(prefix.is_empty() && suffix.is_empty()); offsets } @@ -604,9 +601,7 @@ impl MutableBuffer { // we can't specialize `extend` for `TrustedLen` like `Vec` does. // 2. `from_trusted_len_iter_bool` is faster. #[inline] - pub unsafe fn from_trusted_len_iter_bool>( - mut iterator: I, - ) -> Self { + pub unsafe fn from_trusted_len_iter_bool>(mut iterator: I) -> Self { let (_, upper) = iterator.size_hint(); let len = upper.expect("from_trusted_len_iter requires an upper limit"); diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index e0c7d9ef8f49..c79aef398059 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -71,10 +71,7 @@ impl NullBuffer { /// This is commonly used by binary operations where the result is NULL if either /// of the input values is NULL. 
Handling the null mask separately in this way /// can yield significant performance improvements over an iterator approach - pub fn union( - lhs: Option<&NullBuffer>, - rhs: Option<&NullBuffer>, - ) -> Option { + pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option { match (lhs, rhs) { (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())), (Some(n), None) | (None, Some(n)) => Some(n.clone()), diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index fede32c57924..652d30c3b0ab 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -148,6 +148,14 @@ impl OffsetBuffer { pub fn slice(&self, offset: usize, len: usize) -> Self { Self(self.0.slice(offset, len.saturating_add(1))) } + + /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.0.ptr_eq(&other.0) + } } impl Deref for OffsetBuffer { @@ -211,8 +219,7 @@ mod tests { assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]); let half_max = i32::MAX / 2; - let buffer = - OffsetBuffer::::from_lengths([half_max as usize, half_max as usize]); + let buffer = OffsetBuffer::::from_lengths([half_max as usize, half_max as usize]); assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]); } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index eccff6280dd8..ca00e41bea21 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -184,10 +184,6 @@ pub fn buffer_bin_xor( /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. -pub fn buffer_unary_not( - left: &Buffer, - offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { +pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a) } diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs index 29c0f3dfd949..3dbbe344a025 100644 --- a/arrow-buffer/src/buffer/run.rs +++ b/arrow-buffer/src/buffer/run.rs @@ -110,11 +110,7 @@ where /// /// - `buffer` must contain strictly increasing values greater than zero /// - The last value of `buffer` must be greater than or equal to `offset + len` - pub unsafe fn new_unchecked( - run_ends: ScalarBuffer, - offset: usize, - len: usize, - ) -> Self { + pub unsafe fn new_unchecked(run_ends: ScalarBuffer, offset: usize, len: usize) -> Self { Self { run_ends, offset, diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 70c86f11866d..3b75d5384046 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -86,6 +86,14 @@ impl ScalarBuffer { pub fn into_inner(self) -> Buffer { self.buffer } + + /// Returns true if this [`ScalarBuffer`] is equal to `other`, using pointer comparisons + /// to determine buffer equality. 
This is cheaper than `PartialEq::eq` but may + /// return false when the arrays are logically equal + #[inline] + pub fn ptr_eq(&self, other: &Self) -> bool { + self.buffer.ptr_eq(&other.buffer) + } } impl Deref for ScalarBuffer { @@ -213,9 +221,7 @@ mod tests { } #[test] - #[should_panic( - expected = "Memory pointer is not aligned with the specified scalar type" - )] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] fn test_unaligned() { let expected = [0_i32, 1, 2]; let buffer = Buffer::from_iter(expected.iter().cloned()); @@ -224,18 +230,14 @@ mod tests { } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_length_out_of_bounds() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::<i32>::new(buffer, 1, 3); } #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] + #[should_panic(expected = "the offset of the new Buffer cannot exceed the existing length")] fn test_offset_out_of_bounds() { let buffer = Buffer::from_iter([0_i32, 1, 2]); ScalarBuffer::<i32>::new(buffer, 4, 0); diff --git a/arrow-buffer/src/builder/boolean.rs b/arrow-buffer/src/builder/boolean.rs index f0e7f0f13670..ca178ae5ce4e 100644 --- a/arrow-buffer/src/builder/boolean.rs +++ b/arrow-buffer/src/builder/boolean.rs @@ -154,14 +154,12 @@ impl BooleanBufferBuilder { if cur_remainder != 0 { // Pad last byte with 1s - *self.buffer.as_slice_mut().last_mut().unwrap() |= - !((1 << cur_remainder) - 1) + *self.buffer.as_slice_mut().last_mut().unwrap() |= !((1 << cur_remainder) - 1) } self.buffer.resize(new_len_bytes, 0xFF); if new_remainder != 0 { // Clear remaining bits - *self.buffer.as_slice_mut().last_mut().unwrap() &= - (1 << new_remainder) - 1 + *self.buffer.as_slice_mut().last_mut().unwrap() &= (1 << new_remainder) - 1 } self.len = new_len; } diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 8f5019d5a4cc..81860b604868 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -60,11 +60,7 @@ impl Bytes { /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed.
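// Illustrative arithmetic, not part of this diff: the two masks used by the
// BooleanBufferBuilder resize logic above when the last byte is partially filled.
fn main() {
    let cur_remainder = 3; // the last byte currently holds 3 valid bits
    let pad_mask: u8 = !((1u8 << cur_remainder) - 1);
    assert_eq!(pad_mask, 0b1111_1000); // sets the unused high bits to 1 before the 0xFF fill

    let new_remainder = 5; // after the resize, the new last byte holds 5 valid bits
    let clear_mask: u8 = (1u8 << new_remainder) - 1;
    assert_eq!(clear_mask, 0b0001_1111); // clears the bits past the new length
}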
#[inline] - pub(crate) unsafe fn new( - ptr: NonNull<u8>, - len: usize, - deallocation: Deallocation, - ) -> Bytes { + pub(crate) unsafe fn new(ptr: NonNull<u8>, len: usize, deallocation: Deallocation) -> Bytes { Bytes { ptr, len, diff --git a/arrow-buffer/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs index 6830acae94a1..9e4fb8268dff 100644 --- a/arrow-buffer/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -60,8 +60,7 @@ impl<'a> UnalignedBitChunk<'a> { // If less than 8 bytes, read into prefix if buffer.len() <= 8 { - let (suffix_mask, trailing_padding) = - compute_suffix_mask(len, offset_padding); + let (suffix_mask, trailing_padding) = compute_suffix_mask(len, offset_padding); let prefix = read_u64(buffer) & suffix_mask & prefix_mask; return Self { @@ -75,8 +74,7 @@ impl<'a> UnalignedBitChunk<'a> { // If less than 16 bytes, read into prefix and suffix if buffer.len() <= 16 { - let (suffix_mask, trailing_padding) = - compute_suffix_mask(len, offset_padding); + let (suffix_mask, trailing_padding) = compute_suffix_mask(len, offset_padding); let prefix = read_u64(&buffer[..8]) & prefix_mask; let suffix = read_u64(&buffer[8..]) & suffix_mask; @@ -167,10 +165,7 @@ impl<'a> UnalignedBitChunk<'a> { } pub type UnalignedBitChunkIterator<'a> = std::iter::Chain< - std::iter::Chain< - std::option::IntoIter<u64>, - std::iter::Cloned<std::slice::Iter<'a, u64>>, - >, + std::iter::Chain<std::option::IntoIter<u64>, std::iter::Cloned<std::slice::Iter<'a, u64>>>, std::option::IntoIter<u64>, >; @@ -338,9 +333,8 @@ impl Iterator for BitChunkIterator<'_> { } else { // the constructor ensures that bit_offset is in 0..8 // that means we need to read at most one additional byte to fill in the high bits - let next = unsafe { - std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 - }; + let next = + unsafe { std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64 }; (current >> bit_offset) | (next << (64 - bit_offset)) }; @@ -387,8 +381,8 @@ mod tests { #[test] fn test_iter_unaligned() { let input: &[u8] = &[ - 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, - 0b00100000, 0b01000000, 0b11111111, + 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000, + 0b01000000, 0b11111111, ]; let buffer: Buffer = Buffer::from(input); @@ -408,8 +402,8 @@ mod tests { #[test] fn test_iter_unaligned_remainder_1_byte() { let input: &[u8] = &[ - 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, - 0b00100000, 0b01000000, 0b11111111, + 0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000, 0b00100000, + 0b01000000, 0b11111111, ]; let buffer: Buffer = Buffer::from(input); @@ -442,8 +436,8 @@ mod tests { #[test] fn test_iter_unaligned_remainder_bits_large() { let input: &[u8] = &[ - 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000, - 0b11111111, 0b00000000, 0b11111111, + 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, + 0b00000000, 0b11111111, ]; let buffer: Buffer = Buffer::from(input); @@ -637,11 +631,8 @@ mod tests { let max_truncate = 128.min(mask_len - offset); let truncate = rng.gen::<usize>().checked_rem(max_truncate).unwrap_or(0); - let unaligned = UnalignedBitChunk::new( - buffer.as_slice(), - offset, - mask_len - offset - truncate, - ); + let unaligned = + UnalignedBitChunk::new(buffer.as_slice(), offset, mask_len - offset - truncate); let bool_slice = &bools[offset..mask_len - truncate]; diff --git a/arrow-buffer/src/util/bit_iterator.rs b/arrow-buffer/src/util/bit_iterator.rs index
4e24ccdabec0..df40a8fbaccb 100644 --- a/arrow-buffer/src/util/bit_iterator.rs +++ b/arrow-buffer/src/util/bit_iterator.rs @@ -276,8 +276,8 @@ mod tests { assert_eq!( actual, &[ - false, true, false, false, true, false, true, false, false, false, false, - false, true, false + false, true, false, false, true, false, true, false, false, false, false, false, + true, false ] ); diff --git a/arrow-buffer/src/util/bit_mask.rs b/arrow-buffer/src/util/bit_mask.rs index 2af24b782632..8f81cb7d0469 100644 --- a/arrow-buffer/src/util/bit_mask.rs +++ b/arrow-buffer/src/util/bit_mask.rs @@ -42,8 +42,7 @@ pub fn set_bits( let chunks = BitChunks::new(data, offset_read + bits_to_align, len - bits_to_align); chunks.iter().for_each(|chunk| { null_count += chunk.count_zeros(); - write_data[write_byte_index..write_byte_index + 8] - .copy_from_slice(&chunk.to_le_bytes()); + write_data[write_byte_index..write_byte_index + 8].copy_from_slice(&chunk.to_le_bytes()); write_byte_index += 8; }); @@ -70,8 +69,8 @@ mod tests { fn test_set_bits_aligned() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, ]; let destination_offset = 8; @@ -80,8 +79,8 @@ mod tests { let len = 64; let expected_data: &[u8] = &[ - 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, 0, + 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, 0, ]; let expected_null_count = 24; let result = set_bits( @@ -100,8 +99,8 @@ mod tests { fn test_set_bits_unaligned_destination_start() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, ]; let destination_offset = 3; @@ -110,8 +109,8 @@ mod tests { let len = 64; let expected_data: &[u8] = &[ - 0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110, 0b00011111, - 0b00111110, 0b00101111, 0b00000101, 0b00000000, + 0b00111000, 0b00101111, 0b11001101, 0b11011100, 0b01011110, 0b00011111, 0b00111110, + 0b00101111, 0b00000101, 0b00000000, ]; let expected_null_count = 24; let result = set_bits( @@ -130,8 +129,8 @@ mod tests { fn test_set_bits_unaligned_destination_end() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, ]; let destination_offset = 8; @@ -140,8 +139,8 @@ mod tests { let len = 62; let expected_data: &[u8] = &[ - 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b00100101, 0, + 0, 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b00100101, 0, ]; let expected_null_count = 23; let result = set_bits( @@ -160,9 +159,9 @@ mod tests { fn test_set_bits_unaligned() { let mut destination: Vec = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; let source: &[u8] = &[ - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, - 0b11100111, 0b10100101, 
0b10011001, 0b11011011, 0b11101011, 0b11000011, + 0b11100111, 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, + 0b10100101, 0b10011001, 0b11011011, 0b11101011, 0b11000011, 0b11100111, 0b10100101, + 0b10011001, 0b11011011, 0b11101011, 0b11000011, ]; let destination_offset = 3; @@ -171,9 +170,8 @@ mod tests { let len = 95; let expected_data: &[u8] = &[ - 0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, - 0b01111001, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, - 0b00000001, + 0b01111000, 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, 0b01111001, + 0b01101001, 0b11100110, 0b11110110, 0b11111010, 0b11110000, 0b00000001, ]; let expected_null_count = 35; let result = set_bits( diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 2758a4817814..19b857297d14 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -45,15 +45,17 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { workspace = true } half = { version = "2.1", default-features = false } num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } comfy-table = { version = "7.0", optional = true, default-features = false } +base64 = "0.21" [dev-dependencies] criterion = { version = "0.5", default-features = false } half = { version = "2.1", default-features = false } +rand = "0.8" [build-dependencies] diff --git a/arrow-cast/src/base64.rs b/arrow-cast/src/base64.rs new file mode 100644 index 000000000000..e109c8112480 --- /dev/null +++ b/arrow-cast/src/base64.rs @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Functions for Base64 encoding/decoding + +use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait}; +use arrow_buffer::OffsetBuffer; +use arrow_schema::ArrowError; +use base64::encoded_len; +use base64::engine::Config; + +pub use base64::prelude::*; + +/// Base64 encode each element of `array` with the provided `engine` +pub fn b64_encode<E: Engine, O: OffsetSizeTrait>( + engine: &E, + array: &GenericBinaryArray<O>, +) -> GenericStringArray<O> { + let lengths = array.offsets().windows(2).map(|w| { + let len = w[1].as_usize() - w[0].as_usize(); + encoded_len(len, engine.config().encode_padding()).unwrap() + }); + let offsets = OffsetBuffer::<O>::from_lengths(lengths); + let buffer_len = offsets.last().unwrap().as_usize(); + let mut buffer = vec![0_u8; buffer_len]; + let mut offset = 0; + + for i in 0..array.len() { + let len = engine + .encode_slice(array.value(i), &mut buffer[offset..]) + .unwrap(); + offset += len; + } + assert_eq!(offset, buffer_len); + + // Safety: Base64 is valid UTF-8 + unsafe { GenericStringArray::new_unchecked(offsets, buffer.into(), array.nulls().cloned()) } +} + +/// Base64 decode each element of `array` with the provided `engine` +pub fn b64_decode<E: Engine, O: OffsetSizeTrait>( + engine: &E, + array: &GenericBinaryArray<O>, +) -> Result<GenericBinaryArray<O>, ArrowError> { + let estimated_len = array.values().len(); // This is an overestimate + let mut buffer = vec![0; estimated_len]; + + let mut offsets = Vec::with_capacity(array.len() + 1); + offsets.push(O::usize_as(0)); + let mut offset = 0; + + for v in array.iter() { + if let Some(v) = v { + let len = engine.decode_slice(v, &mut buffer[offset..]).unwrap(); + // This cannot overflow as `len` is less than `v.len()` and `a` is valid + offset += len; + } + offsets.push(O::usize_as(offset)); + } + + // Safety: offsets monotonically increasing by construction + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + Ok(GenericBinaryArray::new( + offsets, + buffer.into(), + array.nulls().cloned(), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::BinaryArray; + use base64::prelude::{BASE64_STANDARD, BASE64_STANDARD_NO_PAD}; + use rand::{thread_rng, Rng}; + + fn test_engine<E: Engine>(e: &E, a: &BinaryArray) { + let encoded = b64_encode(e, a); + encoded.to_data().validate_full().unwrap(); + + let to_decode = encoded.into(); + let decoded = b64_decode(e, &to_decode).unwrap(); + decoded.to_data().validate_full().unwrap(); + + assert_eq!(&decoded, a); + } + + #[test] + fn test_b64() { + let mut rng = thread_rng(); + let len = rng.gen_range(1024..1050); + let data: BinaryArray = (0..len) + .map(|_| { + let len = rng.gen_range(0..16); + Some((0..len).map(|_| rng.gen()).collect::<Vec<_>>()) + }) + .collect(); + + test_engine(&BASE64_STANDARD, &data); + test_engine(&BASE64_STANDARD_NO_PAD, &data); + } +} diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 7b8e6144bb49..8facb4f161f4 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -46,10 +46,9 @@ use crate::parse::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, string_to_datetime, Parser, }; -use arrow_array::{ - builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, -}; -use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; +use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; +use arrow_buffer::{i256, ArrowNativeType, OffsetBuffer}; +use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -74,10 +73,9 @@ impl<'a>
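// Illustrative usage sketch, not part of this diff, of the b64_encode / b64_decode
// functions added above; assumes the new module is exported as arrow_cast::base64.
use arrow_array::BinaryArray;
use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};

fn main() {
    let input = BinaryArray::from_opt_vec(vec![Some(b"hello".as_slice()), None]);

    let encoded = b64_encode(&BASE64_STANDARD, &input);
    assert_eq!(encoded.value(0), "aGVsbG8="); // standard alphabet, with padding

    // Round-trip: the StringArray converts back to a BinaryArray for decoding
    let decoded = b64_decode(&BASE64_STANDARD, &encoded.into()).unwrap();
    assert_eq!(&decoded, &input);
}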
Default for CastOptions<'a> { } } -/// Return true if a value of type `from_type` can be cast into a -/// value of `to_type`. Note that such as cast may be lossy. +/// Return true if a value of type `from_type` can be cast into a value of `to_type`. /// -/// If this function returns true to stay consistent with the `cast` kernel below. +/// See [`cast_with_options`] for more information pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { use self::DataType::*; use self::IntervalUnit::*; @@ -125,21 +123,15 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), - (LargeList(list_from), LargeList(list_to)) => { - can_cast_types(list_from.data_type(), list_to.data_type()) - } - (List(list_from), List(list_to)) => { + (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) } - (List(list_from), LargeList(list_to)) => { - list_from.data_type() == list_to.data_type() - } - (LargeList(list_from), List(list_to)) => { - list_from.data_type() == list_to.data_type() - } (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => { can_cast_types(list_from.data_type(), to_type) } + (List(list_from) | LargeList(list_from), FixedSizeList(list_to, _)) => { + can_cast_types(list_from.data_type(), list_to.data_type()) + } (List(_), _) => false, (FixedSizeList(list_from,_), List(list_to)) => { list_from.data_type() == list_to.data_type() @@ -168,17 +160,16 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, // Utf8 to decimal (Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, - (Decimal128(_, _) | Decimal256(_, _), _) => false, - (_, Decimal128(_, _) | Decimal256(_, _)) => false, (Struct(_), _) => false, (_, Struct(_)) => false, (_, Boolean) => { - DataType::is_numeric(from_type) + DataType::is_integer(from_type) || + DataType::is_floating(from_type) || from_type == &Utf8 || from_type == &LargeUtf8 } (Boolean, _) => { - DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 + DataType::is_integer(to_type) || DataType::is_floating(to_type) || to_type == &Utf8 || to_type == &LargeUtf8 } (Binary, LargeBinary | Utf8 | LargeUtf8 | FixedSizeBinary(_)) => true, @@ -205,6 +196,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), + (_, Binary | LargeBinary) => from_type.is_integer(), + // start numeric casts ( UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, @@ -227,8 +220,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Time64(_), Time32(to_unit)) => { matches!(to_unit, Second | Millisecond) } - (Timestamp(_, _), Int64) => true, - (Int64, Timestamp(_, _)) => true, + (Timestamp(_, _), _) if to_type.is_numeric() && to_type != &Float16 => true, + (_, Timestamp(_, _)) if from_type.is_numeric() && from_type != &Float16 => true, (Date64, Timestamp(_, None)) => true, (Date32, Timestamp(_, None)) => true, ( @@ -268,30 +261,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } } -/// Cast `array` to the provided data type and return a new Array with -/// type `to_type`, if possible. 
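// Illustrative sketch, not part of this diff: two pairs newly accepted by
// can_cast_types after this change.
use arrow_cast::can_cast_types;
use arrow_schema::{DataType, Field, TimeUnit};
use std::sync::Arc;

fn main() {
    let item = Arc::new(Field::new("item", DataType::Int32, true));

    // List -> FixedSizeList is now supported when the value types are castable
    assert!(can_cast_types(
        &DataType::List(item.clone()),
        &DataType::FixedSizeList(item.clone(), 3),
    ));

    // Timestamps now cast to any numeric type other than Float16
    assert!(can_cast_types(
        &DataType::Timestamp(TimeUnit::Millisecond, None),
        &DataType::Float64,
    ));
}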
+/// Cast `array` to the provided data type and return a new Array with type `to_type`, if possible. /// -/// Behavior: -/// * Boolean to Utf8: `true` => '1', `false` => `0` -/// * Utf8 to boolean: `true`, `yes`, `on`, `1` => `true`, `false`, `no`, `off`, `0` => `false`, -/// short variants are accepted, other strings return null or error -/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings -/// in integer casts return null -/// * Numeric to boolean: 0 returns `false`, any other value returns `true` -/// * List to List: the underlying data type is cast -/// * Primitive to List: a list array with 1 value per slot is created -/// * Date32 and Date64: precision lost when going to higher interval -/// * Time32 and Time64: precision lost when going to higher interval -/// * Timestamp and Date{32|64}: precision lost when going to higher interval -/// * Temporal to/from backing primitive: zero-copy with data type change -/// * Casting from `float32/float64` to `Decimal(precision, scale)` rounds to the `scale` decimals -/// (i.e. casting 6.4999 to Decimal(10, 1) becomes 6.5). This is the breaking change from `26.0.0`. -/// It used to truncate it instead of round (i.e. outputs 6.4 instead) -/// -/// Unsupported Casts -/// * To or from `StructArray` -/// * List to primitive -/// * Interval and duration +/// See [`cast_with_options`] for more information pub fn cast(array: &dyn Array, to_type: &DataType) -> Result { cast_with_options(array, to_type, &CastOptions::default()) } @@ -364,21 +336,32 @@ where if cast_options.safe { array - .unary_opt::<_, Decimal128Type>(|v| (mul * v.as_()).round().to_i128()) + .unary_opt::<_, Decimal128Type>(|v| { + (mul * v.as_()) + .round() + .to_i128() + .filter(|v| Decimal128Type::validate_decimal_precision(*v, precision).is_ok()) + }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { array .try_unary::<_, Decimal128Type, _>(|v| { - (mul * v.as_()).round().to_i128().ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - precision, - scale, - v - )) - }) + (mul * v.as_()) + .round() + .to_i128() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + Decimal128Type::PREFIX, + precision, + scale, + v + )) + }) + .and_then(|v| { + Decimal128Type::validate_decimal_precision(v, precision).map(|_| v) + }) })? .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) @@ -398,21 +381,28 @@ where if cast_options.safe { array - .unary_opt::<_, Decimal256Type>(|v| i256::from_f64((v.as_() * mul).round())) + .unary_opt::<_, Decimal256Type>(|v| { + i256::from_f64((v.as_() * mul).round()) + .filter(|v| Decimal256Type::validate_decimal_precision(*v, precision).is_ok()) + }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { array .try_unary::<_, Decimal256Type, _>(|v| { - i256::from_f64((v.as_() * mul).round()).ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal256Type::PREFIX, - precision, - scale, - v - )) - }) + i256::from_f64((v.as_() * mul).round()) + .ok_or_else(|| { + ArrowError::CastError(format!( + "Cannot cast to {}({}, {}). Overflowing on {:?}", + Decimal256Type::PREFIX, + precision, + scale, + v + )) + }) + .and_then(|v| { + Decimal256Type::validate_decimal_precision(v, precision).map(|_| v) + }) })? 
.with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) @@ -473,7 +463,10 @@ fn cast_month_day_nano_to_duration>( .map(|v| { v.map(|v| match v >> 64 { 0 => Ok((v as i64) / scale), - _ => Err(ArrowError::ComputeError("Cannot convert interval containing non-zero months or days to duration".to_string())) + _ => Err(ArrowError::ComputeError( + "Cannot convert interval containing non-zero months or days to duration" + .to_string(), + )), }) .transpose() }) @@ -539,10 +532,7 @@ fn cast_duration_to_interval>( } /// Cast the primitive array using [`PrimitiveArray::reinterpret_cast`] -fn cast_reinterpret_arrays< - I: ArrowPrimitiveType, - O: ArrowPrimitiveType, ->( +fn cast_reinterpret_arrays>( array: &dyn Array, ) -> Result { Ok(Arc::new(array.as_primitive::().reinterpret_cast::())) @@ -593,14 +583,13 @@ where } else { let v = array.value(i).div_checked(div)?; - let value = - ::from::(v).ok_or_else(|| { - ArrowError::CastError(format!( - "value of {:?} is out of range {}", - v, - T::DATA_TYPE - )) - })?; + let value = ::from::(v).ok_or_else(|| { + ArrowError::CastError(format!( + "value of {:?} is out of range {}", + v, + T::DATA_TYPE + )) + })?; value_builder.append_value(value); } @@ -669,11 +658,11 @@ fn as_time_res_with_timezone( }) } -/// Cast `array` to the provided data type and return a new Array with -/// type `to_type`, if possible. It accepts `CastOptions` to allow consumers -/// to configure cast behavior. +/// Cast `array` to the provided data type and return a new Array with type `to_type`, if possible. +/// +/// Accepts [`CastOptions`] to specify cast behavior. /// -/// Behavior: +/// ## Behavior /// * Boolean to Utf8: `true` => '1', `false` => `0` /// * Utf8 to boolean: `true`, `yes`, `on`, `1` => `true`, `false`, `no`, `off`, `0` => `false`, /// short variants are accepted, other strings return null or error @@ -681,15 +670,21 @@ fn as_time_res_with_timezone( /// in integer casts return null /// * Numeric to boolean: 0 returns `false`, any other value returns `true` /// * List to List: the underlying data type is cast +/// * List to FixedSizeList: the underlying data type is cast. If safe is true and a list element +/// has the wrong length it will be replaced with NULL, otherwise an error will be returned /// * Primitive to List: a list array with 1 value per slot is created /// * Date32 and Date64: precision lost when going to higher interval /// * Time32 and Time64: precision lost when going to higher interval /// * Timestamp and Date{32|64}: precision lost when going to higher interval /// * Temporal to/from backing primitive: zero-copy with data type change +/// * Casting from `float32/float64` to `Decimal(precision, scale)` rounds to the `scale` decimals +/// (i.e. casting `6.4999` to Decimal(10, 1) becomes `6.5`). Prior to version `26.0.0`, +/// casting would truncate instead (i.e. 
outputs `6.4` instead) /// /// Unsupported Casts /// * To or from `StructArray` /// * List to primitive +/// * Interval and duration pub fn cast_with_options( array: &dyn Array, to_type: &DataType, @@ -760,29 +755,17 @@ pub fn cast_with_options( "Casting from type {from_type:?} to dictionary type {to_type:?} not supported", ))), }, - (List(_), List(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (LargeList(_), LargeList(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (List(list_from), LargeList(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast list to large-list with different child data".into(), - )) - } else { - cast_list_container::(array, cast_options) - } - } - (LargeList(list_from), List(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast large-list to list with different child data".into(), - )) - } else { - cast_list_container::(array, cast_options) - } + (List(_), List(to)) => cast_list_values::(array, to, cast_options), + (LargeList(_), LargeList(to)) => cast_list_values::(array, to, cast_options), + (List(_), LargeList(list_to)) => cast_list::(array, list_to, cast_options), + (LargeList(_), List(list_to)) => cast_list::(array, list_to, cast_options), + (List(_), FixedSizeList(field, size)) => { + let array = array.as_list::(); + cast_list_to_fixed_size_list::(array, field, *size, cast_options) + } + (LargeList(_), FixedSizeList(field, size)) => { + let array = array.as_list::(); + cast_list_to_fixed_size_list::(array, field, *size, cast_options) } (List(_) | LargeList(_), _) => match to_type { Utf8 => value_to_string::(array, cast_options), @@ -809,7 +792,6 @@ pub fn cast_with_options( cast_fixed_size_list_to_list::(array) } } - (_, List(ref to)) => cast_values_to_list::(array, to, cast_options), (_, LargeList(ref to)) => cast_values_to_list::(array, to, cast_options), (Decimal128(_, s1), Decimal128(p2, s2)) => { @@ -848,7 +830,7 @@ pub fn cast_with_options( cast_options, ) } - (Decimal128(_, scale), _) => { + (Decimal128(_, scale), _) if !to_type.is_temporal() => { // cast decimal to other type match to_type { UInt8 => cast_decimal_to_integer::( @@ -899,16 +881,12 @@ pub fn cast_with_options( *scale, cast_options, ), - Float32 => { - cast_decimal_to_float::(array, |x| { - (x as f64 / 10_f64.powi(*scale as i32)) as f32 - }) - } - Float64 => { - cast_decimal_to_float::(array, |x| { - x as f64 / 10_f64.powi(*scale as i32) - }) - } + Float32 => cast_decimal_to_float::(array, |x| { + (x as f64 / 10_f64.powi(*scale as i32)) as f32 + }), + Float64 => cast_decimal_to_float::(array, |x| { + x as f64 / 10_f64.powi(*scale as i32) + }), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -917,7 +895,7 @@ pub fn cast_with_options( ))), } } - (Decimal256(_, scale), _) => { + (Decimal256(_, scale), _) if !to_type.is_temporal() => { // cast decimal to other type match to_type { UInt8 => cast_decimal_to_integer::( @@ -968,16 +946,12 @@ pub fn cast_with_options( *scale, cast_options, ), - Float32 => { - cast_decimal_to_float::(array, |x| { - (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f32 - }) - } - Float64 => { - cast_decimal_to_float::(array, |x| { - x.to_f64().unwrap() / 10_f64.powi(*scale as i32) - }) - } + Float32 => cast_decimal_to_float::(array, |x| { + (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f32 
+ }), + Float64 => cast_decimal_to_float::(array, |x| { + x.to_f64().unwrap() / 10_f64.powi(*scale as i32) + }), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -986,7 +960,7 @@ pub fn cast_with_options( ))), } } - (_, Decimal128(precision, scale)) => { + (_, Decimal128(precision, scale)) if !from_type.is_temporal() => { // cast data to decimal match from_type { UInt8 => cast_integer_to_decimal::<_, Decimal128Type, _>( @@ -1075,7 +1049,7 @@ pub fn cast_with_options( ))), } } - (_, Decimal256(precision, scale)) => { + (_, Decimal256(precision, scale)) if !from_type.is_temporal() => { // cast data to decimal match from_type { UInt8 => cast_integer_to_decimal::<_, Decimal256Type, _>( @@ -1219,25 +1193,35 @@ pub fn cast_with_options( Float64 => parse_string::(array, cast_options), Date32 => parse_string::(array, cast_options), Date64 => parse_string::(array, cast_options), - Binary => Ok(Arc::new(BinaryArray::from(array.as_string::().clone()))), + Binary => Ok(Arc::new(BinaryArray::from( + array.as_string::().clone(), + ))), LargeBinary => { let binary = BinaryArray::from(array.as_string::().clone()); cast_byte_container::(&binary) } LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => parse_string::(array, cast_options), - Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), - Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time32(TimeUnit::Millisecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Millisecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Microsecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Nanosecond) => { + parse_string::(array, cast_options) } + Timestamp(TimeUnit::Second, to_tz) => { + cast_string_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Millisecond, to_tz) => cast_string_to_timestamp::< + i32, + TimestampMillisecondType, + >(array, to_tz, cast_options), + Timestamp(TimeUnit::Microsecond, to_tz) => cast_string_to_timestamp::< + i32, + TimestampMicrosecondType, + >(array, to_tz, cast_options), Timestamp(TimeUnit::Nanosecond, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -1269,26 +1253,33 @@ pub fn cast_with_options( Date64 => parse_string::(array, cast_options), Utf8 => cast_byte_container::(array), Binary => { - let large_binary = - LargeBinaryArray::from(array.as_string::().clone()); + let large_binary = LargeBinaryArray::from(array.as_string::().clone()); cast_byte_container::(&large_binary) } LargeBinary => Ok(Arc::new(LargeBinaryArray::from( array.as_string::().clone(), ))), Time32(TimeUnit::Second) => parse_string::(array, cast_options), - Time32(TimeUnit::Millisecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Microsecond) => parse_string::(array, cast_options), - Time64(TimeUnit::Nanosecond) => parse_string::(array, cast_options), - Timestamp(TimeUnit::Second, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time32(TimeUnit::Millisecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Millisecond, to_tz) => { - 
cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Microsecond) => { + parse_string::(array, cast_options) } - Timestamp(TimeUnit::Microsecond, to_tz) => { - cast_string_to_timestamp::(array, to_tz, cast_options) + Time64(TimeUnit::Nanosecond) => { + parse_string::(array, cast_options) } + Timestamp(TimeUnit::Second, to_tz) => { + cast_string_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Millisecond, to_tz) => cast_string_to_timestamp::< + i64, + TimestampMillisecondType, + >(array, to_tz, cast_options), + Timestamp(TimeUnit::Microsecond, to_tz) => cast_string_to_timestamp::< + i64, + TimestampMicrosecondType, + >(array, to_tz, cast_options), Timestamp(TimeUnit::Nanosecond, to_tz) => { cast_string_to_timestamp::(array, to_tz, cast_options) } @@ -1311,9 +1302,7 @@ pub fn cast_with_options( let array = cast_binary_to_string::(array, cast_options)?; cast_byte_container::(array.as_ref()) } - LargeBinary => { - cast_byte_container::(array) - } + LargeBinary => cast_byte_container::(array), FixedSizeBinary(size) => { cast_binary_to_fixed_size_binary::(array, *size, cast_options) } @@ -1337,278 +1326,139 @@ pub fn cast_with_options( }, (FixedSizeBinary(size), _) => match to_type { Binary => cast_fixed_size_binary_to_binary::(array, *size), - LargeBinary => - cast_fixed_size_binary_to_binary::(array, *size), + LargeBinary => cast_fixed_size_binary_to_binary::(array, *size), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array, cast_options), - (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array, cast_options), + (from_type, LargeUtf8) if from_type.is_primitive() => { + value_to_string::(array, cast_options) + } + (from_type, Utf8) if from_type.is_primitive() => { + value_to_string::(array, cast_options) + } + (from_type, Binary) if from_type.is_integer() => match from_type { + UInt8 => cast_numeric_to_binary::(array), + UInt16 => cast_numeric_to_binary::(array), + UInt32 => cast_numeric_to_binary::(array), + UInt64 => cast_numeric_to_binary::(array), + Int8 => cast_numeric_to_binary::(array), + Int16 => cast_numeric_to_binary::(array), + Int32 => cast_numeric_to_binary::(array), + Int64 => cast_numeric_to_binary::(array), + _ => unreachable!(), + }, + (from_type, LargeBinary) if from_type.is_integer() => match from_type { + UInt8 => cast_numeric_to_binary::(array), + UInt16 => cast_numeric_to_binary::(array), + UInt32 => cast_numeric_to_binary::(array), + UInt64 => cast_numeric_to_binary::(array), + Int8 => cast_numeric_to_binary::(array), + Int16 => cast_numeric_to_binary::(array), + Int32 => cast_numeric_to_binary::(array), + Int64 => cast_numeric_to_binary::(array), + _ => unreachable!(), + }, // start numeric casts - (UInt8, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (UInt8, UInt16) => cast_numeric_arrays::(array, cast_options), + (UInt8, UInt32) => cast_numeric_arrays::(array, cast_options), + (UInt8, UInt64) => cast_numeric_arrays::(array, cast_options), (UInt8, Int8) => cast_numeric_arrays::(array, cast_options), - (UInt8, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, 
Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt8, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (UInt16, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt16, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (UInt32, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt32, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (UInt64, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (UInt64, Float64) => { - cast_numeric_arrays::(array, cast_options) - } + (UInt8, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt8, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt8, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt8, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt8, Float64) => cast_numeric_arrays::(array, cast_options), + + (UInt16, UInt8) => cast_numeric_arrays::(array, cast_options), + (UInt16, UInt32) => cast_numeric_arrays::(array, cast_options), + (UInt16, UInt64) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt16, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt16, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt16, Float64) => cast_numeric_arrays::(array, cast_options), + + (UInt32, UInt8) => cast_numeric_arrays::(array, cast_options), + (UInt32, UInt16) => cast_numeric_arrays::(array, cast_options), + (UInt32, UInt64) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt32, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt32, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt32, 
Float64) => cast_numeric_arrays::(array, cast_options), + + (UInt64, UInt8) => cast_numeric_arrays::(array, cast_options), + (UInt64, UInt16) => cast_numeric_arrays::(array, cast_options), + (UInt64, UInt32) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int8) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int16) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int32) => cast_numeric_arrays::(array, cast_options), + (UInt64, Int64) => cast_numeric_arrays::(array, cast_options), + (UInt64, Float32) => cast_numeric_arrays::(array, cast_options), + (UInt64, Float64) => cast_numeric_arrays::(array, cast_options), (Int8, UInt8) => cast_numeric_arrays::(array, cast_options), - (Int8, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int8, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int8, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int8, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int8, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int8, UInt64) => cast_numeric_arrays::(array, cast_options), (Int8, Int16) => cast_numeric_arrays::(array, cast_options), (Int8, Int32) => cast_numeric_arrays::(array, cast_options), (Int8, Int64) => cast_numeric_arrays::(array, cast_options), - (Int8, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int8, Float64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int8, Float32) => cast_numeric_arrays::(array, cast_options), + (Int8, Float64) => cast_numeric_arrays::(array, cast_options), - (Int16, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int16, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int16, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int16, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int16, UInt64) => cast_numeric_arrays::(array, cast_options), (Int16, Int8) => cast_numeric_arrays::(array, cast_options), - (Int16, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int16, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Int32, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int16, Int32) => cast_numeric_arrays::(array, cast_options), + (Int16, Int64) => cast_numeric_arrays::(array, cast_options), + (Int16, Float32) => cast_numeric_arrays::(array, cast_options), + (Int16, Float64) => cast_numeric_arrays::(array, cast_options), + + (Int32, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int32, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int32, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int32, UInt64) => cast_numeric_arrays::(array, cast_options), (Int32, Int8) => cast_numeric_arrays::(array, cast_options), - (Int32, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Int32, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - 
(Int32, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Int64, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } + (Int32, Int16) => cast_numeric_arrays::(array, cast_options), + (Int32, Int64) => cast_numeric_arrays::(array, cast_options), + (Int32, Float32) => cast_numeric_arrays::(array, cast_options), + (Int32, Float64) => cast_numeric_arrays::(array, cast_options), + + (Int64, UInt8) => cast_numeric_arrays::(array, cast_options), + (Int64, UInt16) => cast_numeric_arrays::(array, cast_options), + (Int64, UInt32) => cast_numeric_arrays::(array, cast_options), + (Int64, UInt64) => cast_numeric_arrays::(array, cast_options), (Int64, Int8) => cast_numeric_arrays::(array, cast_options), - (Int64, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, Float32) => { - cast_numeric_arrays::(array, cast_options) - } - (Int64, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Float32, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float32, Float64) => { - cast_numeric_arrays::(array, cast_options) - } - - (Float64, UInt8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, UInt16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, UInt32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, UInt64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int8) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int16) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int32) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Int64) => { - cast_numeric_arrays::(array, cast_options) - } - (Float64, Float32) => { - cast_numeric_arrays::(array, cast_options) - } + (Int64, Int16) => cast_numeric_arrays::(array, cast_options), + (Int64, Int32) => cast_numeric_arrays::(array, cast_options), + (Int64, Float32) => cast_numeric_arrays::(array, cast_options), + (Int64, Float64) => cast_numeric_arrays::(array, cast_options), + + (Float32, UInt8) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt16) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt32) => cast_numeric_arrays::(array, cast_options), + (Float32, UInt64) => cast_numeric_arrays::(array, cast_options), + (Float32, Int8) => cast_numeric_arrays::(array, cast_options), + (Float32, Int16) => cast_numeric_arrays::(array, cast_options), + (Float32, Int32) => cast_numeric_arrays::(array, cast_options), + (Float32, Int64) => cast_numeric_arrays::(array, cast_options), + (Float32, Float64) => cast_numeric_arrays::(array, cast_options), + + (Float64, UInt8) => cast_numeric_arrays::(array, cast_options), + (Float64, UInt16) => cast_numeric_arrays::(array, cast_options), + 
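// Illustrative sketch, not part of this diff: how CastOptions::safe interacts with
// the numeric casts dispatched above; out-of-range values become NULL, not errors.
use arrow_array::cast::AsArray;
use arrow_array::{types::Int8Type, Array, Float64Array};
use arrow_cast::{cast_with_options, CastOptions};
use arrow_schema::DataType;

fn main() {
    let input = Float64Array::from(vec![1.5, 512.0]);
    let opts = CastOptions { safe: true, ..Default::default() };

    let out = cast_with_options(&input, &DataType::Int8, &opts).unwrap();
    let out = out.as_primitive::<Int8Type>();
    assert_eq!(out.value(0), 1); // the fractional part is truncated
    assert!(out.is_null(1)); // 512 does not fit in i8; with safe: false this errors instead
}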
(Float64, UInt32) => cast_numeric_arrays::(array, cast_options), + (Float64, UInt64) => cast_numeric_arrays::(array, cast_options), + (Float64, Int8) => cast_numeric_arrays::(array, cast_options), + (Float64, Int16) => cast_numeric_arrays::(array, cast_options), + (Float64, Int32) => cast_numeric_arrays::(array, cast_options), + (Float64, Int64) => cast_numeric_arrays::(array, cast_options), + (Float64, Float32) => cast_numeric_arrays::(array, cast_options), // end numeric casts // temporal casts @@ -1664,92 +1514,106 @@ pub fn cast_with_options( cast_reinterpret_arrays::(array) } (Date32, Date64) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date64Type>(|x| x as i64 * MILLISECONDS_IN_DAY), )), (Date64, Date32) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date32Type>(|x| (x / MILLISECONDS_IN_DAY) as i32), )), (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32MillisecondType>(|x| x * MILLISECONDS as i32), )), (Time32(TimeUnit::Second), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64MicrosecondType>(|x| x as i64 * MICROSECONDS), )), (Time32(TimeUnit::Second), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64NanosecondType>(|x| x as i64 * NANOSECONDS), )), (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32SecondType>(|x| x / MILLISECONDS as i32), )), (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time64MicrosecondType>(|x| { - x as i64 * (MICROSECONDS / MILLISECONDS) - }), + array + .as_primitive::() + .unary::<_, Time64MicrosecondType>(|x| x as i64 * (MICROSECONDS / MILLISECONDS)), )), (Time32(TimeUnit::Millisecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time64NanosecondType>(|x| { - x as i64 * (MICROSECONDS / NANOSECONDS) - }), + array + .as_primitive::() + .unary::<_, Time64NanosecondType>(|x| x as i64 * (MICROSECONDS / NANOSECONDS)), )), (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32SecondType>(|x| (x / MICROSECONDS) as i32), )), (Time64(TimeUnit::Microsecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time32MillisecondType>(|x| { - (x / (MICROSECONDS / MILLISECONDS)) as i32 - }), + array + .as_primitive::() + .unary::<_, Time32MillisecondType>(|x| (x / (MICROSECONDS / MILLISECONDS)) as i32), )), (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64NanosecondType>(|x| x * (NANOSECONDS / MICROSECONDS)), )), (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Second)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time32SecondType>(|x| (x / NANOSECONDS) as i32), )), (Time64(TimeUnit::Nanosecond), Time32(TimeUnit::Millisecond)) => Ok(Arc::new( - array.as_primitive::() - .unary::<_, Time32MillisecondType>(|x| { - (x / (NANOSECONDS / MILLISECONDS)) as i32 - }), + array + .as_primitive::() + .unary::<_, Time32MillisecondType>(|x| (x / (NANOSECONDS / MILLISECONDS)) as i32), )), (Time64(TimeUnit::Nanosecond), 
Time64(TimeUnit::Microsecond)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Time64MicrosecondType>(|x| x / (NANOSECONDS / MICROSECONDS)), )), - (Timestamp(TimeUnit::Second, _), Int64) => { - cast_reinterpret_arrays::(array) + // Timestamp to integer/floating/decimals + (Timestamp(TimeUnit::Second, _), _) if to_type.is_numeric() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Millisecond, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Millisecond, _), _) if to_type.is_numeric() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Microsecond, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Microsecond, _), _) if to_type.is_numeric() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Timestamp(TimeUnit::Nanosecond, _), Int64) => { - cast_reinterpret_arrays::(array) + (Timestamp(TimeUnit::Nanosecond, _), _) if to_type.is_numeric() => { + let array = cast_reinterpret_arrays::(array)?; + cast_with_options(&array, to_type, cast_options) } - (Int64, Timestamp(unit, tz)) => Ok(make_timestamp_array( - array.as_primitive(), - unit.clone(), - tz.clone(), - )), + (_, Timestamp(unit, tz)) if from_type.is_numeric() => { + let array = cast_with_options(array, &Int64, cast_options)?; + Ok(make_timestamp_array( + array.as_primitive(), + unit.clone(), + tz.clone(), + )) + } (Timestamp(from_unit, from_tz), Timestamp(to_unit, to_tz)) => { let array = cast_with_options(array, &Int64, cast_options)?; @@ -1783,39 +1647,29 @@ pub fn cast_with_options( (None, Some(to_tz)) => { let to_tz: Tz = to_tz.parse()?; match to_unit { - TimeUnit::Second => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } - TimeUnit::Millisecond => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } - TimeUnit::Microsecond => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } - TimeUnit::Nanosecond => { - adjust_timestamp_to_timezone::( - converted, - &to_tz, - cast_options, - )? - } + TimeUnit::Second => adjust_timestamp_to_timezone::( + converted, + &to_tz, + cast_options, + )?, + TimeUnit::Millisecond => adjust_timestamp_to_timezone::< + TimestampMillisecondType, + >( + converted, &to_tz, cast_options + )?, + TimeUnit::Microsecond => adjust_timestamp_to_timezone::< + TimestampMicrosecondType, + >( + converted, &to_tz, cast_options + )?, + TimeUnit::Nanosecond => adjust_timestamp_to_timezone::< + TimestampNanosecondType, + >( + converted, &to_tz, cast_options + )?, } } - _ => { - converted - } + _ => converted, }; Ok(make_timestamp_array( &adjusted, @@ -1834,45 +1688,43 @@ pub fn cast_with_options( if time_array.is_null(i) { b.append_null(); } else { - b.append_value(num::integer::div_floor::(time_array.value(i), from_size) as i32); + b.append_value( + num::integer::div_floor::(time_array.value(i), from_size) as i32, + ); } } Ok(Arc::new(b.finish()) as ArrayRef) } - (Timestamp(TimeUnit::Second, _), Date64) => Ok(Arc::new( - match cast_options.safe { - true => { - // change error to None - array.as_primitive::() - .unary_opt::<_, Date64Type>(|x| { - x.checked_mul(MILLISECONDS) - }) - } - false => { - array.as_primitive::().try_unary::<_, Date64Type, _>( - |x| { - x.mul_checked(MILLISECONDS) - }, - )? 
- } - }, - )), + (Timestamp(TimeUnit::Second, _), Date64) => Ok(Arc::new(match cast_options.safe { + true => { + // change error to None + array + .as_primitive::() + .unary_opt::<_, Date64Type>(|x| x.checked_mul(MILLISECONDS)) + } + false => array + .as_primitive::() + .try_unary::<_, Date64Type, _>(|x| x.mul_checked(MILLISECONDS))?, + })), (Timestamp(TimeUnit::Millisecond, _), Date64) => { cast_reinterpret_arrays::(array) } (Timestamp(TimeUnit::Microsecond, _), Date64) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date64Type>(|x| x / (MICROSECONDS / MILLISECONDS)), )), (Timestamp(TimeUnit::Nanosecond, _), Date64) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, Date64Type>(|x| x / (NANOSECONDS / MILLISECONDS)), )), (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampSecondType, @@ -1883,7 +1735,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampSecondType, @@ -1894,7 +1747,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1905,7 +1759,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1916,7 +1771,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1927,7 +1783,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -1938,7 +1795,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Microsecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64MicrosecondType, ArrowError>(|x| { Ok(time_to_time64us(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1949,7 +1807,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time64(TimeUnit::Nanosecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - 
array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time64NanosecondType, ArrowError>(|x| { Ok(time_to_time64ns(as_time_res_with_timezone::< TimestampNanosecondType, @@ -1960,7 +1819,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampSecondType, @@ -1971,7 +1831,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Second, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampSecondType, @@ -1982,7 +1843,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampMillisecondType, @@ -1993,7 +1855,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Millisecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampMillisecondType, @@ -2004,7 +1867,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -2015,7 +1879,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Microsecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampMicrosecondType, @@ -2026,7 +1891,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Second)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32SecondType, ArrowError>(|x| { Ok(time_to_time32s(as_time_res_with_timezone::< TimestampNanosecondType, @@ -2037,7 +1903,8 @@ pub fn cast_with_options( (Timestamp(TimeUnit::Nanosecond, tz), Time32(TimeUnit::Millisecond)) => { let tz = tz.as_ref().map(|tz| tz.parse()).transpose()?; Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .try_unary::<_, Time32MillisecondType, ArrowError>(|x| { Ok(time_to_time32ms(as_time_res_with_timezone::< TimestampNanosecondType, @@ -2047,38 +1914,41 @@ pub fn cast_with_options( } (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, TimestampSecondType>(|x| x / MILLISECONDS), )), (Date64, Timestamp(TimeUnit::Millisecond, None)) => { cast_reinterpret_arrays::(array) } (Date64, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, 
TimestampMicrosecondType>( - |x| x * (MICROSECONDS / MILLISECONDS), - ), + array + .as_primitive::() + .unary::<_, TimestampMicrosecondType>(|x| x * (MICROSECONDS / MILLISECONDS)), )), (Date64, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, TimestampNanosecondType>( - |x| x * (NANOSECONDS / MILLISECONDS), - ), + array + .as_primitive::() + .unary::<_, TimestampNanosecondType>(|x| x * (NANOSECONDS / MILLISECONDS)), )), (Date32, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, TimestampSecondType>(|x| (x as i64) * SECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Millisecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, TimestampMillisecondType>( - |x| (x as i64) * MILLISECONDS_IN_DAY, - ), + array + .as_primitive::() + .unary::<_, TimestampMillisecondType>(|x| (x as i64) * MILLISECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Microsecond, None)) => Ok(Arc::new( - array.as_primitive::().unary::<_, TimestampMicrosecondType>( - |x| (x as i64) * MICROSECONDS_IN_DAY, - ), + array + .as_primitive::() + .unary::<_, TimestampMicrosecondType>(|x| (x as i64) * MICROSECONDS_IN_DAY), )), (Date32, Timestamp(TimeUnit::Nanosecond, None)) => Ok(Arc::new( - array.as_primitive::() + array + .as_primitive::() .unary::<_, TimestampNanosecondType>(|x| (x as i64) * NANOSECONDS_IN_DAY), )), (Int64, Duration(TimeUnit::Second)) => { @@ -2396,9 +2266,7 @@ where // Natural cast between numeric types // If the value of T can't be casted to R, will throw error -fn try_numeric_cast( - from: &PrimitiveArray, -) -> Result, ArrowError> +fn try_numeric_cast(from: &PrimitiveArray) -> Result, ArrowError> where T: ArrowPrimitiveType, R: ArrowPrimitiveType, @@ -2448,6 +2316,19 @@ fn value_to_string( Ok(Arc::new(builder.finish())) } +fn cast_numeric_to_binary( + array: &dyn Array, +) -> Result { + let array = array.as_primitive::(); + let size = std::mem::size_of::(); + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(array.len())); + Ok(Arc::new(GenericBinaryArray::::new( + offsets, + array.values().inner().clone(), + array.nulls().cloned(), + ))) +} + /// Parse UTF-8 fn parse_string( array: &dyn Array, @@ -2499,11 +2380,7 @@ fn cast_string_to_timestamp( Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) } -fn cast_string_to_timestamp_impl< - O: OffsetSizeTrait, - T: ArrowTimestampType, - Tz: TimeZone, ->( +fn cast_string_to_timestamp_impl( array: &GenericStringArray, tz: &Tz, cast_options: &CastOptions, @@ -2660,9 +2537,7 @@ fn adjust_timestamp_to_timezone( } else { array.try_unary::<_, Int64Type, _>(|o| { adjust(o).ok_or_else(|| { - ArrowError::CastError( - "Cannot cast timezone to different timezone".to_string(), - ) + ArrowError::CastError("Cannot cast timezone to different timezone".to_string()) }) })? 
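// `cast_numeric_to_binary` (added above) does not copy values: every element
// of a primitive array already occupies size_of::<T::Native>() bytes in the
// values buffer, so the binary offsets are just a running multiple of that
// fixed width and the buffer is shared as-is. The bytes therefore come out in
// the machine's native byte order; the tests added further below assume a
// little-endian target. A hedged sketch of the observable result
// (illustration only, not part of this patch):
use std::sync::Arc;
use arrow_array::{cast::AsArray, ArrayRef, Int16Array};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType};

fn numeric_to_binary_example() -> Result<(), ArrowError> {
    let ints = Arc::new(Int16Array::from(vec![Some(511), None])) as ArrayRef;
    let bin = cast(&ints, &DataType::Binary)?;
    let bin = bin.as_binary::<i32>();
    assert_eq!(bin.value(0), &511_i16.to_le_bytes()); // two bytes per value
    assert!(bin.is_null(1));
    Ok(())
}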
}; @@ -2686,11 +2561,10 @@ where .iter() .map(|value| match value { Some(value) => match value.to_ascii_lowercase().trim() { - "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => { - Ok(Some(true)) + "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" => Ok(Some(true)), + "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" | "0" => { + Ok(Some(false)) } - "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | "off" - | "0" => Ok(Some(false)), invalid_value => match cast_options.safe { true => Ok(None), false => Err(ArrowError::CastError(format!( @@ -2722,19 +2596,38 @@ where ))); } - let integers = parts[0].trim_start_matches('0'); + let (negative, first_part) = if parts[0].is_empty() { + (false, parts[0]) + } else { + match parts[0].as_bytes()[0] { + b'-' => (true, &parts[0][1..]), + b'+' => (false, &parts[0][1..]), + _ => (false, parts[0]), + } + }; + + let integers = first_part.trim_start_matches('0'); let decimals = if parts.len() == 2 { parts[1] } else { "" }; + if !integers.is_empty() && !integers.as_bytes()[0].is_ascii_digit() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid decimal format: {value_str:?}" + ))); + } + + if !decimals.is_empty() && !decimals.as_bytes()[0].is_ascii_digit() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid decimal format: {value_str:?}" + ))); + } + // Adjust decimal based on scale - let number_decimals = if decimals.len() > scale { + let mut number_decimals = if decimals.len() > scale { let decimal_number = i256::from_string(decimals).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Cannot parse decimal format: {value_str}" - )) + ArrowError::InvalidArgumentError(format!("Cannot parse decimal format: {value_str}")) })?; - let div = - i256::from_i128(10_i128).pow_checked((decimals.len() - scale) as u32)?; + let div = i256::from_i128(10_i128).pow_checked((decimals.len() - scale) as u32)?; let half = div.div_wrapping(i256::from_i128(2)); let half_neg = half.neg_wrapping(); @@ -2756,9 +2649,7 @@ where "Cannot parse decimal format: {value_str}" )) }) - .map(|v| { - v.mul_wrapping(i256::from_i128(10_i128).pow_wrapping(scale as u32)) - })? + .map(|v| v.mul_wrapping(i256::from_i128(10_i128).pow_wrapping(scale as u32)))? 
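// The sign handling added above strips a leading '-' or '+' before the digit
// checks and re-attaches the '-' just before the final i256 parse (see the
// `number_decimals.insert(0, '-')` below). The scale adjustment keeps `scale`
// fractional digits and rounds the first dropped digit half-up, which is what
// makes "1.23499999" -> 1.23 but "1.23599999" -> 1.24 at scale 2 in the new
// tests. A numerically equivalent standalone model of that rounding, using
// u128 on the unsigned fractional digits instead of the crate's checked i256
// arithmetic (illustration only, no overflow guards):
fn round_decimals_to_scale(decimals: &str, scale: usize) -> u128 {
    let parsed: u128 = decimals.parse().expect("digits only");
    if decimals.len() <= scale {
        // Too few digits: pad with zeros, nothing to round
        return parsed * 10_u128.pow((scale - decimals.len()) as u32);
    }
    let div = 10_u128.pow((decimals.len() - scale) as u32);
    let (d, r) = (parsed / div, parsed % div);
    // A remainder of at least half of `div` rounds away from zero
    if r * 2 >= div {
        d + 1
    } else {
        d
    }
}
// round_decimals_to_scale("23499999", 2) == 23
// round_decimals_to_scale("23599999", 2) == 24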
} else { i256::ZERO }; @@ -2771,6 +2662,10 @@ where format!("{integers}{decimals}") }; + if negative { + number_decimals.insert(0, '-'); + } + let value = i256::from_string(number_decimals.as_str()).ok_or_else(|| { ArrowError::InvalidArgumentError(format!( "Cannot convert {} to {}: Overflow", @@ -2780,11 +2675,7 @@ where })?; T::Native::from_decimal(value).ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "Cannot convert {} to {}", - value_str, - T::PREFIX - )) + ArrowError::InvalidArgumentError(format!("Cannot convert {} to {}", value_str, T::PREFIX)) }) } @@ -2801,6 +2692,11 @@ where if cast_options.safe { let iter = from.iter().map(|v| { v.and_then(|v| parse_string_to_decimal_native::(v, scale as usize).ok()) + .and_then(|v| { + T::validate_decimal_precision(v, precision) + .is_ok() + .then_some(v) + }) }); // Benefit: // 20% performance improvement @@ -2815,13 +2711,15 @@ where .iter() .map(|v| { v.map(|v| { - parse_string_to_decimal_native::(v, scale as usize).map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {:?} type", - v, - T::DATA_TYPE, - )) - }) + parse_string_to_decimal_native::(v, scale as usize) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + T::DATA_TYPE, + )) + }) + .and_then(|v| T::validate_decimal_precision(v, precision).map(|_| v)) }) .transpose() }) @@ -2878,8 +2776,7 @@ fn cast_numeric_to_bool(from: &dyn Array) -> Result where FROM: ArrowPrimitiveType, { - numeric_to_bool_cast::(from.as_primitive::()) - .map(|to| Arc::new(to) as ArrayRef) + numeric_to_bool_cast::(from.as_primitive::()).map(|to| Arc::new(to) as ArrayRef) } fn numeric_to_bool_cast(from: &PrimitiveArray) -> Result @@ -2918,10 +2815,7 @@ where ))) } -fn bool_to_numeric_cast( - from: &BooleanArray, - _cast_options: &CastOptions, -) -> PrimitiveArray +fn bool_to_numeric_cast(from: &BooleanArray, _cast_options: &CastOptions) -> PrimitiveArray where T: ArrowPrimitiveType, T::Native: num::NumCast, @@ -2969,8 +2863,7 @@ fn dictionary_cast( Arc::new(PrimitiveArray::::from(dict_array.keys().to_data())); let values_array = dict_array.values(); let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?; - let cast_values = - cast_with_options(values_array, to_value_type, cast_options)?; + let cast_values = cast_with_options(values_array, to_value_type, cast_options)?; // Failure to cast keys (because they don't fit in the // target type) results in NULL values; @@ -3042,66 +2935,24 @@ fn cast_to_dictionary( use DataType::*; match *dict_value_type { - Int8 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int16 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int32 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int64 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt8 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt16 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt32 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt64 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Decimal128(_, _) => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Decimal256(_, _) => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Utf8 => 
pack_byte_to_dictionary::>(array, cast_options), - LargeUtf8 => { - pack_byte_to_dictionary::>(array, cast_options) - } - Binary => { - pack_byte_to_dictionary::>(array, cast_options) - } - LargeBinary => { - pack_byte_to_dictionary::>(array, cast_options) + Int8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Int64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt8 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Decimal128(_, _) => { + pack_numeric_to_dictionary::(array, dict_value_type, cast_options) + } + Decimal256(_, _) => { + pack_numeric_to_dictionary::(array, dict_value_type, cast_options) } + Utf8 => pack_byte_to_dictionary::>(array, cast_options), + LargeUtf8 => pack_byte_to_dictionary::>(array, cast_options), + Binary => pack_byte_to_dictionary::>(array, cast_options), + LargeBinary => pack_byte_to_dictionary::>(array, cast_options), _ => Err(ArrowError::CastError(format!( "Unsupported output type for dictionary packing: {dict_value_type:?}" ))), @@ -3123,8 +2974,7 @@ where let cast_values = cast_with_options(array, dict_value_type, cast_options)?; let values = cast_values.as_primitive::(); - let mut b = - PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); + let mut b = PrimitiveDictionaryBuilder::::with_capacity(values.len(), values.len()); // copy each element one at a time for i in 0..values.len() { @@ -3152,8 +3002,7 @@ where .as_any() .downcast_ref::>() .unwrap(); - let mut b = - GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); + let mut b = GenericByteDictionaryBuilder::::with_capacity(values.len(), 1024, 1024); // copy each element one at a time for i in 0..values.len() { @@ -3178,29 +3027,6 @@ fn cast_values_to_list( Ok(Arc::new(list)) } -/// Helper function that takes an Generic list container and casts the inner datatype. -fn cast_list_inner( - array: &dyn Array, - to: &Field, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - let data = array.to_data(); - let underlying_array = make_array(data.child_data()[0].clone()); - let cast_array = - cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?; - let builder = data - .into_builder() - .data_type(to_type.clone()) - .child_data(vec![cast_array.into_data()]); - - // Safety - // Data was valid before - let array_data = unsafe { builder.build_unchecked() }; - let list = GenericListArray::::from(array_data); - Ok(Arc::new(list) as ArrayRef) -} - /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same /// offset size so re-encoding offset is unnecessary. 
fn cast_binary_to_string( @@ -3217,10 +3043,8 @@ fn cast_binary_to_string( Err(e) => match cast_options.safe { true => { // Fallback to slow method to convert invalid sequences to nulls - let mut builder = GenericStringBuilder::::with_capacity( - array.len(), - array.value_data().len(), - ); + let mut builder = + GenericStringBuilder::::with_capacity(array.len(), array.value_data().len()); let iter = array .iter() @@ -3315,8 +3139,8 @@ where offsets .iter() .try_for_each::<_, Result<_, ArrowError>>(|offset| { - let offset = <::Offset as NumCast>::from(*offset) - .ok_or_else(|| { + let offset = + <::Offset as NumCast>::from(*offset).ok_or_else(|| { ArrowError::ComputeError(format!( "{}{} array too large to cast to {}{} array", FROM::Offset::PREFIX, @@ -3345,9 +3169,7 @@ where Ok(Arc::new(GenericByteArray::::from(array_data))) } -fn cast_fixed_size_list_to_list( - array: &dyn Array, -) -> Result +fn cast_fixed_size_list_to_list(array: &dyn Array) -> Result where OffsetSize: OffsetSizeTrait, { @@ -3356,80 +3178,134 @@ where Ok(Arc::new(list)) } -/// Cast the container type of List/Largelist array but not the inner types. -/// This function can leave the value data intact and only has to cast the offset dtypes. -fn cast_list_container( - array: &dyn Array, - _cast_options: &CastOptions, +fn cast_list_to_fixed_size_list( + array: &GenericListArray, + field: &FieldRef, + size: i32, + cast_options: &CastOptions, ) -> Result where - OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, - OffsetSizeTo: OffsetSizeTrait + NumCast, + OffsetSize: OffsetSizeTrait, { - let list = array.as_list::(); - // the value data stored by the list - let values = list.values(); + let cap = array.len() * size as usize; - let out_dtype = match array.data_type() { - DataType::List(value_type) => { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - DataType::LargeList(value_type.clone()) + let mut nulls = (cast_options.safe || array.null_count() != 0).then(|| { + let mut buffer = BooleanBufferBuilder::new(array.len()); + match array.nulls() { + Some(n) => buffer.append_buffer(n.inner()), + None => buffer.append_n(array.len(), true), } - DataType::LargeList(value_type) => { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - if values.len() > i32::MAX as usize { - return Err(ArrowError::ComputeError( - "LargeList too large to cast to List".into(), - )); + buffer + }); + + // Nulls in FixedSizeListArray take up space and so we must pad the values + let values = array.values().to_data(); + let mut mutable = MutableArrayData::new(vec![&values], cast_options.safe, cap); + // The end position in values of the last incorrectly-sized list slice + let mut last_pos = 0; + for (idx, w) in array.offsets().windows(2).enumerate() { + let start_pos = w[0].as_usize(); + let end_pos = w[1].as_usize(); + let len = end_pos - start_pos; + + if len != size as usize { + if cast_options.safe || array.is_null(idx) { + if last_pos != start_pos { + // Extend with valid slices + mutable.extend(0, last_pos, start_pos); + } + // Pad this slice with nulls + mutable.extend_nulls(size as _); + nulls.as_mut().unwrap().set_bit(idx, false); + // Set last_pos to the end of this slice's values + last_pos = end_pos + } else { + return Err(ArrowError::CastError(format!( + "Cannot cast to FixedSizeList({size}): value at index {idx} has length {len}", + ))); } - DataType::List(value_type.clone()) 
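// `cast_list_to_fixed_size_list` (above) has to materialize padding because a
// FixedSizeListArray reserves `size` value slots even for null entries, while
// a ListArray may encode nulls as zero-length (or any-length) slices. Slices
// of the wrong length become nulls under `safe: true` and an error otherwise,
// mirroring the tests added later in this patch. Usage sketch through the
// public API (illustration only, not part of this patch):
use std::sync::Arc;
use arrow_array::{types::Int32Type, ArrayRef, ListArray};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType, Field};

fn list_to_fsl_example() -> Result<(), ArrowError> {
    let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
        Some(vec![Some(1), Some(2), Some(3)]),
        Some(vec![Some(4), Some(5)]), // wrong length: becomes null when safe
    ]);
    let array = Arc::new(list) as ArrayRef;
    let field = Arc::new(Field::new("item", DataType::Int32, true));
    // `cast` uses the default options, i.e. safe casting
    let fsl = cast(&array, &DataType::FixedSizeList(field, 3))?;
    assert!(fsl.is_valid(0));
    assert!(fsl.is_null(1));
    Ok(())
}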
} - // implementation error - _ => unreachable!(), + } + + let values = match last_pos { + 0 => array.values().slice(0, cap), // All slices were the correct length + _ => { + if mutable.len() != cap { + // Remaining slices were all correct length + let remaining = cap - mutable.len(); + mutable.extend(0, last_pos, last_pos + remaining) + } + make_array(mutable.freeze()) + } }; - let iter = list.value_offsets().iter().map(|idx| { - let idx: OffsetSizeTo = NumCast::from(*idx).unwrap(); - idx - }); + // Cast the inner values if necessary + let values = cast_with_options(values.as_ref(), field.data_type(), cast_options)?; - // SAFETY - // A slice produces a trusted length iterator - let offset_buffer = unsafe { Buffer::from_trusted_len_iter(iter) }; + // Construct the FixedSizeListArray + let nulls = nulls.map(|mut x| x.finish().into()); + let array = FixedSizeListArray::new(field.clone(), size, values, nulls); + Ok(Arc::new(array)) +} - // wrap up - let builder = ArrayData::builder(out_dtype) - .len(list.len()) - .add_buffer(offset_buffer) - .add_child_data(values.to_data()) - .nulls(list.nulls().cloned()); +/// Helper function that takes an Generic list container and casts the inner datatype. +fn cast_list_values( + array: &dyn Array, + to: &FieldRef, + cast_options: &CastOptions, +) -> Result { + let list = array.as_list::(); + let values = cast_with_options(list.values(), to.data_type(), cast_options)?; + Ok(Arc::new(GenericListArray::::new( + to.clone(), + list.offsets().clone(), + values, + list.nulls().cloned(), + ))) +} - let array_data = unsafe { builder.build_unchecked() }; - Ok(Arc::new(GenericListArray::::from(array_data))) +/// Cast the container type of List/Largelist array along with the inner datatype +fn cast_list( + array: &dyn Array, + field: &FieldRef, + cast_options: &CastOptions, +) -> Result { + let list = array.as_list::(); + let values = list.values(); + let offsets = list.offsets(); + let nulls = list.nulls().cloned(); + + if !O::IS_LARGE && values.len() > i32::MAX as usize { + return Err(ArrowError::ComputeError( + "LargeList too large to cast to List".into(), + )); + } + + // Recursively cast values + let values = cast_with_options(values, field.data_type(), cast_options)?; + let offsets: Vec<_> = offsets.iter().map(|x| O::usize_as(x.as_usize())).collect(); + + // Safety: valid offsets and checked for overflow + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) }; + + Ok(Arc::new(GenericListArray::::new( + field.clone(), + offsets, + values, + nulls, + ))) } #[cfg(test)] mod tests { + use arrow_buffer::{Buffer, NullBuffer}; + use super::*; macro_rules! 
generate_cast_test_case { ($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { - let output = $OUTPUT_TYPE_ARRAY::from($OUTPUT_VALUES) - .with_data_type($OUTPUT_TYPE.clone()); + let output = + $OUTPUT_TYPE_ARRAY::from($OUTPUT_VALUES).with_data_type($OUTPUT_TYPE.clone()); // assert cast type let input_array_type = $INPUT_ARRAY.data_type(); @@ -3442,8 +3318,7 @@ mod tests { safe: false, format_options: FormatOptions::default(), }; - let result = - cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); + let result = cast_with_options($INPUT_ARRAY, $OUTPUT_TYPE, &cast_option).unwrap(); assert_eq!($OUTPUT_TYPE, result.data_type()); assert_eq!(result.as_ref(), &output); }; @@ -3777,8 +3652,7 @@ mod tests { #[test] fn test_cast_decimal_to_numeric() { - let value_array: Vec> = - vec![Some(125), Some(225), Some(325), None, Some(525)]; + let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; let array = create_decimal_array(value_array, 38, 2).unwrap(); // u8 generate_cast_test_case!( @@ -4590,8 +4464,7 @@ mod tests { #[test] fn test_cast_i32_to_list_f64_nullable_sliced() { - let array = - Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); + let array = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); let array = array.slice(2, 4); let b = cast( &array, @@ -4641,9 +4514,8 @@ mod tests { Ok(_) => panic!("expected error"), Err(e) => { assert!( - e.to_string().contains( - "Cast error: Cannot cast string 'seven' to value of Int32 type", - ), + e.to_string() + .contains("Cast error: Cannot cast string 'seven' to value of Int32 type",), "Error: {e}" ) } @@ -4654,8 +4526,7 @@ mod tests { fn test_cast_utf8_to_bool() { let strings = StringArray::from(vec!["true", "false", "invalid", " Y ", ""]); let casted = cast(&strings, &DataType::Boolean).unwrap(); - let expected = - BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]); + let expected = BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]); assert_eq!(*as_boolean_array(&casted), expected); } @@ -4673,9 +4544,9 @@ mod tests { match casted { Ok(_) => panic!("expected error"), Err(e) => { - assert!(e.to_string().contains( - "Cast error: Cannot cast value 'invalid' to value of Boolean type" - )) + assert!(e + .to_string() + .contains("Cast error: Cannot cast value 'invalid' to value of Boolean type")) } } } @@ -4721,25 +4592,157 @@ mod tests { } #[test] - #[should_panic( - expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" - )] - fn test_cast_int32_to_timestamp() { + fn test_cast_integer_to_timestamp() { + let array = Int64Array::from(vec![Some(2), Some(10), None]); + let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + let array = Int8Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = Int16Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + let array = Int32Array::from(vec![Some(2), Some(10), None]); - cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt8Array::from(vec![Some(2), Some(10), None]); + let actual = 
cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt16Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt32Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = UInt64Array::from(vec![Some(2), Some(10), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_timestamp_to_integer() { + let array = TimestampMillisecondArray::from(vec![Some(5), Some(1), None]) + .with_timezone("UTC".to_string()); + let expected = cast(&array, &DataType::Int64).unwrap(); + + let actual = cast(&cast(&array, &DataType::Int8).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::Int16).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::Int32).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt8).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt16).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt32).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::UInt64).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_floating_to_timestamp() { + let array = Int64Array::from(vec![Some(2), Some(10), None]); + let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + let array = Float32Array::from(vec![Some(2.0), Some(10.6), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = Float64Array::from(vec![Some(2.1), Some(10.2), None]); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_timestamp_to_floating() { + let array = TimestampMillisecondArray::from(vec![Some(5), Some(1), None]) + .with_timezone("UTC".to_string()); + let expected = cast(&array, &DataType::Int64).unwrap(); + + let actual = cast(&cast(&array, &DataType::Float32).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast(&cast(&array, &DataType::Float64).unwrap(), &DataType::Int64).unwrap(); + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_decimal_to_timestamp() { + let array = Int64Array::from(vec![Some(2), Some(10), None]); + let expected = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + let array = Decimal128Array::from(vec![Some(200), Some(1000), None]) + .with_precision_and_scale(4, 2) + .unwrap(); + let actual = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + + let array = Decimal256Array::from(vec![ + Some(i256::from_i128(2000)), + Some(i256::from_i128(10000)), + None, + ]) + .with_precision_and_scale(5, 3) + .unwrap(); + let actual = cast(&array, 
&DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + + assert_eq!(&actual, &expected); + } + + #[test] + fn test_cast_timestamp_to_decimal() { + let array = TimestampMillisecondArray::from(vec![Some(5), Some(1), None]) + .with_timezone("UTC".to_string()); + let expected = cast(&array, &DataType::Int64).unwrap(); + + let actual = cast( + &cast(&array, &DataType::Decimal128(5, 2)).unwrap(), + &DataType::Int64, + ) + .unwrap(); + assert_eq!(&actual, &expected); + + let actual = cast( + &cast(&array, &DataType::Decimal256(10, 5)).unwrap(), + &DataType::Int64, + ) + .unwrap(); + assert_eq!(&actual, &expected); } #[test] fn test_cast_list_i32_to_list_u16() { - let value_data = - Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]).into_data(); + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]).into_data(); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two // [[0,0,0], [-1, -2, -1], [2, 100000000]] - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4783,19 +4786,14 @@ mod tests { } #[test] - #[should_panic( - expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" - )] fn test_cast_list_i32_to_list_timestamp() { // Construct a value array - let value_data = - Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]).into_data(); + let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]).into_data(); let value_offsets = Buffer::from_slice_ref([0, 3, 6, 9]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -4804,7 +4802,7 @@ mod tests { .unwrap(); let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - cast( + let actual = cast( &list_array, &DataType::List(Arc::new(Field::new( "item", @@ -4813,6 +4811,22 @@ mod tests { ))), ) .unwrap(); + + let expected = cast( + &cast( + &list_array, + &DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + ) + .unwrap(), + &DataType::List(Arc::new(Field::new( + "item", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ))), + ) + .unwrap(); + + assert_eq!(&actual, &expected); } #[test] @@ -4940,10 +4954,37 @@ mod tests { format_options: FormatOptions::default(), }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); - assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date32 type"); + assert_eq!( + err.to_string(), + "Cast error: Cannot cast string 'Not a valid date' to value of Date32 type" + ); } } + #[test] + fn test_cast_string_format_yyyymmdd_to_date32() { + let a = Arc::new(StringArray::from(vec![ + Some("2020-12-25"), + Some("20201117"), + ])) as ArrayRef; + + let to_type = DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let result = cast_with_options(&a, &to_type, &options).unwrap(); + let c = result.as_primitive::(); + assert_eq!( + chrono::NaiveDate::from_ymd_opt(2020, 12, 25), + c.value_as_date(0) + ); + assert_eq!( + chrono::NaiveDate::from_ymd_opt(2020, 11, 17), + 
c.value_as_date(1) + ); + } + #[test] fn test_cast_string_to_time32second() { let a1 = Arc::new(StringArray::from(vec![ @@ -5097,14 +5138,16 @@ mod tests { format_options: FormatOptions::default(), }; let err = cast_with_options(array, &to_type, &options).unwrap_err(); - assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date64 type"); + assert_eq!( + err.to_string(), + "Cast error: Cannot cast string 'Not a valid date' to value of Date64 type" + ); } } macro_rules! test_safe_string_to_interval { ($data_vec:expr, $interval_unit:expr, $array_ty:ty, $expect_vec:expr) => { - let source_string_array = - Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; + let source_string_array = Arc::new(StringArray::from($data_vec.clone())) as ArrayRef; let options = CastOptions { safe: true, @@ -5358,23 +5401,61 @@ mod tests { } #[test] - fn test_fixed_size_binary_to_binary() { - let bytes_1 = "Hiiii".as_bytes(); - let bytes_2 = "Hello".as_bytes(); - - let binary_data = vec![Some(bytes_1), Some(bytes_2), None]; - let a1 = Arc::new(FixedSizeBinaryArray::from(binary_data.clone())) as ArrayRef; + fn test_fixed_size_binary_to_binary() { + let bytes_1 = "Hiiii".as_bytes(); + let bytes_2 = "Hello".as_bytes(); + + let binary_data = vec![Some(bytes_1), Some(bytes_2), None]; + let a1 = Arc::new(FixedSizeBinaryArray::from(binary_data.clone())) as ArrayRef; + + let array_ref = cast(&a1, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(bytes_1, down_cast.value(0)); + assert_eq!(bytes_2, down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let array_ref = cast(&a1, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(bytes_1, down_cast.value(0)); + assert_eq!(bytes_2, down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + + #[test] + fn test_numeric_to_binary() { + let a = Int16Array::from(vec![Some(1), Some(511), None]); + + let array_ref = cast(&a, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&1_i16.to_le_bytes(), down_cast.value(0)); + assert_eq!(&511_i16.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let a = Int64Array::from(vec![Some(-1), Some(123456789), None]); + + let array_ref = cast(&a, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&(-1_i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + + #[test] + fn test_numeric_to_large_binary() { + let a = Int16Array::from(vec![Some(1), Some(511), None]); - let array_ref = cast(&a1, &DataType::Binary).unwrap(); - let down_cast = array_ref.as_binary::(); - assert_eq!(bytes_1, down_cast.value(0)); - assert_eq!(bytes_2, down_cast.value(1)); + let array_ref = cast(&a, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&1_i16.to_le_bytes(), down_cast.value(0)); + assert_eq!(&511_i16.to_le_bytes(), down_cast.value(1)); assert!(down_cast.is_null(2)); - let array_ref = cast(&a1, &DataType::LargeBinary).unwrap(); + let a = Int64Array::from(vec![Some(-1), Some(123456789), None]); + + let array_ref = cast(&a, &DataType::LargeBinary).unwrap(); let down_cast = array_ref.as_binary::(); - assert_eq!(bytes_1, down_cast.value(0)); - assert_eq!(bytes_2, down_cast.value(1)); + assert_eq!(&(-1_i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); assert!(down_cast.is_null(2)); } 
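// The assertions above pin down the layout produced by
// `cast_numeric_to_binary`: each Int16 becomes a fixed two-byte slice in the
// native byte order of the values buffer (the `to_le_bytes` expectations
// assume a little-endian target). A hedged sketch showing that the bytes can
// be decoded back by hand (illustration only; the decode step here is manual,
// not a cast):
use arrow_array::{cast::AsArray, Int16Array};
use arrow_cast::cast;
use arrow_schema::{ArrowError, DataType};

fn decode_binary_example() -> Result<(), ArrowError> {
    let ints = Int16Array::from(vec![Some(511)]);
    let bin = cast(&ints, &DataType::Binary)?;
    let bytes: [u8; 2] = bin.as_binary::<i32>().value(0).try_into().unwrap();
    assert_eq!(i16::from_le_bytes(bytes), 511);
    Ok(())
}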
@@ -5398,12 +5479,9 @@ mod tests { #[test] fn test_cast_timestamp_to_date32() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); + let array = + TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) + .with_timezone("UTC".to_string()); let b = cast(&array, &DataType::Date32).unwrap(); let c = b.as_primitive::(); assert_eq!(10000, c.value(0)); @@ -5413,19 +5491,15 @@ mod tests { #[test] fn test_cast_timestamp_to_date64() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]); + let array = + TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Date64).unwrap(); let c = b.as_primitive::(); assert_eq!(864000000005, c.value(0)); assert_eq!(1545696000001, c.value(1)); assert!(c.is_null(2)); - let array = - TimestampSecondArray::from(vec![Some(864000000005), Some(1545696000001)]); + let array = TimestampSecondArray::from(vec![Some(864000000005), Some(1545696000001)]); let b = cast(&array, &DataType::Date64).unwrap(); let c = b.as_primitive::(); assert_eq!(864000000005000, c.value(0)); @@ -5477,9 +5551,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp microseconds - let a = - TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) - .with_timezone("+01:00".to_string()); + let a = TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); let c = b.as_primitive::(); @@ -5493,12 +5566,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp nanoseconds - let a = TimestampNanosecondArray::from(vec![ - Some(86405000000000), - Some(1000000000), - None, - ]) - .with_timezone("+01:00".to_string()); + let a = TimestampNanosecondArray::from(vec![Some(86405000000000), Some(1000000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)).unwrap(); let c = b.as_primitive::(); @@ -5512,8 +5581,8 @@ mod tests { assert!(c.is_null(2)); // test overflow - let a = TimestampSecondArray::from(vec![Some(i64::MAX)]) - .with_timezone("+01:00".to_string()); + let a = + TimestampSecondArray::from(vec![Some(i64::MAX)]).with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time64(TimeUnit::Microsecond)); assert!(b.is_err()); @@ -5556,9 +5625,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp microseconds - let a = - TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) - .with_timezone("+01:00".to_string()); + let a = TimestampMicrosecondArray::from(vec![Some(86405000000), Some(1000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)).unwrap(); let c = b.as_primitive::(); @@ -5572,12 +5640,8 @@ mod tests { assert!(c.is_null(2)); // test timestamp nanoseconds - let a = TimestampNanosecondArray::from(vec![ - Some(86405000000000), - Some(1000000000), - None, - ]) - .with_timezone("+01:00".to_string()); + let a = TimestampNanosecondArray::from(vec![Some(86405000000000), Some(1000000000), None]) + .with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, 
&DataType::Time32(TimeUnit::Second)).unwrap(); let c = b.as_primitive::(); @@ -5591,8 +5655,8 @@ mod tests { assert!(c.is_null(2)); // test overflow - let a = TimestampSecondArray::from(vec![Some(i64::MAX)]) - .with_timezone("+01:00".to_string()); + let a = + TimestampSecondArray::from(vec![Some(i64::MAX)]).with_timezone("+01:00".to_string()); let array = Arc::new(a) as ArrayRef; let b = cast(&array, &DataType::Time32(TimeUnit::Second)); assert!(b.is_err()); @@ -5679,8 +5743,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_primitive::(); assert_eq!(864000000, c.value(0)); @@ -5690,8 +5753,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_ms() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Millisecond, None)).unwrap(); let c = b .as_any() @@ -5704,8 +5766,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_us() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); let c = b .as_any() @@ -5718,8 +5779,7 @@ mod tests { #[test] fn test_cast_date64_to_timestamp_ns() { - let array = - Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); + let array = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); let c = b .as_any() @@ -5732,12 +5792,9 @@ mod tests { #[test] fn test_cast_timestamp_to_i64() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); + let array = + TimestampMillisecondArray::from(vec![Some(864000000005), Some(1545696000001), None]) + .with_timezone("UTC".to_string()); let b = cast(&array, &DataType::Int64).unwrap(); let c = b.as_primitive::(); assert_eq!(&DataType::Int64, c.data_type()); @@ -5769,11 +5826,8 @@ mod tests { #[test] fn test_cast_timestamp_to_strings() { // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None - let array = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]); + let array = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); let out = cast(&array, &DataType::Utf8).unwrap(); let out = out .as_any() @@ -5817,13 +5871,9 @@ mod tests { .with_timestamp_tz_format(Some(ts_format)), }; // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None - let array_without_tz = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]); - let out = - cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); + let array_without_tz = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); + let out = cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5839,8 +5889,7 @@ mod tests { ] ); let out = - cast_with_options(&array_without_tz, 
&DataType::LargeUtf8, &cast_options) - .unwrap(); + cast_with_options(&array_without_tz, &DataType::LargeUtf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5856,14 +5905,10 @@ mod tests { ] ); - let array_with_tz = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]) - .with_timezone(tz.to_string()); - let out = - cast_with_options(&array_with_tz, &DataType::Utf8, &cast_options).unwrap(); + let array_with_tz = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]) + .with_timezone(tz.to_string()); + let out = cast_with_options(&array_with_tz, &DataType::Utf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5878,8 +5923,7 @@ mod tests { None ] ); - let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options) - .unwrap(); + let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options).unwrap(); let out = out .as_any() .downcast_ref::() @@ -5898,11 +5942,8 @@ mod tests { #[test] fn test_cast_between_timestamps() { - let array = TimestampMillisecondArray::from(vec![ - Some(864000003005), - Some(1545696002001), - None, - ]); + let array = + TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); let c = b.as_primitive::(); assert_eq!(864000003, c.value(0)); @@ -6306,8 +6347,7 @@ mod tests { ]; let u64_array: ArrayRef = Arc::new(UInt64Array::from(u64_values)); - let f64_expected = - vec![0.0, 255.0, 65535.0, 4294967295.0, 18446744073709552000.0]; + let f64_expected = vec![0.0, 255.0, 65535.0, 4294967295.0, 18446744073709552000.0]; assert_eq!( f64_expected, get_cast_values::(&u64_array, &DataType::Float64) @@ -6316,8 +6356,7 @@ mod tests { .collect::>() ); - let f32_expected = - vec![0.0, 255.0, 65535.0, 4294967300.0, 18446744000000000000.0]; + let f32_expected = vec![0.0, 255.0, 65535.0, 4294967300.0, 18446744000000000000.0]; assert_eq!( f32_expected, get_cast_values::(&u64_array, &DataType::Float32) @@ -6350,8 +6389,7 @@ mod tests { get_cast_values::(&u64_array, &DataType::Int8) ); - let u64_expected = - vec!["0", "255", "65535", "4294967295", "18446744073709551615"]; + let u64_expected = vec!["0", "255", "65535", "4294967295", "18446744073709551615"]; assert_eq!( u64_expected, get_cast_values::(&u64_array, &DataType::UInt64) @@ -6782,15 +6820,13 @@ mod tests { get_cast_values::(&i32_array, &DataType::Int8) ); - let u64_expected = - vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; + let u64_expected = vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; assert_eq!( u64_expected, get_cast_values::(&i32_array, &DataType::UInt64) ); - let u32_expected = - vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; + let u32_expected = vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; assert_eq!( u32_expected, get_cast_values::(&i32_array, &DataType::UInt32) @@ -6826,8 +6862,7 @@ mod tests { #[test] fn test_cast_from_int16() { - let i16_values: Vec = - vec![i16::MIN, i8::MIN as i16, 0, i8::MAX as i16, i16::MAX]; + let i16_values: Vec = vec![i16::MIN, i8::MIN as i16, 0, i8::MAX as i16, i16::MAX]; let i16_array: ArrayRef = Arc::new(Int16Array::from(i16_values)); let f64_expected = vec!["-32768.0", "-128.0", "0.0", "127.0", "32767.0"]; @@ -7168,8 +7203,7 @@ mod tests { fn test_cast_string_array_to_dict() { use DataType::*; - let array = Arc::new(StringArray::from(vec![Some("one"), None, 
Some("three")])) - as ArrayRef; + let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")])) as ArrayRef; let expected = vec!["one", "null", "three"]; @@ -7268,16 +7302,12 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to list - let data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); cast_from_null_to_other(&data_type); - let data_type = - DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); + let data_type = DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, true))); cast_from_null_to_other(&data_type); - let data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, true)), - 4, - ); + let data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4); cast_from_null_to_other(&data_type); // Cast null from and to dictionary @@ -7288,8 +7318,7 @@ mod tests { cast_from_null_to_other(&data_type); // Cast null from and to struct - let data_type = - DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); + let data_type = DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into()); cast_from_null_to_other(&data_type); } @@ -7469,6 +7498,183 @@ mod tests { assert_eq!(&expected.value(2), &actual.value(2)); } + #[test] + fn test_cast_list_to_fsl() { + // There four noteworthy cases we should handle: + // 1. No nulls + // 2. Nulls that are always empty + // 3. Nulls that have varying lengths + // 4. Nulls that are correctly sized (same as target list size) + + // Non-null case + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let values = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(4), Some(5), Some(6)]), + ]; + let array = Arc::new(ListArray::from_iter_primitive::( + values.clone(), + )) as ArrayRef; + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + values, 3, + )) as ArrayRef; + let actual = cast(array.as_ref(), &DataType::FixedSizeList(field.clone(), 3)).unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + + // Null cases + // Array is [[1, 2, 3], null, [4, 5, 6], null] + let cases = [ + ( + // Zero-length nulls + vec![1, 2, 3, 4, 5, 6], + vec![3, 0, 3, 0], + ), + ( + // Varying-length nulls + vec![1, 2, 3, 0, 0, 4, 5, 6, 0], + vec![3, 2, 3, 1], + ), + ( + // Correctly-sized nulls + vec![1, 2, 3, 0, 0, 0, 4, 5, 6, 0, 0, 0], + vec![3, 3, 3, 3], + ), + ( + // Mixed nulls + vec![1, 2, 3, 4, 5, 6, 0, 0, 0], + vec![3, 0, 3, 3], + ), + ]; + let null_buffer = NullBuffer::from(vec![true, false, true, false]); + + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5), Some(6)]), + None, + ], + 3, + )) as ArrayRef; + + for (values, lengths) in cases.iter() { + let array = Arc::new(ListArray::new( + field.clone(), + OffsetBuffer::from_lengths(lengths.clone()), + Arc::new(Int32Array::from(values.clone())), + Some(null_buffer.clone()), + )) as ArrayRef; + let actual = cast(array.as_ref(), &DataType::FixedSizeList(field.clone(), 3)).unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + } + } + + #[test] + fn test_cast_list_to_fsl_safety() { + let values = vec![ + Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(4), Some(5)]), + Some(vec![Some(6), Some(7), Some(8), Some(9)]), + Some(vec![Some(3), Some(4), Some(5)]), + ]; + let array = 
Arc::new(ListArray::from_iter_primitive::( + values.clone(), + )) as ArrayRef; + + let res = cast_with_options( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + &CastOptions { + safe: false, + ..Default::default() + }, + ); + assert!(res.is_err()); + assert!(format!("{:?}", res) + .contains("Cannot cast to FixedSizeList(3): value at index 1 has length 2")); + + // When safe=true (default), the cast will fill nulls for lists that are + // too short and truncate lists that are too long. + let res = cast( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3), + ) + .unwrap(); + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, // Too short -> replaced with null + None, // Too long -> replaced with null + Some(vec![Some(3), Some(4), Some(5)]), + ], + 3, + )) as ArrayRef; + assert_eq!(expected.as_ref(), res.as_ref()); + } + + #[test] + fn test_cast_large_list_to_fsl() { + let values = vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3), Some(4)])]; + let array = Arc::new(LargeListArray::from_iter_primitive::( + values.clone(), + )) as ArrayRef; + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + values, 2, + )) as ArrayRef; + let actual = cast( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 2), + ) + .unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + } + + #[test] + fn test_cast_list_to_fsl_subcast() { + let array = Arc::new(LargeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(i32::MAX)]), + ], + )) as ArrayRef; + let expected = Arc::new(FixedSizeListArray::from_iter_primitive::( + vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(i32::MAX as i64)]), + ], + 2, + )) as ArrayRef; + let actual = cast( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2), + ) + .unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + + let res = cast_with_options( + array.as_ref(), + &DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int16, true)), 2), + &CastOptions { + safe: false, + ..Default::default() + }, + ); + assert!(res.is_err()); + assert!(format!("{:?}", res).contains("Can't cast value 2147483647 to type Int16")); + } + + #[test] + fn test_cast_list_to_fsl_empty() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let array = new_empty_array(&DataType::List(field.clone())); + + let target_type = DataType::FixedSizeList(field.clone(), 3); + let expected = new_empty_array(&target_type); + + let actual = cast(array.as_ref(), &target_type).unwrap(); + assert_eq!(expected.as_ref(), actual.as_ref()); + } + fn make_list_array() -> ListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -7482,8 +7688,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); // Construct a list array from the above two - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7525,10 +7730,8 @@ mod tests { .build() .unwrap(); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, true)), - 4, - ); + let list_data_type 
= + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 4); let list_data = ArrayData::builder(list_data_type) .len(2) .add_child_data(value_data) @@ -7545,10 +7748,8 @@ mod tests { .build() .unwrap(); - let list_data_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int64, true)), - 4, - ); + let list_data_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 4); let list_data = ArrayData::builder(list_data_type) .len(2) .add_child_data(value_data) @@ -7589,8 +7790,7 @@ mod tests { let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); let value_data = str_array.into_data(); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) @@ -7739,6 +7939,68 @@ mod tests { assert!(casted_array.is_err()); } + #[test] + fn test_cast_floating_point_to_decimal128_precision_overflow() { + let array = Float64Array::from(vec![1.1]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(2, 2), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(2, 2), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + let err = casted_array.unwrap_err().to_string(); + let expected_error = "Invalid argument error: 110 is too large to store in a Decimal128 of precision 2. Max is 99"; + assert!( + err.contains(expected_error), + "did not find expected error '{expected_error}' in actual error '{err}'" + ); + } + + #[test] + fn test_cast_floating_point_to_decimal256_precision_overflow() { + let array = Float64Array::from(vec![1.1]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(2, 2), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(2, 2), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + let err = casted_array.unwrap_err().to_string(); + let expected_error = "Invalid argument error: 110 is too large to store in a Decimal256 of precision 2. 
Max is 99"; + assert!( + err.contains(expected_error), + "did not find expected error '{expected_error}' in actual error '{err}'" + ); + } + #[test] fn test_cast_floating_point_to_decimal128_overflow() { let array = Float64Array::from(vec![f64::MAX]); @@ -7867,12 +8129,7 @@ mod tests { let array = vec![Some(123)]; let input_decimal_array = create_decimal_array(array, 10, -1).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; - generate_cast_test_case!( - &array, - Decimal128Array, - &output_type, - vec![Some(12_i128),] - ); + generate_cast_test_case!(&array, Decimal128Array, &output_type, vec![Some(12_i128),]); let casted_array = cast(&array, &output_type).unwrap(); let decimal_arr = casted_array.as_primitive::(); @@ -7882,12 +8139,7 @@ mod tests { let array = vec![Some(125)]; let input_decimal_array = create_decimal_array(array, 10, -1).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; - generate_cast_test_case!( - &array, - Decimal128Array, - &output_type, - vec![Some(13_i128),] - ); + generate_cast_test_case!(&array, Decimal128Array, &output_type, vec![Some(13_i128),]); let casted_array = cast(&array, &output_type).unwrap(); let decimal_arr = casted_array.as_primitive::(); @@ -8030,6 +8282,21 @@ mod tests { assert_eq!("0.00", decimal_arr.value_as_string(10)); assert_eq!("0.00", decimal_arr.value_as_string(11)); assert!(decimal_arr.is_null(12)); + assert_eq!("-1.23", decimal_arr.value_as_string(13)); + assert_eq!("-1.24", decimal_arr.value_as_string(14)); + assert_eq!("0.00", decimal_arr.value_as_string(15)); + assert_eq!("-123.00", decimal_arr.value_as_string(16)); + assert_eq!("-123.23", decimal_arr.value_as_string(17)); + assert_eq!("-0.12", decimal_arr.value_as_string(18)); + assert_eq!("1.23", decimal_arr.value_as_string(19)); + assert_eq!("1.24", decimal_arr.value_as_string(20)); + assert_eq!("0.00", decimal_arr.value_as_string(21)); + assert_eq!("123.00", decimal_arr.value_as_string(22)); + assert_eq!("123.23", decimal_arr.value_as_string(23)); + assert_eq!("0.12", decimal_arr.value_as_string(24)); + assert!(decimal_arr.is_null(25)); + assert!(decimal_arr.is_null(26)); + assert!(decimal_arr.is_null(27)); // Decimal256 let output_type = DataType::Decimal256(76, 3); @@ -8051,6 +8318,21 @@ mod tests { assert_eq!("0.000", decimal_arr.value_as_string(10)); assert_eq!("0.000", decimal_arr.value_as_string(11)); assert!(decimal_arr.is_null(12)); + assert_eq!("-1.235", decimal_arr.value_as_string(13)); + assert_eq!("-1.236", decimal_arr.value_as_string(14)); + assert_eq!("0.000", decimal_arr.value_as_string(15)); + assert_eq!("-123.000", decimal_arr.value_as_string(16)); + assert_eq!("-123.234", decimal_arr.value_as_string(17)); + assert_eq!("-0.123", decimal_arr.value_as_string(18)); + assert_eq!("1.235", decimal_arr.value_as_string(19)); + assert_eq!("1.236", decimal_arr.value_as_string(20)); + assert_eq!("0.000", decimal_arr.value_as_string(21)); + assert_eq!("123.000", decimal_arr.value_as_string(22)); + assert_eq!("123.234", decimal_arr.value_as_string(23)); + assert_eq!("0.123", decimal_arr.value_as_string(24)); + assert!(decimal_arr.is_null(25)); + assert!(decimal_arr.is_null(26)); + assert!(decimal_arr.is_null(27)); } #[test] @@ -8069,6 +8351,21 @@ mod tests { Some(""), Some(" "), None, + Some("-1.23499999"), + Some("-1.23599999"), + Some("-0.00001"), + Some("-123"), + Some("-123.234000"), + Some("-000.123"), + Some("+1.23499999"), + Some("+1.23599999"), + Some("+0.00001"), + Some("+123"), + Some("+123.234000"), + Some("+000.123"), + 
Some("1.-23499999"), + Some("-1.-23499999"), + Some("--1.23499999"), ]); let array = Arc::new(str_array) as ArrayRef; @@ -8091,6 +8388,21 @@ mod tests { Some(""), Some(" "), None, + Some("-1.23499999"), + Some("-1.23599999"), + Some("-0.00001"), + Some("-123"), + Some("-123.234000"), + Some("-000.123"), + Some("+1.23499999"), + Some("+1.23599999"), + Some("+0.00001"), + Some("+123"), + Some("+123.234000"), + Some("+000.123"), + Some("1.-23499999"), + Some("-1.-23499999"), + Some("--1.23499999"), ]); let array = Arc::new(str_array) as ArrayRef; @@ -8129,9 +8441,9 @@ mod tests { let str_array = StringArray::from(vec![". 0.123"]); let array = Arc::new(str_array) as ArrayRef; let casted_err = cast_with_options(&array, &output_type, &option).unwrap_err(); - assert!(casted_err.to_string().contains( - "Cannot cast string '. 0.123' to value of Decimal128(38, 10) type" - )); + assert!(casted_err + .to_string() + .contains("Cannot cast string '. 0.123' to value of Decimal128(38, 10) type")); } fn test_cast_string_to_decimal128_overflow(overflow_array: ArrayRef) { @@ -8152,6 +8464,32 @@ mod tests { ); } + #[test] + fn test_cast_string_to_decimal128_precision_overflow() { + let array = StringArray::from(vec!["1000".to_string()]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal128(10, 8), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let err = cast_with_options( + &array, + &DataType::Decimal128(10, 8), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + assert_eq!("Invalid argument error: 100000000000 is too large to store in a Decimal128 of precision 10. Max is 9999999999", err.unwrap_err().to_string()); + } + #[test] fn test_cast_utf8_to_decimal128_overflow() { let overflow_str_array = StringArray::from(vec![ @@ -8209,6 +8547,32 @@ mod tests { assert!(decimal_arr.is_null(6)); } + #[test] + fn test_cast_string_to_decimal256_precision_overflow() { + let array = StringArray::from(vec!["1000".to_string()]); + let array = Arc::new(array) as ArrayRef; + let casted_array = cast_with_options( + &array, + &DataType::Decimal256(10, 8), + &CastOptions { + safe: true, + format_options: FormatOptions::default(), + }, + ); + assert!(casted_array.is_ok()); + assert!(casted_array.unwrap().is_null(0)); + + let err = cast_with_options( + &array, + &DataType::Decimal256(10, 8), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + assert_eq!("Invalid argument error: 100000000000 is too large to store in a Decimal256 of precision 10. 
Max is 9999999999", err.unwrap_err().to_string()); + } + #[test] fn test_cast_utf8_to_decimal256_overflow() { let overflow_str_array = StringArray::from(vec![ @@ -8356,9 +8720,8 @@ mod tests { let tz = tz.as_ref().parse().unwrap(); - let as_tz = |v: i64| { - as_datetime_with_timezone::(v, tz).unwrap() - }; + let as_tz = + |v: i64| as_datetime_with_timezone::(v, tz).unwrap(); let as_utc = |v: &i64| as_tz(*v).naive_utc().to_string(); let as_local = |v: &i64| as_tz(*v).naive_local().to_string(); @@ -8468,8 +8831,7 @@ mod tests { None, ]; - let array256: Vec> = - array128.iter().map(|v| v.map(i256::from_i128)).collect(); + let array256: Vec> = array128.iter().map(|v| v.map(i256::from_i128)).collect(); test_decimal_to_string::( DataType::Utf8, @@ -8558,11 +8920,9 @@ mod tests { fn test_cast_from_duration_to_interval() { // from duration second to interval month day nano let array = vec![1234567]; - let casted_array = cast_from_duration_to_interval::( - array, - &CastOptions::default(), - ) - .unwrap(); + let casted_array = + cast_from_duration_to_interval::(array, &CastOptions::default()) + .unwrap(); assert_eq!( casted_array.data_type(), &DataType::Interval(IntervalUnit::MonthDayNano) @@ -8681,10 +9041,7 @@ mod tests { .as_any() .downcast_ref::>() .ok_or_else(|| { - ArrowError::ComputeError(format!( - "Failed to downcast to {}", - T::DATA_TYPE - )) + ArrowError::ComputeError(format!("Failed to downcast to {}", T::DATA_TYPE)) }) .cloned() } @@ -8722,8 +9079,7 @@ mod tests { cast_from_interval_to_duration(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); - let res = - cast_from_interval_to_duration::(&array, &fallible); + let res = cast_from_interval_to_duration::(&array, &fallible); assert!(res.is_err()); // from interval month day nano to duration microsecond @@ -8734,8 +9090,7 @@ mod tests { let array = vec![i128::MAX].into(); let casted_array = - cast_from_interval_to_duration::(&array, &nullable) - .unwrap(); + cast_from_interval_to_duration::(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); let casted_array = @@ -8766,8 +9121,7 @@ mod tests { ] .into(); let casted_array = - cast_from_interval_to_duration::(&array, &nullable) - .unwrap(); + cast_from_interval_to_duration::(&array, &nullable).unwrap(); assert!(!casted_array.is_valid(0)); assert!(!casted_array.is_valid(1)); assert!(!casted_array.is_valid(2)); @@ -8836,11 +9190,9 @@ mod tests { fn test_cast_from_interval_day_time_to_interval_month_day_nano() { // from interval day time to interval month day nano let array = vec![123]; - let casted_array = cast_from_interval_day_time_to_interval_month_day_nano( - array, - &CastOptions::default(), - ) - .unwrap(); + let casted_array = + cast_from_interval_day_time_to_interval_month_day_nano(array, &CastOptions::default()) + .unwrap(); assert_eq!( casted_array.data_type(), &DataType::Interval(IntervalUnit::MonthDayNano) @@ -8874,8 +9226,7 @@ mod tests { .map(|ts| ts / 1_000_000) .collect::>(); - let array = - TimestampMillisecondArray::from(ts_array).with_timezone("UTC".to_string()); + let array = TimestampMillisecondArray::from(ts_array).with_timezone("UTC".to_string()); let casted_array = cast(&array, &DataType::Date32).unwrap(); let date_array = casted_array.as_primitive::(); let casted_array = cast(&date_array, &DataType::Utf8).unwrap(); @@ -8902,6 +9253,26 @@ mod tests { assert_eq!(formatted.value(1).to_string(), "[[4], [null], [6]]"); } + #[test] + fn test_nested_list_cast() { + let mut builder = ListBuilder::new(ListBuilder::new(Int32Builder::new())); + 
builder.append_value([Some([Some(1), Some(2), None]), None]); + builder.append_value([None, Some([]), None]); + builder.append_null(); + builder.append_value([Some([Some(2), Some(3)])]); + let start = builder.finish(); + + let mut builder = LargeListBuilder::new(LargeListBuilder::new(Int8Builder::new())); + builder.append_value([Some([Some(1), Some(2), None]), None]); + builder.append_value([None, Some([]), None]); + builder.append_null(); + builder.append_value([Some([Some(2), Some(3)])]); + let expected = builder.finish(); + + let actual = cast(&start, expected.data_type()).unwrap(); + assert_eq!(actual.as_ref(), &expected); + } + const CAST_OPTIONS: CastOptions<'static> = CastOptions { safe: true, format_options: FormatOptions::new(), diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index d15d57cf3c05..28c29c94bbdb 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -129,10 +129,7 @@ impl<'a> FormatOptions<'a> { } /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone - pub const fn with_timestamp_tz_format( - self, - timestamp_tz_format: Option<&'a str>, - ) -> Self { + pub const fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a str>) -> Self { Self { timestamp_tz_format, ..self @@ -173,9 +170,7 @@ impl<'a> ValueFormatter<'a> { match self.formatter.format.write(self.idx, s) { Ok(_) => Ok(()), Err(FormatError::Arrow(e)) => Err(e), - Err(FormatError::Format(_)) => { - Err(ArrowError::CastError("Format error".to_string())) - } + Err(FormatError::Format(_)) => Err(ArrowError::CastError("Format error".to_string())), } } @@ -260,10 +255,7 @@ impl<'a> ArrayFormatter<'a> { /// Returns an [`ArrayFormatter`] that can be used to format `array` /// /// This returns an error if an array of the given data type cannot be formatted - pub fn try_new( - array: &'a dyn Array, - options: &FormatOptions<'a>, - ) -> Result { + pub fn try_new(array: &'a dyn Array, options: &FormatOptions<'a>) -> Result { Ok(Self { format: make_formatter(array, options)?, safe: options.safe, @@ -399,8 +391,15 @@ impl<'a> DisplayIndex for &'a BooleanArray { } } -impl<'a> DisplayIndex for &'a NullArray { - fn write(&self, _idx: usize, _f: &mut dyn Write) -> FormatResult { +impl<'a> DisplayIndexState<'a> for &'a NullArray { + type State = &'a str; + + fn prepare(&self, options: &FormatOptions<'a>) -> Result { + Ok(options.null) + } + + fn write(&self, state: &Self::State, _idx: usize, f: &mut dyn Write) -> FormatResult { + f.write_str(state)?; Ok(()) } } @@ -465,9 +464,7 @@ fn write_timestamp( let date = Utc.from_utc_datetime(&naive).with_timezone(&tz); match format { Some(s) => write!(f, "{}", date.format(s))?, - None => { - write!(f, "{}", date.to_rfc3339_opts(SecondsFormat::AutoSi, true))? - } + None => write!(f, "{}", date.to_rfc3339_opts(SecondsFormat::AutoSi, true))?, } } None => match format { @@ -519,19 +516,11 @@ macro_rules! temporal_display { impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { type State = TimeFormat<'a>; - fn prepare( - &self, - options: &FormatOptions<'a>, - ) -> Result { + fn prepare(&self, options: &FormatOptions<'a>) -> Result { Ok(options.$format) } - fn write( - &self, - fmt: &Self::State, - idx: usize, - f: &mut dyn Write, - ) -> FormatResult { + fn write(&self, fmt: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); let naive = $convert(value as _).ok_or_else(|| { ArrowError::CastError(format!( @@ -568,19 +557,11 @@ macro_rules! 
duration_display { impl<'a> DisplayIndexState<'a> for &'a PrimitiveArray<$t> { type State = DurationFormat; - fn prepare( - &self, - options: &FormatOptions<'a>, - ) -> Result { + fn prepare(&self, options: &FormatOptions<'a>) -> Result { Ok(options.duration_format) } - fn write( - &self, - fmt: &Self::State, - idx: usize, - f: &mut dyn Write, - ) -> FormatResult { + fn write(&self, fmt: &Self::State, idx: usize, f: &mut dyn Write) -> FormatResult { let v = self.value(idx); match fmt { DurationFormat::ISO8601 => write!(f, "{}", $convert(v))?, @@ -697,8 +678,7 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value: u128 = self.value(idx) as u128; - let months_part: i32 = - ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; @@ -930,10 +910,7 @@ impl<'a> DisplayIndexState<'a> for &'a UnionArray { /// suitable for converting large arrays or record batches. /// /// Please see [`ArrayFormatter`] for a more performant interface -pub fn array_value_to_string( - column: &dyn Array, - row: usize, -) -> Result { +pub fn array_value_to_string(column: &dyn Array, row: usize) -> Result { let options = FormatOptions::default().with_display_error(true); let formatter = ArrayFormatter::try_new(column, &options)?; Ok(formatter.value(row).to_string()) @@ -979,12 +956,9 @@ mod tests { // [[a, b, c], [d, e, f], [g, h]] let entry_offsets = [0, 3, 6, 8]; - let map_array = MapArray::new_from_strings( - keys.clone().into_iter(), - &values_data, - &entry_offsets, - ) - .unwrap(); + let map_array = + MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets) + .unwrap(); assert_eq!( "{d: 30, e: 40, f: 50}", array_value_to_string(&map_array, 1).unwrap() @@ -999,8 +973,7 @@ mod tests { #[test] fn test_array_value_to_string_duration() { let iso_fmt = FormatOptions::new(); - let pretty_fmt = - FormatOptions::new().with_duration_format(DurationFormat::Pretty); + let pretty_fmt = FormatOptions::new().with_duration_format(DurationFormat::Pretty); let array = DurationNanosecondArray::from(vec![ 1, @@ -1098,4 +1071,12 @@ mod tests { assert_eq!(iso[5], "-P45DT50554S"); assert_eq!(pretty[5], "-45 days -14 hours -2 mins -34 secs"); } + + #[test] + fn test_null() { + let array = NullArray::new(2); + let options = FormatOptions::new().with_null("NULL"); + let formatted = format_array(&array, &options); + assert_eq!(formatted, &["NULL".to_string(), "NULL".to_string()]) + } } diff --git a/arrow-cast/src/lib.rs b/arrow-cast/src/lib.rs index d2677a0e0a53..71ebe6c0ed8b 100644 --- a/arrow-cast/src/lib.rs +++ b/arrow-cast/src/lib.rs @@ -21,6 +21,7 @@ pub mod cast; pub use cast::*; pub mod display; pub mod parse; - #[cfg(feature = "prettyprint")] pub mod pretty; + +pub mod base64; diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index ac3b89e0ba02..750f38006d33 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -64,10 +64,7 @@ impl TimestampParser { /// Parses a date of the form `1997-01-31` fn date(&self) -> Option { - if self.mask & 0b1111111111 != 0b1101101111 - || !self.test(4, b'-') - || !self.test(7, b'-') - { + if self.mask & 0b1111111111 != 0b1101101111 || !self.test(4, b'-') || !self.test(7, b'-') { return None; } @@ -173,13 +170,9 @@ impl TimestampParser { /// * 
"2023-01-01 04:05:06.789 PST", /// /// [IANA timezones]: https://www.iana.org/time-zones -pub fn string_to_datetime( - timezone: &T, - s: &str, -) -> Result, ArrowError> { - let err = |ctx: &str| { - ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) - }; +pub fn string_to_datetime(timezone: &T, s: &str) -> Result, ArrowError> { + let err = + |ctx: &str| ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")); let bytes = s.as_bytes(); if bytes.len() < 10 { @@ -277,16 +270,11 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result { to_timestamp_nanos(string_to_datetime(&Utc, s)?.naive_utc()) } -/// Defensive check to prevent chrono-rs panics when nanosecond conversion happens on non-supported dates +/// Fallible conversion of [`NaiveDateTime`] to `i64` nanoseconds #[inline] fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { - if dt.timestamp().checked_mul(1_000_000_000).is_none() { - return Err(ArrowError::ParseError( - ERR_NANOSECONDS_NOT_SUPPORTED.to_string(), - )); - } - - Ok(dt.timestamp_nanos()) + dt.timestamp_nanos_opt() + .ok_or_else(|| ArrowError::ParseError(ERR_NANOSECONDS_NOT_SUPPORTED.to_string())) } /// Accepts a string in ISO8601 standard format and some @@ -305,9 +293,8 @@ fn to_timestamp_nanos(dt: NaiveDateTime) -> Result { /// This function does not support parsing strings with a timezone /// or offset specified, as it considers only time since midnight. pub fn string_to_time_nanoseconds(s: &str) -> Result { - let nt = string_to_time(s).ok_or_else(|| { - ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")) - })?; + let nt = string_to_time(s) + .ok_or_else(|| ArrowError::ParseError(format!("Failed to parse \'{s}\' as time")))?; Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) } @@ -318,12 +305,8 @@ fn string_to_time(s: &str) -> Option { } let (am, bytes) = match bytes.get(bytes.len() - 3..) 
{ - Some(b" AM" | b" am" | b" Am" | b" aM") => { - (Some(true), &bytes[..bytes.len() - 3]) - } - Some(b" PM" | b" pm" | b" pM" | b" Pm") => { - (Some(false), &bytes[..bytes.len() - 3]) - } + Some(b" AM" | b" am" | b" Am" | b" aM") => (Some(true), &bytes[..bytes.len() - 3]), + Some(b" PM" | b" pm" | b" pM" | b" Pm") => (Some(false), &bytes[..bytes.len() - 3]), _ => (None, bytes), }; @@ -506,10 +489,7 @@ impl Parser for Time64NanosecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000_000 - + nt.nanosecond() as i64, - ) + Some(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64) } } @@ -524,10 +504,7 @@ impl Parser for Time64MicrosecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i64 * 1_000_000 - + nt.nanosecond() as i64 / 1_000, - ) + Some(nt.num_seconds_from_midnight() as i64 * 1_000_000 + nt.nanosecond() as i64 / 1_000) } } @@ -542,10 +519,7 @@ impl Parser for Time32MillisecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 * 1_000 - + nt.nanosecond() as i32 / 1_000_000, - ) + Some(nt.num_seconds_from_midnight() as i32 * 1_000 + nt.nanosecond() as i32 / 1_000_000) } } @@ -560,10 +534,7 @@ impl Parser for Time32SecondType { fn parse_formatted(string: &str, format: &str) -> Option { let nt = NaiveTime::parse_from_str(string, format).ok()?; - Some( - nt.num_seconds_from_midnight() as i32 - + nt.nanosecond() as i32 / 1_000_000_000, - ) + Some(nt.num_seconds_from_midnight() as i32 + nt.nanosecond() as i32 / 1_000_000_000) } } @@ -588,8 +559,20 @@ fn parse_date(string: &str) -> Option { const HYPHEN: u8 = b'-'.wrapping_sub(b'0'); + // refer to https://www.rfc-editor.org/rfc/rfc3339#section-3 if digits[4] != HYPHEN { - return None; + let (year, month, day) = match (mask, string.len()) { + (0b11111111, 8) => ( + digits[0] as u16 * 1000 + + digits[1] as u16 * 100 + + digits[2] as u16 * 10 + + digits[3] as u16, + digits[4] * 10 + digits[5], + digits[6] * 10 + digits[7], + ), + _ => return None, + }; + return NaiveDate::from_ymd_opt(year as _, month as _, day as _); } let (month, day) = match mask { @@ -620,10 +603,8 @@ fn parse_date(string: &str) -> Option { _ => return None, }; - let year = digits[0] as u16 * 1000 - + digits[1] as u16 * 100 - + digits[2] as u16 * 10 - + digits[3] as u16; + let year = + digits[0] as u16 * 1000 + digits[1] as u16 * 100 + digits[2] as u16 * 10 + digits[3] as u16; NaiveDate::from_ymd_opt(year as _, month as _, day as _) } @@ -733,8 +714,7 @@ pub fn parse_decimal( fractionals += 1; digits += 1; result = result.mul_wrapping(base); - result = - result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); + result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); } // Fail on "." @@ -776,9 +756,11 @@ pub fn parse_interval_year_month( let config = IntervalParseConfig::new(IntervalUnit::Year); let interval = Interval::parse(value, &config)?; - let months = interval.to_year_months().map_err(|_| ArrowError::CastError(format!( + let months = interval.to_year_months().map_err(|_| { + ArrowError::CastError(format!( "Cannot cast {value} to IntervalYearMonth. Only year and month fields are allowed." 
- )))?; + )) + })?; Ok(IntervalYearMonthType::make_value(0, months)) } @@ -893,21 +875,16 @@ impl FromStr for IntervalAmount { Ok(0) } else { integer.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) + ArrowError::ParseError(format!("Failed to parse {s} as interval amount")) }) }?; let frac_unscaled = frac.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) + ArrowError::ParseError(format!("Failed to parse {s} as interval amount")) })?; // scale fractional part by interval precision - let frac = - frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); + let frac = frac_unscaled * 10_i64.pow(INTERVAL_PRECISION - frac.len() as u32); // propagate the sign of the integer part to the fractional part let frac = if integer < 0 || explicit_neg { @@ -920,9 +897,9 @@ impl FromStr for IntervalAmount { Ok(result) } - Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError( - format!("Failed to parse {s} as interval amount"), - )), + Some((_, frac)) if frac.starts_with('-') => Err(ArrowError::ParseError(format!( + "Failed to parse {s} as interval amount" + ))), Some((_, frac)) if frac.len() > INTERVAL_PRECISION as usize => { Err(ArrowError::ParseError(format!( "{s} exceeds the precision available for interval amount" @@ -930,9 +907,7 @@ impl FromStr for IntervalAmount { } Some(_) | None => { let integer = s.parse::().map_err(|_| { - ArrowError::ParseError(format!( - "Failed to parse {s} as interval amount" - )) + ArrowError::ParseError(format!("Failed to parse {s} as interval amount")) })?; let result = Self { integer, frac: 0 }; @@ -1010,25 +985,20 @@ impl Interval { /// e.g. INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days /// e.g. INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours /// [Postgres reference](https://www.postgresql.org/docs/15/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.) - fn add( - &self, - amount: IntervalAmount, - unit: IntervalUnit, - ) -> Result { + fn add(&self, amount: IntervalAmount, unit: IntervalUnit) -> Result { let result = match unit { IntervalUnit::Century => { let months_int = amount.integer.mul_checked(100)?.mul_checked(12)?; let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 2); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} centuries as months in a signed 32-bit integer", - &amount.integer - )) - })?; + let months = months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} centuries as months in a signed 32-bit integer", + &amount.integer + )) + })?; Self::new(self.months.add_checked(months)?, self.days, self.nanos) } @@ -1036,32 +1006,30 @@ impl Interval { let months_int = amount.integer.mul_checked(10)?.mul_checked(12)?; let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION - 1); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} decades as months in a signed 32-bit integer", - &amount.integer - )) - })?; + let months = months_int + .add_checked(month_frac)? 
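+                    // Narrow the i64 month total to i32; overflow becomes a
+                    // ParseError instead of silently wrapping.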
+ .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} decades as months in a signed 32-bit integer", + &amount.integer + )) + })?; Self::new(self.months.add_checked(months)?, self.days, self.nanos) } IntervalUnit::Year => { let months_int = amount.integer.mul_checked(12)?; let month_frac = amount.frac * 12 / 10_i64.pow(INTERVAL_PRECISION); - let months = - months_int - .add_checked(month_frac)? - .try_into() - .map_err(|_| { - ArrowError::ParseError(format!( - "Unable to represent {} years as months in a signed 32-bit integer", - &amount.integer - )) - })?; + let months = months_int + .add_checked(month_frac)? + .try_into() + .map_err(|_| { + ArrowError::ParseError(format!( + "Unable to represent {} years as months in a signed 32-bit integer", + &amount.integer + )) + })?; Self::new(self.months.add_checked(months)?, self.days, self.nanos) } @@ -1095,8 +1063,7 @@ impl Interval { )) })?; - let nanos = - amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = amount.frac * 7 * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); Self::new( self.months, @@ -1112,8 +1079,7 @@ impl Interval { )) })?; - let nanos = - amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos = amount.frac * 24 * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); Self::new( self.months, @@ -1123,8 +1089,7 @@ impl Interval { } IntervalUnit::Hour => { let nanos_int = amount.integer.mul_checked(NANOS_PER_HOUR)?; - let nanos_frac = - amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); + let nanos_frac = amount.frac * 6 * 6 / 10_i64.pow(INTERVAL_PRECISION - 11); let nanos = nanos_int.add_checked(nanos_frac)?; Interval::new(self.months, self.days, self.nanos.add_checked(nanos)?) @@ -1313,12 +1278,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29.190855").unwrap() ); assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29.190855").unwrap() ); @@ -1331,12 +1296,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), + naive_datetime_whole_secs.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29").unwrap() ); assert_eq!( - naive_datetime_whole_secs.timestamp_nanos(), + naive_datetime_whole_secs.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29").unwrap() ); @@ -1349,7 +1314,7 @@ mod tests { ); assert_eq!( - naive_datetime_no_time.timestamp_nanos(), + naive_datetime_no_time.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08").unwrap() ) } @@ -1403,8 +1368,7 @@ mod tests { "2030-12-04T17:11:10.123456", ]; for case in cases { - let chrono = - NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); + let chrono = NaiveDateTime::parse_from_str(case, "%Y-%m-%dT%H:%M:%S%.f").unwrap(); let custom = string_to_datetime(&Utc, case).unwrap(); assert_eq!(chrono, custom.naive_utc()) } @@ -1436,8 +1400,7 @@ mod tests { ]; for (s, ctx) in cases { - let expected = - format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let expected = format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); assert_eq!(actual, expected) } @@ -1463,12 +1426,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - 
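        // chrono has deprecated the panicking `timestamp_nanos` in favor of
        // the fallible `timestamp_nanos_opt`; the unwraps here are fine since
        // every test value fits in 64-bit signed nanoseconds.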
naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29.190855").unwrap() ); assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29.190855").unwrap() ); @@ -1479,12 +1442,12 @@ mod tests { // Ensure both T and ' ' variants work assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08T13:42:29").unwrap() ); assert_eq!( - naive_datetime.timestamp_nanos(), + naive_datetime.timestamp_nanos_opt().unwrap(), parse_timestamp("2020-09-08 13:42:29").unwrap() ); @@ -1502,8 +1465,7 @@ mod tests { assert_eq!(local, "2020-09-08 15:42:29"); let dt = - NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ") - .unwrap(); + NaiveDateTime::parse_from_str("2020-09-08T13:42:29Z", "%Y-%m-%dT%H:%M:%SZ").unwrap(); let local: Tz = "+08:00".parse().unwrap(); // Parsed as offset from UTC @@ -1634,10 +1596,7 @@ mod tests { // custom format assert_eq!( - Time64NanosecondType::parse_formatted( - "02 - 10 - 01 - .1234567", - "%H - %M - %S - %.f" - ), + Time64NanosecondType::parse_formatted("02 - 10 - 01 - .1234567", "%H - %M - %S - %.f"), Some(7_801_123_456_700) ); } @@ -1714,10 +1673,7 @@ mod tests { // custom format assert_eq!( - Time64MicrosecondType::parse_formatted( - "02 - 10 - 01 - .1234", - "%H - %M - %S - %.f" - ), + Time64MicrosecondType::parse_formatted("02 - 10 - 01 - .1234", "%H - %M - %S - %.f"), Some(7_801_123_400) ); } @@ -1764,10 +1720,7 @@ mod tests { // custom format assert_eq!( - Time32MillisecondType::parse_formatted( - "02 - 10 - 01 - .1", - "%H - %M - %S - %.f" - ), + Time32MillisecondType::parse_formatted("02 - 10 - 01 - .1", "%H - %M - %S - %.f"), Some(7_801_100) ); } @@ -2010,8 +1963,19 @@ mod tests { ); assert_eq!( - Interval::new(-13i32, -8i32, -NANOS_PER_HOUR - NANOS_PER_MINUTE - NANOS_PER_SECOND - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64), - Interval::parse("-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", &config).unwrap(), + Interval::new( + -13i32, + -8i32, + -NANOS_PER_HOUR + - NANOS_PER_MINUTE + - NANOS_PER_SECOND + - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64 + ), + Interval::parse( + "-1 year -1 month -1 week -1 day -1 hour -1 minute -1 second -1.11 millisecond", + &config + ) + .unwrap(), ); } @@ -2285,22 +2249,34 @@ mod tests { let edge_tests_256 = [ ( "9999999999999999999999999999999999999999999999999999999999999999999999999999", -i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), 0, ), ( "999999999999999999999999999999999999999999999999999999999999999999999999.9999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), 4, ), ( "99999999999999999999999999999999999999999999999999.99999999999999999999999999", - i256::from_string("9999999999999999999999999999999999999999999999999999999999999999999999999999").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999999999999999999999999999999", + ) + .unwrap(), 26, ), ( "99999999999999999999999999999999999999999999999999", - 
i256::from_string("9999999999999999999999999999999999999999999999999900000000000000000000000000").unwrap(), + i256::from_string( + "9999999999999999999999999999999999999999999999999900000000000000000000000000", + ) + .unwrap(), 26, ), ]; diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 59a9f9d605e2..550afa9f739d 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -25,9 +25,7 @@ use comfy_table::{Cell, Table}; use std::fmt::Display; /// Create a visual representation of record batches -pub fn pretty_format_batches( - results: &[RecordBatch], -) -> Result { +pub fn pretty_format_batches(results: &[RecordBatch]) -> Result { let options = FormatOptions::default().with_display_error(true); pretty_format_batches_with_options(results, &options) } @@ -70,10 +68,7 @@ pub fn print_columns(col_name: &str, results: &[ArrayRef]) -> Result<(), ArrowEr } /// Convert a series of record batches into a table -fn create_table( - results: &[RecordBatch], - options: &FormatOptions, -) -> Result { +fn create_table(results: &[RecordBatch], options: &FormatOptions) -> Result { let mut table = Table::new(); table.load_preset("||--+-++| ++++++"); @@ -209,8 +204,8 @@ mod tests { let table = pretty_format_columns("a", &columns).unwrap().to_string(); let expected = vec![ - "+---+", "| a |", "+---+", "| a |", "| b |", "| |", "| d |", "| e |", - "| |", "| g |", "+---+", + "+---+", "| a |", "+---+", "| a |", "| b |", "| |", "| d |", "| e |", "| |", + "| g |", "+---+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -289,10 +284,8 @@ mod tests { #[test] fn test_pretty_format_fixed_size_list() { // define a schema. - let field_type = DataType::FixedSizeList( - Arc::new(Field::new("item", DataType::Int32, true)), - 3, - ); + let field_type = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, true)), 3); let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); let keys_builder = Int32Array::builder(3); @@ -383,10 +376,7 @@ mod tests { }; } - fn timestamp_batch( - timezone: &str, - value: T::Native, - ) -> RecordBatch { + fn timestamp_batch(timezone: &str, value: T::Native) -> RecordBatch { let mut builder = PrimitiveBuilder::::with_capacity(10); builder.append_value(value); builder.append_null(); @@ -621,8 +611,8 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+------+", "| f |", "+------+", "| 101 |", "| |", "| 200 |", - "| 3040 |", "+------+", + "+------+", "| f |", "+------+", "| 101 |", "| |", "| 200 |", "| 3040 |", + "+------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -660,16 +650,14 @@ mod tests { )), Arc::new(StructArray::from(vec![( Arc::new(Field::new("c121", DataType::Utf8, false)), - Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) - as ArrayRef, + Arc::new(StringArray::from(vec![Some("e"), Some("f"), Some("g")])) as ArrayRef, )])) as ArrayRef, ), ]); let c2 = StringArray::from(vec![Some("a"), Some("b"), Some("c")]); let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]) - .unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ @@ -705,8 +693,7 @@ mod tests { UnionMode::Dense, )]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); let table = 
pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ @@ -742,8 +729,7 @@ mod tests { UnionMode::Sparse, )]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(union)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ @@ -799,8 +785,7 @@ mod tests { UnionMode::Sparse, )]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(outer)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(outer)]).unwrap(); let table = pretty_format_batches(&[batch]).unwrap().to_string(); let actual: Vec<&str> = table.lines().collect(); let expected = vec![ @@ -882,8 +867,7 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+------+", "| f16 |", "+------+", "| NaN |", "| 4 |", "| -inf |", - "+------+", + "+------+", "| f16 |", "+------+", "| NaN |", "| 4 |", "| -inf |", "+------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ -986,9 +970,7 @@ mod tests { fn test_format_options() { let options = FormatOptions::default().with_null("null"); let array = Int32Array::from(vec![Some(1), Some(2), None, Some(3), Some(4)]); - let batch = - RecordBatch::try_from_iter([("my_column_name", Arc::new(array) as _)]) - .unwrap(); + let batch = RecordBatch::try_from_iter([("my_column_name", Arc::new(array) as _)]).unwrap(); let column = pretty_format_columns_with_options( "my_column_name", diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 1f1a762d5065..66a6d7dbcaa5 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -39,7 +39,7 @@ arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } -chrono = { version = "0.4.23", default-features = false, features = ["clock"] } +chrono = { workspace = true } csv = { version = "1.1", default-features = false } csv-core = { version = "0.1" } lazy_static = { version = "1.4", default-features = false } diff --git a/arrow-csv/examples/README.md b/arrow-csv/examples/README.md new file mode 100644 index 000000000000..340413e76d94 --- /dev/null +++ b/arrow-csv/examples/README.md @@ -0,0 +1,21 @@ + + +# Examples +- [`csv_calculation.rs`](csv_calculation.rs): performs a simple calculation using the CSV reader \ No newline at end of file diff --git a/arrow-csv/examples/csv_calculation.rs b/arrow-csv/examples/csv_calculation.rs new file mode 100644 index 000000000000..6ce963e2b012 --- /dev/null +++ b/arrow-csv/examples/csv_calculation.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::cast::AsArray; +use arrow_array::types::Int16Type; +use arrow_csv::ReaderBuilder; + +use arrow_schema::{DataType, Field, Schema}; +use std::fs::File; +use std::sync::Arc; + +fn main() { + // read csv from file + let file = File::open("arrow-csv/test/data/example.csv").unwrap(); + let csv_schema = Schema::new(vec![ + Field::new("c1", DataType::Int16, true), + Field::new("c2", DataType::Float32, true), + Field::new("c3", DataType::Utf8, true), + Field::new("c4", DataType::Boolean, true), + ]); + let mut reader = ReaderBuilder::new(Arc::new(csv_schema)) + .with_header(true) + .build(file) + .unwrap(); + + match reader.next() { + Some(r) => match r { + Ok(r) => { + // get the column(0) max value + let col = r.column(0).as_primitive::(); + let max = col.iter().max().flatten(); + println!("max value column(0): {max:?}") + } + Err(e) => { + println!("{e:?}"); + } + }, + None => { + println!("csv is empty"); + } + } +} diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 328c2cd41f3b..83c8965fdf8a 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -133,8 +133,8 @@ use arrow_schema::*; use chrono::{TimeZone, Utc}; use csv::StringRecord; use lazy_static::lazy_static; -use regex::RegexSet; -use std::fmt; +use regex::{Regex, RegexSet}; +use std::fmt::{self, Debug}; use std::fs::File; use std::io::{BufRead, BufReader as StdBufReader, Read, Seek, SeekFrom}; use std::sync::Arc; @@ -157,6 +157,22 @@ lazy_static! { ]).unwrap(); } +/// A wrapper over `Option` to check if the value is `NULL`. +#[derive(Debug, Clone, Default)] +struct NullRegex(Option); + +impl NullRegex { + /// Returns true if the value should be considered as `NULL` according to + /// the provided regular expression. 
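+    //
+    // Illustrative sketch of the rule (not doctested):
+    //
+    //     NullRegex(None).is_null("")      -> true   (default: empty string)
+    //     NullRegex(None).is_null("nil")   -> false
+    //     NullRegex(Some(Regex::new("^nil$").unwrap())).is_null("nil") -> true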
+ #[inline] + fn is_null(&self, s: &str) -> bool { + match &self.0 { + Some(r) => r.is_match(s), + None => s.is_empty(), + } + } +} + #[derive(Default, Copy, Clone)] struct InferredDataType { /// Packed booleans indicating type @@ -177,6 +193,7 @@ impl InferredDataType { /// Returns the inferred data type fn get(&self) -> DataType { match self.packed { + 0 => DataType::Null, 1 => DataType::Boolean, 2 => DataType::Int64, 4 | 6 => DataType::Float64, // Promote Int64 to Float64 @@ -208,16 +225,17 @@ impl InferredDataType { /// The format specification for the CSV file #[derive(Debug, Clone, Default)] pub struct Format { - has_header: bool, + header: bool, delimiter: Option, escape: Option, quote: Option, terminator: Option, + null_regex: NullRegex, } impl Format { pub fn with_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + self.header = has_header; self } @@ -241,6 +259,12 @@ impl Format { self } + /// Provide a regex to match null values, defaults to `^$` + pub fn with_null_regex(mut self, null_regex: Regex) -> Self { + self.null_regex = NullRegex(Some(null_regex)); + self + } + /// Infer schema of CSV records from the provided `reader` /// /// If `max_records` is `None`, all records will be read, otherwise up to `max_records` @@ -256,7 +280,7 @@ impl Format { // get or create header names // when has_header is false, creates default column names with column_ prefix - let headers: Vec = if self.has_header { + let headers: Vec = if self.header { let headers = &csv_reader.headers().map_err(map_csv_error)?.clone(); headers.iter().map(|s| s.to_string()).collect() } else { @@ -268,8 +292,7 @@ impl Format { let header_length = headers.len(); // keep track of inferred field types - let mut column_types: Vec = - vec![Default::default(); header_length]; + let mut column_types: Vec = vec![Default::default(); header_length]; let mut records_count = 0; @@ -283,11 +306,9 @@ impl Format { // Note since we may be looking at a sample of the data, we make the safe assumption that // they could be nullable - for (i, column_type) in - column_types.iter_mut().enumerate().take(header_length) - { + for (i, column_type) in column_types.iter_mut().enumerate().take(header_length) { if let Some(string) = record.get(i) { - if !string.is_empty() { + if !self.null_regex.is_null(string) { column_type.update(string) } } @@ -307,7 +328,7 @@ impl Format { /// Build a [`csv::Reader`] for this [`Format`] fn build_reader(&self, reader: R) -> csv::Reader { let mut builder = csv::ReaderBuilder::new(); - builder.has_headers(self.has_header); + builder.has_headers(self.header); if let Some(c) = self.delimiter { builder.delimiter(c); @@ -379,7 +400,7 @@ pub fn infer_reader_schema( ) -> Result<(Schema, usize), ArrowError> { let format = Format { delimiter: Some(delimiter), - has_header, + header: has_header, ..Default::default() }; format.infer_schema(reader, max_read_records) @@ -401,7 +422,7 @@ pub fn infer_schema_from_files( let mut records_to_read = max_read_records.unwrap_or(usize::MAX); let format = Format { delimiter: Some(delimiter), - has_header, + header: has_header, ..Default::default() }; @@ -557,6 +578,9 @@ pub struct Decoder { /// A decoder for [`StringRecords`] record_decoder: RecordDecoder, + + /// Check if the string matches this pattern for `NULL`. 
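+    /// Copied from [`Format`] when the decoder is built, so decoding can
+    /// apply the same null rule that schema inference used.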
+ null_regex: NullRegex, } impl Decoder { @@ -579,8 +603,7 @@ impl Decoder { return Ok(bytes); } - let to_read = - self.batch_size.min(self.end - self.line_number) - self.record_decoder.len(); + let to_read = self.batch_size.min(self.end - self.line_number) - self.record_decoder.len(); let (_, bytes) = self.record_decoder.decode(buf, to_read)?; Ok(bytes) } @@ -603,6 +626,7 @@ impl Decoder { Some(self.schema.metadata.clone()), self.projection.as_ref(), self.line_number, + &self.null_regex, )?; self.line_number += rows.len(); Ok(Some(batch)) @@ -621,6 +645,7 @@ fn parse( metadata: Option>, projection: Option<&Vec>, line_number: usize, + null_regex: &NullRegex, ) -> Result { let projection: Vec = match projection { Some(v) => v.clone(), @@ -633,70 +658,70 @@ fn parse( let i = *i; let field = &fields[i]; match field.data_type() { - DataType::Boolean => build_boolean_array(line_number, rows, i), - DataType::Decimal128(precision, scale) => { - build_decimal_array::( - line_number, - rows, - i, - *precision, - *scale, - ) - } - DataType::Decimal256(precision, scale) => { - build_decimal_array::( - line_number, - rows, - i, - *precision, - *scale, - ) + DataType::Boolean => build_boolean_array(line_number, rows, i, null_regex), + DataType::Decimal128(precision, scale) => build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + null_regex, + ), + DataType::Decimal256(precision, scale) => build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + null_regex, + ), + DataType::Int8 => { + build_primitive_array::(line_number, rows, i, null_regex) } - DataType::Int8 => build_primitive_array::(line_number, rows, i), DataType::Int16 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Int32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Int64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt8 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt16 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::UInt64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Float32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Float64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Date32 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Date64 => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Time32(TimeUnit::Second) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Time32(TimeUnit::Millisecond) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Time64(TimeUnit::Microsecond) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, 
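                // every build_* helper now takes the null matcher instead of
                // hard-coding "empty string means NULL"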
rows, i, null_regex) } DataType::Time64(TimeUnit::Nanosecond) => { - build_primitive_array::(line_number, rows, i) + build_primitive_array::(line_number, rows, i, null_regex) } DataType::Timestamp(TimeUnit::Second, tz) => { build_timestamp_array::( @@ -704,6 +729,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Timestamp(TimeUnit::Millisecond, tz) => { @@ -712,6 +738,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Timestamp(TimeUnit::Microsecond, tz) => { @@ -720,6 +747,7 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } DataType::Timestamp(TimeUnit::Nanosecond, tz) => { @@ -728,11 +756,16 @@ fn parse( rows, i, tz.as_deref(), + null_regex, ) } + DataType::Null => Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef), DataType::Utf8 => Ok(Arc::new( rows.iter() - .map(|row| Some(row.get(i))) + .map(|row| { + let s = row.get(i); + (!null_regex.is_null(s)).then_some(s) + }) .collect::(), ) as ArrayRef), DataType::Dictionary(key_type, value_type) @@ -791,8 +824,7 @@ fn parse( }) .collect(); - let projected_fields: Fields = - projection.iter().map(|i| fields[*i].clone()).collect(); + let projected_fields: Fields = projection.iter().map(|i| fields[*i].clone()).collect(); let projected_schema = Arc::new(match metadata { None => Schema::new(projected_fields), @@ -827,16 +859,16 @@ fn build_decimal_array( col_idx: usize, precision: u8, scale: i8, + null_regex: &NullRegex, ) -> Result { let mut decimal_builder = PrimitiveBuilder::::with_capacity(rows.len()); for row in rows.iter() { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { // append null decimal_builder.append_null(); } else { - let decimal_value: Result = - parse_decimal::(s, precision, scale); + let decimal_value: Result = parse_decimal::(s, precision, scale); match decimal_value { Ok(v) => { decimal_builder.append_value(v); @@ -859,12 +891,13 @@ fn build_primitive_array( line_number: usize, rows: &StringRecords<'_>, col_idx: usize, + null_regex: &NullRegex, ) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { return Ok(None); } @@ -888,14 +921,15 @@ fn build_timestamp_array( rows: &StringRecords<'_>, col_idx: usize, timezone: Option<&str>, + null_regex: &NullRegex, ) -> Result { Ok(Arc::new(match timezone { Some(timezone) => { let tz: Tz = timezone.parse()?; - build_timestamp_array_impl::(line_number, rows, col_idx, &tz)? + build_timestamp_array_impl::(line_number, rows, col_idx, &tz, null_regex)? 
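                // then re-attach the caller-supplied timezone string to the
                // resulting array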
.with_timezone(timezone) } - None => build_timestamp_array_impl::(line_number, rows, col_idx, &Utc)?, + None => build_timestamp_array_impl::(line_number, rows, col_idx, &Utc, null_regex)?, })) } @@ -904,29 +938,36 @@ fn build_timestamp_array_impl( rows: &StringRecords<'_>, col_idx: usize, timezone: &Tz, + null_regex: &NullRegex, ) -> Result, ArrowError> { rows.iter() .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { return Ok(None); } - let date = string_to_datetime(timezone, s).map_err(|e| { - ArrowError::ParseError(format!( - "Error parsing column {col_idx} at line {}: {}", - line_number + row_index, - e - )) - })?; - - Ok(Some(match T::UNIT { - TimeUnit::Second => date.timestamp(), - TimeUnit::Millisecond => date.timestamp_millis(), - TimeUnit::Microsecond => date.timestamp_micros(), - TimeUnit::Nanosecond => date.timestamp_nanos(), - })) + let date = string_to_datetime(timezone, s) + .and_then(|date| match T::UNIT { + TimeUnit::Second => Ok(date.timestamp()), + TimeUnit::Millisecond => Ok(date.timestamp_millis()), + TimeUnit::Microsecond => Ok(date.timestamp_micros()), + TimeUnit::Nanosecond => date.timestamp_nanos_opt().ok_or_else(|| { + ArrowError::ParseError(format!( + "{} would overflow 64-bit signed nanoseconds", + date.to_rfc3339(), + )) + }), + }) + .map_err(|e| { + ArrowError::ParseError(format!( + "Error parsing column {col_idx} at line {}: {}", + line_number + row_index, + e + )) + })?; + Ok(Some(date)) }) .collect() } @@ -936,12 +977,13 @@ fn build_boolean_array( line_number: usize, rows: &StringRecords<'_>, col_idx: usize, + null_regex: &NullRegex, ) -> Result { rows.iter() .enumerate() .map(|(row_index, row)| { let s = row.get(col_idx); - if s.is_empty() { + if null_regex.is_null(s) { return Ok(None); } let parsed = parse_bool(s); @@ -1010,8 +1052,16 @@ impl ReaderBuilder { } /// Set whether the CSV file has headers + #[deprecated(note = "Use with_header")] + #[doc(hidden)] pub fn has_header(mut self, has_header: bool) -> Self { - self.format.has_header = has_header; + self.format.header = has_header; + self + } + + /// Set whether the CSV file has a header + pub fn with_header(mut self, has_header: bool) -> Self { + self.format.header = has_header; self } @@ -1042,6 +1092,12 @@ impl ReaderBuilder { self } + /// Provide a regex to match null values, defaults to `^$` + pub fn with_null_regex(mut self, null_regex: Regex) -> Self { + self.format.null_regex = NullRegex(Some(null_regex)); + self + } + /// Set the batch size (number of records to load at one time) pub fn with_batch_size(mut self, batch_size: usize) -> Self { self.batch_size = batch_size; @@ -1070,10 +1126,7 @@ impl ReaderBuilder { } /// Create a new `BufReader` from a buffered reader - pub fn build_buffered( - self, - reader: R, - ) -> Result, ArrowError> { + pub fn build_buffered(self, reader: R) -> Result, ArrowError> { Ok(BufReader { reader, decoder: self.build_decoder(), @@ -1085,7 +1138,7 @@ impl ReaderBuilder { let delimiter = self.format.build_parser(); let record_decoder = RecordDecoder::new(delimiter, self.schema.fields().len()); - let header = self.format.has_header as usize; + let header = self.format.header as usize; let (start, end) = match self.bounds { Some((start, end)) => (start + header, end + header), @@ -1100,6 +1153,7 @@ impl ReaderBuilder { end, projection: self.projection, batch_size: self.batch_size, + null_regex: self.format.null_regex, } } } @@ -1218,14 +1272,13 @@ mod tests { Field::new("lng", DataType::Float64, false), 
]); - let file_with_headers = - File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let file_with_headers = File::open("test/data/uk_cities_with_headers.csv").unwrap(); let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); let both_files = file_with_headers .chain(Cursor::new("\n".to_string())) .chain(file_without_headers); let mut csv = ReaderBuilder::new(Arc::new(schema)) - .has_header(true) + .with_header(true) .build(both_files) .unwrap(); let batch = csv.next().unwrap().unwrap(); @@ -1243,7 +1296,7 @@ mod tests { .unwrap(); file.rewind().unwrap(); - let builder = ReaderBuilder::new(Arc::new(schema)).has_header(true); + let builder = ReaderBuilder::new(Arc::new(schema)).with_header(true); let mut csv = builder.build(file).unwrap(); let expected_schema = Schema::new(vec![ @@ -1406,14 +1459,14 @@ mod tests { let schema = Arc::new(Schema::new(vec![ Field::new("c_int", DataType::UInt64, false), Field::new("c_float", DataType::Float32, true), - Field::new("c_string", DataType::Utf8, false), + Field::new("c_string", DataType::Utf8, true), Field::new("c_bool", DataType::Boolean, false), ])); let file = File::open("test/data/null_test.csv").unwrap(); let mut csv = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .build(file) .unwrap(); @@ -1426,6 +1479,91 @@ mod tests { assert!(!batch.column(1).is_null(4)); } + #[test] + fn test_init_nulls() { + let schema = Arc::new(Schema::new(vec![ + Field::new("c_int", DataType::UInt64, true), + Field::new("c_float", DataType::Float32, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + Field::new("c_null", DataType::Null, true), + ])); + let file = File::open("test/data/init_null_test.csv").unwrap(); + + let mut csv = ReaderBuilder::new(schema) + .with_header(true) + .build(file) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + + assert!(batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_init_nulls_with_inference() { + let format = Format::default().with_header(true).with_delimiter(b','); + + let mut file = File::open("test/data/init_null_test.csv").unwrap(); + let (schema, _) = format.infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new("c_int", DataType::Int64, true), + Field::new("c_float", DataType::Float64, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + Field::new("c_null", DataType::Null, true), + ]); + assert_eq!(schema, expected_schema); + + let mut csv = ReaderBuilder::new(Arc::new(schema)) + .with_format(format) + .build(file) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + + assert!(batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_custom_nulls() { + let schema = Arc::new(Schema::new(vec![ + Field::new("c_int", DataType::UInt64, true), + Field::new("c_float", DataType::Float32, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + ])); + + let file = File::open("test/data/custom_null_test.csv").unwrap(); + + let null_regex = Regex::new("^nil$").unwrap(); + + let mut csv = ReaderBuilder::new(schema) + .with_header(true) 
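+            // The regex replaces the default "empty string is NULL" rule, so
+            // only cells that are exactly "nil" decode as NULL below.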
+ .with_null_regex(null_regex) + .build(file) + .unwrap(); + + let batch = csv.next().unwrap().unwrap(); + + // "nil"s should be NULL + assert!(batch.column(0).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(batch.column(3).is_null(4)); + assert!(batch.column(2).is_null(3)); + assert!(!batch.column(2).is_null(4)); + } + #[test] fn test_nulls_with_inference() { let mut file = File::open("test/data/various_types.csv").unwrap(); @@ -1457,8 +1595,7 @@ mod tests { schema.field(5).data_type() ); - let names: Vec<&str> = - schema.fields().iter().map(|x| x.name().as_str()).collect(); + let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect(); assert_eq!( names, vec![ @@ -1485,6 +1622,42 @@ mod tests { assert!(!batch.column(1).is_null(4)); } + #[test] + fn test_custom_nulls_with_inference() { + let mut file = File::open("test/data/custom_null_test.csv").unwrap(); + + let null_regex = Regex::new("^nil$").unwrap(); + + let format = Format::default() + .with_header(true) + .with_null_regex(null_regex); + + let (schema, _) = format.infer_schema(&mut file, None).unwrap(); + file.rewind().unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new("c_int", DataType::Int64, true), + Field::new("c_float", DataType::Float64, true), + Field::new("c_string", DataType::Utf8, true), + Field::new("c_bool", DataType::Boolean, true), + ]); + + assert_eq!(schema, expected_schema); + + let builder = ReaderBuilder::new(Arc::new(schema)) + .with_format(format) + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + assert_eq!(5, batch.num_rows()); + assert_eq!(4, batch.num_columns()); + + assert_eq!(batch.schema().as_ref(), &expected_schema); + } + #[test] fn test_parse_invalid_csv() { let file = File::open("test/data/various_types_invalid.csv").unwrap(); @@ -1497,7 +1670,7 @@ mod tests { ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(true) + .with_header(true) .with_delimiter(b'|') .with_batch_size(512) .with_projection(vec![0, 1, 2, 3]); @@ -1598,16 +1771,11 @@ mod tests { -2203932304000 ); assert_eq!( - Date64Type::parse_formatted("1900-02-28 12:34:56", "%Y-%m-%d %H:%M:%S") - .unwrap(), + Date64Type::parse_formatted("1900-02-28 12:34:56", "%Y-%m-%d %H:%M:%S").unwrap(), -2203932304000 ); assert_eq!( - Date64Type::parse_formatted( - "1900-02-28 12:34:56+0030", - "%Y-%m-%d %H:%M:%S%z" - ) - .unwrap(), + Date64Type::parse_formatted("1900-02-28 12:34:56+0030", "%Y-%m-%d %H:%M:%S%z").unwrap(), -2203932304000 - (30 * 60 * 1000) ); } @@ -1644,10 +1812,7 @@ mod tests { #[test] fn test_parse_timestamp() { - test_parse_timestamp_impl::( - None, - &[0, 0, -7_200_000_000_000], - ); + test_parse_timestamp_impl::(None, &[0, 0, -7_200_000_000_000]); test_parse_timestamp_impl::( Some("+00:00".into()), &[0, 0, -7_200_000_000_000], @@ -1664,10 +1829,7 @@ mod tests { Some("-03".into()), &[10_800_000, 0, -7_200_000], ); - test_parse_timestamp_impl::( - Some("-03".into()), - &[10_800, 0, -7_200], - ); + test_parse_timestamp_impl::(Some("-03".into()), &[10_800, 0, -7_200]); } #[test] @@ -1824,7 +1986,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_quote(b'~'); // default is ", change to ~ let mut csv_text = Vec::new(); @@ -1856,7 +2018,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - 
.has_header(false) + .with_header(false) .with_escape(b'\\'); // default is None, change to \ let mut csv_text = Vec::new(); @@ -1888,7 +2050,7 @@ mod tests { Field::new("text2", DataType::Utf8, false), ]); let builder = ReaderBuilder::new(Arc::new(schema)) - .has_header(false) + .with_header(false) .with_terminator(b'\n'); // default is CRLF, change to LF let mut csv_text = Vec::new(); @@ -1930,7 +2092,7 @@ mod tests { ])); for (idx, (bounds, has_header, expected)) in tests.into_iter().enumerate() { - let mut reader = ReaderBuilder::new(schema.clone()).has_header(has_header); + let mut reader = ReaderBuilder::new(schema.clone()).with_header(has_header); if let Some((start, end)) = bounds { reader = reader.with_bounds(start, end); } @@ -1995,7 +2157,7 @@ mod tests { for capacity in [1, 3, 7, 100] { let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .has_header(has_header) + .with_header(has_header) .build(File::open(path).unwrap()) .unwrap(); @@ -2006,14 +2168,12 @@ mod tests { expected_rows ); - let buffered = std::io::BufReader::with_capacity( - capacity, - File::open(path).unwrap(), - ); + let buffered = + std::io::BufReader::with_capacity(capacity, File::open(path).unwrap()); let reader = ReaderBuilder::new(schema.clone()) .with_batch_size(batch_size) - .has_header(has_header) + .with_header(has_header) .build_buffered(buffered) .unwrap(); @@ -2026,8 +2186,8 @@ mod tests { fn err_test(csv: &[u8], expected: &str) { let schema = Arc::new(Schema::new(vec![ - Field::new("text1", DataType::Utf8, false), - Field::new("text2", DataType::Utf8, false), + Field::new("text1", DataType::Utf8, true), + Field::new("text2", DataType::Utf8, true), ])); let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv)); let b = ReaderBuilder::new(schema) @@ -2132,7 +2292,7 @@ mod tests { #[test] fn test_inference() { let cases: &[(&[&str], DataType)] = &[ - (&[], DataType::Utf8), + (&[], DataType::Null), (&["false", "12"], DataType::Utf8), (&["12", "cupcakes"], DataType::Utf8), (&["12", "12.4"], DataType::Float64), diff --git a/arrow-csv/src/reader/records.rs b/arrow-csv/src/reader/records.rs index a59d02e0e2d8..877cfb3ee653 100644 --- a/arrow-csv/src/reader/records.rs +++ b/arrow-csv/src/reader/records.rs @@ -76,11 +76,7 @@ impl RecordDecoder { /// Decodes records from `input` returning the number of records and bytes read /// /// Note: this expects to be called with an empty `input` to signal EOF - pub fn decode( - &mut self, - input: &[u8], - to_read: usize, - ) -> Result<(usize, usize), ArrowError> { + pub fn decode(&mut self, input: &[u8], to_read: usize) -> Result<(usize, usize), ArrowError> { if to_read == 0 { return Ok((0, 0)); } @@ -124,11 +120,17 @@ impl RecordDecoder { // Need to allocate more capacity ReadRecordResult::OutputFull => break, ReadRecordResult::OutputEndsFull => { - return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got more than {}", self.line_number, self.num_columns, self.current_field))); + return Err(ArrowError::CsvError(format!( + "incorrect number of fields for line {}, expected {} got more than {}", + self.line_number, self.num_columns, self.current_field + ))); } ReadRecordResult::Record => { if self.current_field != self.num_columns { - return Err(ArrowError::CsvError(format!("incorrect number of fields for line {}, expected {} got {}", self.line_number, self.num_columns, self.current_field))); + return Err(ArrowError::CsvError(format!( + "incorrect number of fields for line {}, expected {} got 
{}",
+                        self.line_number, self.num_columns, self.current_field
+                    )));
                 }
                 read += 1;
                 self.current_field = 0;
@@ -334,8 +336,7 @@ mod tests {
         let mut decoder = RecordDecoder::new(Reader::new(), 2);

         let err = decoder.decode(csv.as_bytes(), 4).unwrap_err().to_string();
-        let expected =
-            "Csv error: incorrect number of fields for line 3, expected 2 got 1";
+        let expected = "Csv error: incorrect number of fields for line 3, expected 2 got 1";
         assert_eq!(err, expected);
diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs
index 840e8e8a93cc..0bb76e536e67 100644
--- a/arrow-csv/src/writer.rs
+++ b/arrow-csv/src/writer.rs
@@ -70,11 +70,6 @@ use csv::ByteRecord;
 use std::io::Write;

 use crate::map_csv_error;
-
-const DEFAULT_DATE_FORMAT: &str = "%F";
-const DEFAULT_TIME_FORMAT: &str = "%T";
-const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f";
-const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z";
 const DEFAULT_NULL_VALUE: &str = "";

 /// A CSV writer
@@ -82,41 +77,29 @@ const DEFAULT_NULL_VALUE: &str = "";
 pub struct Writer<W: Write> {
     /// The object to write to
     writer: csv::Writer<W>,
-    /// Whether file should be written with headers. Defaults to `true`
+    /// Whether file should be written with headers, defaults to `true`
     has_headers: bool,
-    /// The date format for date arrays
+    /// The date format for date arrays, defaults to RFC3339
     date_format: Option<String>,
-    /// The datetime format for datetime arrays
+    /// The datetime format for datetime arrays, defaults to RFC3339
     datetime_format: Option<String>,
-    /// The timestamp format for timestamp arrays
+    /// The timestamp format for timestamp arrays, defaults to RFC3339
     timestamp_format: Option<String>,
-    /// The timestamp format for timestamp (with timezone) arrays
+    /// The timestamp format for timestamp (with timezone) arrays, defaults to RFC3339
     timestamp_tz_format: Option<String>,
-    /// The time format for time arrays
+    /// The time format for time arrays, defaults to RFC3339
     time_format: Option<String>,
     /// Is the beginning-of-writer
     beginning: bool,
-    /// The value to represent null entries
-    null_value: String,
+    /// The value to represent null entries, defaults to [`DEFAULT_NULL_VALUE`]
+    null_value: Option<String>,
 }

 impl<W: Write> Writer<W> {
     /// Create a new CsvWriter from a writable object, with default options
     pub fn new(writer: W) -> Self {
         let delimiter = b',';
-        let mut builder = csv::WriterBuilder::new();
-        let writer = builder.delimiter(delimiter).from_writer(writer);
-        Writer {
-            writer,
-            has_headers: true,
-            date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
-            datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
-            time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
-            timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
-            timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()),
-            beginning: true,
-            null_value: DEFAULT_NULL_VALUE.to_string(),
-        }
+        WriterBuilder::new().with_delimiter(delimiter).build(writer)
     }

     /// Write a vector of record batches to a writable object
@@ -138,7 +121,7 @@ impl<W: Write> Writer<W> {
         }

         let options = FormatOptions::default()
-            .with_null(&self.null_value)
+            .with_null(self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE))
             .with_date_format(self.date_format.as_deref())
             .with_datetime_format(self.datetime_format.as_deref())
             .with_timestamp_format(self.timestamp_format.as_deref())
@@ -207,9 +190,9 @@ impl<W: Write> RecordBatchWriter for Writer<W> {
 #[derive(Clone, Debug)]
 pub struct WriterBuilder {
     /// Optional column delimiter. Defaults to `b','`
-    delimiter: Option<u8>,
+    delimiter: u8,
     /// Whether to write column names as file headers. Defaults to `true`
-    has_headers: bool,
+    has_header: bool,
     /// Optional date format for date arrays
     date_format: Option<String>,
     /// Optional datetime format for datetime arrays
@@ -227,14 +210,14 @@ pub struct WriterBuilder {
 impl Default for WriterBuilder {
     fn default() -> Self {
         Self {
-            has_headers: true,
-            delimiter: None,
-            date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
-            datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
-            time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
-            timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
-            timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()),
-            null_value: Some(DEFAULT_NULL_VALUE.to_string()),
+            has_header: true,
+            delimiter: b',',
+            date_format: None,
+            datetime_format: None,
+            time_format: None,
+            timestamp_format: None,
+            timestamp_tz_format: None,
+            null_value: None,
         }
     }
 }
@@ -254,7 +237,7 @@ impl WriterBuilder {
     /// let file = File::create("target/out.csv").unwrap();
     ///
     /// // create a builder that doesn't write headers
-    /// let builder = WriterBuilder::new().has_headers(false);
+    /// let builder = WriterBuilder::new().with_header(false);
     /// let writer = builder.build(file);
     ///
     /// writer
@@ -265,48 +248,92 @@ impl WriterBuilder {
     }

     /// Set whether to write headers
+    #[deprecated(note = "Use Self::with_header")]
+    #[doc(hidden)]
     pub fn has_headers(mut self, has_headers: bool) -> Self {
-        self.has_headers = has_headers;
+        self.has_header = has_headers;
         self
     }

+    /// Set whether to write the CSV file with a header
+    pub fn with_header(mut self, header: bool) -> Self {
+        self.has_header = header;
+        self
+    }
+
+    /// Returns `true` if this writer is configured to write a header
+    pub fn header(&self) -> bool {
+        self.has_header
+    }
+
     /// Set the CSV file's column delimiter as a byte character
     pub fn with_delimiter(mut self, delimiter: u8) -> Self {
-        self.delimiter = Some(delimiter);
+        self.delimiter = delimiter;
         self
     }

+    /// Get the CSV file's column delimiter as a byte character
+    pub fn delimiter(&self) -> u8 {
+        self.delimiter
+    }
+
     /// Set the CSV file's date format
     pub fn with_date_format(mut self, format: String) -> Self {
         self.date_format = Some(format);
         self
     }

+    /// Get the CSV file's date format if set, defaults to RFC3339
+    pub fn date_format(&self) -> Option<&str> {
+        self.date_format.as_deref()
+    }
+
     /// Set the CSV file's datetime format
     pub fn with_datetime_format(mut self, format: String) -> Self {
         self.datetime_format = Some(format);
         self
     }

+    /// Get the CSV file's datetime format if set, defaults to RFC3339
+    pub fn datetime_format(&self) -> Option<&str> {
+        self.datetime_format.as_deref()
+    }
+
     /// Set the CSV file's time format
     pub fn with_time_format(mut self, format: String) -> Self {
         self.time_format = Some(format);
         self
     }

+    /// Get the CSV file's time format if set, defaults to RFC3339
+    pub fn time_format(&self) -> Option<&str> {
+        self.time_format.as_deref()
+    }
+
     /// Set the CSV file's timestamp format
     pub fn with_timestamp_format(mut self, format: String) -> Self {
         self.timestamp_format = Some(format);
         self
     }

+    /// Get the CSV file's timestamp format if set, defaults to RFC3339
+    pub fn timestamp_format(&self) -> Option<&str> {
+        self.timestamp_format.as_deref()
+    }
+
     /// Set the value to represent null in output
     pub fn with_null(mut self, null_value: String) -> Self {
         self.null_value = Some(null_value);
         self
     }

-    /// Use RFC3339 format for date/time/timestamps
+    /// Get the value to represent null in output
+    pub fn null(&self) -> &str {
+        self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE)
+    }
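
The builder now pairs each `with_*` setter with a read-back getter, and the format fields default to `None`, which means RFC3339 output. A short sketch of configuring and inspecting a builder under those assumptions (the assertions only restate what the getters above return):

    use arrow_csv::WriterBuilder;

    fn configure_builder() {
        let builder = WriterBuilder::new()
            .with_header(false) // replaces the deprecated has_headers(false)
            .with_delimiter(b'|')
            .with_null("NULL".to_string());

        // Read-back getters introduced by this change
        assert!(!builder.header());
        assert_eq!(builder.delimiter(), b'|');
        assert_eq!(builder.null(), "NULL");
        assert_eq!(builder.date_format(), None); // None => RFC3339 default
    }
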
+
+    /// Use RFC3339 format for date/time/timestamps (default)
+    #[deprecated(note = "Use WriterBuilder::default()")]
     pub fn with_rfc3339(mut self) -> Self {
         self.date_format = None;
         self.datetime_format = None;
@@ -318,21 +345,18 @@ impl WriterBuilder {

     /// Create a new `Writer`
     pub fn build<W: Write>(self, writer: W) -> Writer<W> {
-        let delimiter = self.delimiter.unwrap_or(b',');
         let mut builder = csv::WriterBuilder::new();
-        let writer = builder.delimiter(delimiter).from_writer(writer);
+        let writer = builder.delimiter(self.delimiter).from_writer(writer);
         Writer {
             writer,
-            has_headers: self.has_headers,
+            beginning: true,
+            has_headers: self.has_header,
             date_format: self.date_format,
             datetime_format: self.datetime_format,
             time_format: self.time_format,
             timestamp_format: self.timestamp_format,
             timestamp_tz_format: self.timestamp_tz_format,
-            beginning: true,
-            null_value: self
-                .null_value
-                .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()),
+            null_value: self.null_value,
         }
     }
 }
@@ -365,18 +389,12 @@ mod tests {
             "consectetur adipiscing elit",
             "sed do eiusmod tempor",
         ]);
-        let c2 = PrimitiveArray::<Float64Type>::from(vec![
-            Some(123.564532),
-            None,
-            Some(-556132.25),
-        ]);
+        let c2 =
+            PrimitiveArray::<Float64Type>::from(vec![Some(123.564532), None, Some(-556132.25)]);
         let c3 = PrimitiveArray::<UInt32Type>::from(vec![3, 2, 1]);
         let c4 = BooleanArray::from(vec![Some(true), Some(false), None]);
-        let c5 = TimestampMillisecondArray::from(vec![
-            None,
-            Some(1555584887378),
-            Some(1555555555555),
-        ]);
+        let c5 =
+            TimestampMillisecondArray::from(vec![None, Some(1555584887378), Some(1555555555555)]);
         let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]);
         let c7: DictionaryArray<Int32Type> =
             vec!["cupcakes", "cupcakes", "foo"].into_iter().collect();
@@ -411,11 +429,11 @@ mod tests {

         let expected = r#"c1,c2,c3,c4,c5,c6,c7
 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
-consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
-sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes
-consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes
-sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo
 "#;
         assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap());
     }
@@ -427,13 +445,11 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
             Field::new("c2", DataType::Decimal256(76, 6), true),
         ]);

-        let mut c1_builder =
-            Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6));
+        let mut c1_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6));
         c1_builder.extend(vec![Some(-3335724), Some(2179404), None, Some(290472)]);
         let c1 = c1_builder.finish();

-        let mut c2_builder =
-            Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6));
+        let mut c2_builder = Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6));
         c2_builder.extend(vec![
             Some(i256::from_i128(-3335724)),
             Some(i256::from_i128(2179404)),
             None,
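
With the per-type format constants removed, a default `Writer` now renders temporal columns in RFC3339 form, which is what `with_rfc3339()` previously opted into; the expected strings in the test above change accordingly (`2019-04-18T10:54:47.378` instead of `...47.378000000`). A hedged sketch of that default behaviour, using the same millisecond timestamp value as the test:

    use std::sync::Arc;

    use arrow_array::{RecordBatch, TimestampMillisecondArray};
    use arrow_csv::Writer;
    use arrow_schema::{DataType, Field, Schema, TimeUnit};

    fn write_rfc3339_by_default() -> String {
        let schema = Schema::new(vec![Field::new(
            "ts",
            DataType::Timestamp(TimeUnit::Millisecond, None),
            false,
        )]);
        let ts = TimestampMillisecondArray::from(vec![1555584887378]);
        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(ts)]).unwrap();

        let mut buf = Vec::new();
        let mut writer = Writer::new(&mut buf); // defaults: header + RFC3339 formats
        writer.write(&batch).unwrap();
        drop(writer); // release the borrow of buf

        // Expect "ts\n2019-04-18T10:54:47.378\n" rather than the old %FT%H:%M:%S.%9f output
        String::from_utf8(buf).unwrap()
    }
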
@@ -443,8 +459,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
         let c2 = c2_builder.finish();

         let batch =
-            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)])
-                .unwrap();
+            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap();

         let mut file = tempfile::tempfile().unwrap();
@@ -488,11 +503,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
             "consectetur adipiscing elit",
             "sed do eiusmod tempor",
         ]);
-        let c2 = PrimitiveArray::<Float64Type>::from(vec![
-            Some(123.564532),
-            None,
-            Some(-556132.25),
-        ]);
+        let c2 =
+            PrimitiveArray::<Float64Type>::from(vec![Some(123.564532), None, Some(-556132.25)]);
         let c3 = PrimitiveArray::<UInt32Type>::from(vec![3, 2, 1]);
         let c4 = BooleanArray::from(vec![Some(true), Some(false), None]);
         let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]);
@@ -512,7 +524,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
         let mut file = tempfile::tempfile().unwrap();

         let builder = WriterBuilder::new()
-            .has_headers(false)
+            .with_header(false)
             .with_delimiter(b'|')
             .with_null("NULL".to_string())
             .with_time_format("%r".to_string());
@@ -560,7 +572,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
         )
         .unwrap();

-        let builder = WriterBuilder::new().has_headers(false);
+        let builder = WriterBuilder::new().with_header(false);
         let mut buf: Cursor<Vec<u8>> = Default::default();
         // drop the writer early to release the borrow.
@@ -605,8 +617,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
         let c0 = UInt32Array::from(vec![Some(123), Some(234)]);
         let c1 = Date64Array::from(vec![Some(1926632005177), Some(1926632005177685347)]);
         let batch =
-            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)])
-                .unwrap();
+            RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)]).unwrap();

         let mut file = tempfile::tempfile().unwrap();
         let mut writer = Writer::new(&mut file);
@@ -632,15 +643,9 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo
         Field::new("c4", DataType::Time32(TimeUnit::Second), false),
     ]);

-        let c1 = TimestampMillisecondArray::from(vec![
-            Some(1555584887378),
-            Some(1635577147000),
-        ])
-        .with_timezone("+00:00".to_string());
-        let c2 = TimestampMillisecondArray::from(vec![
-            Some(1555584887378),
-            Some(1635577147000),
-        ]);
+        let c1 = TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)])
+            .with_timezone("+00:00".to_string());
+        let c2 = TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]);
         let c3 = Date32Array::from(vec![3, 2]);
         let c4 = Time32SecondArray::from(vec![1234, 24680]);
@@ -652,7 +657,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo

         let mut file = tempfile::tempfile().unwrap();

-        let builder = WriterBuilder::new().with_rfc3339();
+        let builder = WriterBuilder::new();
         let mut writer = builder.build(&mut file);
         let batches = vec![&batch];
         for batch in batches {
diff --git a/arrow-csv/test/data/custom_null_test.csv b/arrow-csv/test/data/custom_null_test.csv
new file mode 100644
index 000000000000..39f9fc4b3eff
--- /dev/null
+++ b/arrow-csv/test/data/custom_null_test.csv
@@ -0,0 +1,6 @@
+c_int,c_float,c_string,c_bool
+1,1.1,"1.11",True
+nil,2.2,"2.22",TRUE
+3,nil,"3.33",true
+4,4.4,nil,False
+5,6.6,"",nil
diff --git a/arrow-csv/test/data/example.csv b/arrow-csv/test/data/example.csv
new file mode 100644
index 000000000000..0c03cee84528
--- /dev/null
+++ b/arrow-csv/test/data/example.csv
@@ -0,0 +1,4 @@
+c1,c2,c3,c4
+1,1.1,"hong kong",true
+3,323.12,"XiAn",false +10,131323.12,"cheng du",false \ No newline at end of file diff --git a/arrow-csv/test/data/init_null_test.csv b/arrow-csv/test/data/init_null_test.csv new file mode 100644 index 000000000000..f7d8a299645d --- /dev/null +++ b/arrow-csv/test/data/init_null_test.csv @@ -0,0 +1,6 @@ +c_int,c_float,c_string,c_bool,c_null +,,,, +2,2.2,"a",TRUE, +3,,"b",true, +4,4.4,,False, +5,6.6,"",FALSE, \ No newline at end of file diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 7e07194012bf..10c53c549e2b 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -42,9 +42,7 @@ pub(crate) fn contains_nulls( ) -> bool { match null_bit_buffer { Some(buffer) => { - match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len) - .next() - { + match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() { Some((start, end)) => start != 0 || end != len, None => len != 0, // No non-null values } @@ -130,9 +128,9 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff MutableBuffer::new(capacity * k.primitive_width().unwrap()), empty_buffer, ], - DataType::FixedSizeList(_, _) - | DataType::Struct(_) - | DataType::RunEndEncoded(_, _) => [empty_buffer, MutableBuffer::new(0)], + DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => { + [empty_buffer, MutableBuffer::new(0)] + } DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, @@ -159,10 +157,9 @@ pub(crate) fn into_buffers( ) -> Vec { match data_type { DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => vec![], - DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary => vec![buffer1.into(), buffer2.into()], + DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { + vec![buffer1.into(), buffer2.into()] + } DataType::Union(_, mode) => { match mode { // Based on Union's DataTypeLayout @@ -174,7 +171,7 @@ pub(crate) fn into_buffers( } } -/// An generic representation of Arrow array data which encapsulates common attributes and +/// A generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., /// primitive, list, struct) are implemented in `Array`. /// @@ -452,12 +449,11 @@ impl ArrayData { for spec in layout.buffers.iter() { match spec { BufferSpec::FixedWidth { byte_width, .. 
} => { - let buffer_size = - self.len.checked_mul(*byte_width).ok_or_else(|| { - ArrowError::ComputeError( - "Integer overflow computing buffer size".to_string(), - ) - })?; + let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { + ArrowError::ComputeError( + "Integer overflow computing buffer size".to_string(), + ) + })?; result += buffer_size; } BufferSpec::VariableWidth => { @@ -590,9 +586,7 @@ impl ArrayData { DataType::LargeBinary | DataType::LargeUtf8 => { (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true) } - DataType::FixedSizeBinary(i) => { - (vec![zeroed(*i as usize * len)], vec![], true) - } + DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true), DataType::List(f) | DataType::Map(f, _) => ( vec![zeroed((len + 1) * 4)], vec![ArrayData::new_empty(f.data_type())], @@ -705,7 +699,7 @@ impl ArrayData { /// /// This can be useful for when interacting with data sent over IPC or FFI, that may /// not meet the minimum alignment requirements - fn align_buffers(&mut self) { + pub fn align_buffers(&mut self) { let layout = layout(&self.data_type); for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { if let BufferSpec::FixedWidth { alignment, .. } = spec { @@ -749,9 +743,7 @@ impl ArrayData { ))); } - for (i, (buffer, spec)) in - self.buffers.iter().zip(layout.buffers.iter()).enumerate() - { + for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() { match spec { BufferSpec::FixedWidth { byte_width, @@ -999,10 +991,8 @@ impl ArrayData { } DataType::RunEndEncoded(run_ends_field, values_field) => { self.validate_num_child_data(2)?; - let run_ends_data = - self.get_valid_child_data(0, run_ends_field.data_type())?; - let values_data = - self.get_valid_child_data(1, values_field.data_type())?; + let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?; + let values_data = self.get_valid_child_data(1, values_field.data_type())?; if run_ends_data.len != values_data.len { return Err(ArrowError::InvalidArgumentError(format!( "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}", @@ -1022,9 +1012,7 @@ impl ArrayData { for (i, (_, field)) in fields.iter().enumerate() { let field_data = self.get_valid_child_data(i, field.data_type())?; - if mode == &UnionMode::Sparse - && field_data.len < (self.len + self.offset) - { + if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) { return Err(ArrowError::InvalidArgumentError(format!( "Sparse union child array #{} has length smaller than expected for union array ({} < {})", i, field_data.len, self.len + self.offset @@ -1083,14 +1071,14 @@ impl ArrayData { i: usize, expected_type: &DataType, ) -> Result<&ArrayData, ArrowError> { - let values_data = self.child_data - .get(i) - .ok_or_else(|| { - ArrowError::InvalidArgumentError(format!( - "{} did not have enough child arrays. Expected at least {} but had only {}", - self.data_type, i+1, self.child_data.len() - )) - })?; + let values_data = self.child_data.get(i).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "{} did not have enough child arrays. 
Expected at least {} but had only {}",
+                self.data_type,
+                i + 1,
+                self.child_data.len()
+            ))
+        })?;

         if expected_type != &values_data.data_type {
             return Err(ArrowError::InvalidArgumentError(format!(
@@ -1160,7 +1148,8 @@ impl ArrayData {
             if actual != nulls.null_count() {
                 return Err(ArrowError::InvalidArgumentError(format!(
                     "null_count value ({}) doesn't match actual number of nulls in array ({})",
-                    nulls.null_count(), actual
+                    nulls.null_count(),
+                    actual
                 )));
             }
         }
@@ -1209,23 +1198,22 @@ impl ArrayData {
     ) -> Result<(), ArrowError> {
         let mask = match mask {
             Some(mask) => mask,
-            None => return match child.null_count() {
-                0 => Ok(()),
-                _ => Err(ArrowError::InvalidArgumentError(format!(
-                    "non-nullable child of type {} contains nulls not present in parent {}",
-                    child.data_type,
-                    self.data_type
-                ))),
-            },
+            None => {
+                return match child.null_count() {
+                    0 => Ok(()),
+                    _ => Err(ArrowError::InvalidArgumentError(format!(
+                        "non-nullable child of type {} contains nulls not present in parent {}",
+                        child.data_type, self.data_type
+                    ))),
+                }
+            }
         };

         match child.nulls() {
-            Some(nulls) if !mask.contains(nulls) => {
-                Err(ArrowError::InvalidArgumentError(format!(
-                    "non-nullable child of type {} contains nulls not present in parent",
-                    child.data_type
-                )))
-            }
+            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
+                "non-nullable child of type {} contains nulls not present in parent",
+                child.data_type
+            ))),
             _ => Ok(()),
         }
     }
@@ -1240,9 +1228,7 @@ impl ArrayData {
             DataType::Utf8 => self.validate_utf8::<i32>(),
             DataType::LargeUtf8 => self.validate_utf8::<i64>(),
             DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
-            DataType::LargeBinary => {
-                self.validate_offsets_full::<i64>(self.buffers[1].len())
-            }
+            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
             DataType::List(_) | DataType::Map(_, _) => {
                 let child = &self.child_data[0];
                 self.validate_offsets_full::<i32>(child.len)
@@ -1300,11 +1286,7 @@ impl ArrayData {
     ///
     /// For example, the offsets buffer contained `[1, 2, 4]`, this
     /// function would call `validate([1,2])`, and `validate([2,4])`
-    fn validate_each_offset<T, V>(
-        &self,
-        offset_limit: usize,
-        validate: V,
-    ) -> Result<(), ArrowError>
+    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
     where
         T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
         V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
@@ -1358,32 +1340,26 @@ impl ArrayData {
         let values_buffer = &self.buffers[1].as_slice();
         if let Ok(values_str) = std::str::from_utf8(values_buffer) {
             // Validate Offsets are correct
-            self.validate_each_offset::<T, _>(
-                values_buffer.len(),
-                |string_index, range| {
-                    if !values_str.is_char_boundary(range.start)
-                        || !values_str.is_char_boundary(range.end)
-                    {
-                        return Err(ArrowError::InvalidArgumentError(format!(
-                            "incomplete utf-8 byte sequence from index {string_index}"
-                        )));
-                    }
-                    Ok(())
-                },
-            )
+            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
+                if !values_str.is_char_boundary(range.start)
+                    || !values_str.is_char_boundary(range.end)
+                {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "incomplete utf-8 byte sequence from index {string_index}"
+                    )));
+                }
+                Ok(())
+            })
         } else {
             // find specific offset that failed utf8 validation
-            self.validate_each_offset::<T, _>(
-                values_buffer.len(),
-                |string_index, range| {
-                    std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
-                        ArrowError::InvalidArgumentError(format!(
-                            "Invalid UTF8 sequence at string 
index {string_index} ({range:?}): {e}" - )) - })?; - Ok(()) - }, - ) + self.validate_each_offset::(values_buffer.len(), |string_index, range| { + std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}" + )) + })?; + Ok(()) + }) } } @@ -1414,8 +1390,7 @@ impl ArrayData { assert!(buffer.len() / mem::size_of::() >= required_len); // Justification: buffer size was validated above - let indexes: &[T] = - &buffer.typed_data::()[self.offset..self.offset + self.len]; + let indexes: &[T] = &buffer.typed_data::()[self.offset..self.offset + self.len]; indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { // Do not check the value is null (value can be arbitrary) diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index f74ab880d478..74279bfb9af1 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -19,8 +19,8 @@ use arrow_buffer::i256; use arrow_schema::ArrowError; pub use arrow_schema::{ - DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, + DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, + DECIMAL_DEFAULT_SCALE, }; // MAX decimal256 value of little-endian format for each precision. @@ -28,308 +28,308 @@ pub use arrow_schema::{ // is encoded to the 32-byte width format of little-endian. pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ i256::from_le_bytes([ - 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, + 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ]), i256::from_le_bytes([ - 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ]), i256::from_le_bytes([ - 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, + 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, ]), i256::from_le_bytes([ - 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 255, 
201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, + 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 159, 
54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 159, 54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, - 37, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, 37, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, - 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, 11, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, - 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, 114, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, - 123, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, 123, 4, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, - 215, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, 215, 44, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, - 192, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, 192, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, - 179, 39, 132, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, 179, 39, 132, + 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, - 141, 41, 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, 141, 41, + 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, - 50, 130, 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 
255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, 50, 130, + 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, - 21, 59, 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, 21, 59, + 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, - 219, 78, 58, 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, 219, 78, 58, + 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, - 20, 71, 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, 20, 71, + 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, - 214, 205, 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, 214, 205, + 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, - 100, 10, 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, 100, 10, + 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, - 241, 103, 168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, 241, 103, + 168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, - 106, 15, 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, 106, 15, + 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, - 37, 154, 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, 37, 154, + 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, - 118, 5, 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, 118, 5, + 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, - 160, 54, 92, 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, 160, 54, 92, + 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, - 69, 34, 154, 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, 69, 34, 154, + 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 
149, 214, 105, - 18, 178, 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 149, 214, 105, 18, 178, + 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, - 244, 98, 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, 244, 98, + 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, - 143, 221, 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, 143, 221, + 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, - 151, 167, 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, 151, 167, + 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, - 139, 138, 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, 139, 138, + 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, - 88, 119, 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, 88, 119, + 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, - 119, 169, 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, 119, 169, + 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, - 170, 158, 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, 170, 158, + 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, - 170, 50, 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, 170, 50, + 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, - 250, 197, 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, 250, 197, + 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, - 135, 202, 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, 135, 202, + 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, - 254, 70, 233, 85, 137, 188, 
74, 58, 29, 234, 190, 15, 228, 144, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, 254, 70, 233, + 85, 137, 188, 74, 58, 29, 234, 190, 15, 228, 144, 0, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, - 197, 28, 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, 197, 28, + 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, - 187, 31, 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, 187, 31, + 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, - 242, 80, 61, 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, 242, 80, 61, + 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, ]), i256::from_le_bytes([ - 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, - 121, 41, 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, 121, 41, + 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, ]), ]; @@ -338,308 +338,308 @@ pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ // is encoded to the 76-byte width format of little-endian. pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [i256; 76] = [ i256::from_le_bytes([ - 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ 
- 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 
255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, - 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 156, 39, 164, 52, 141, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 156, 39, 164, 52, 141, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, - 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, - 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, 238, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, - 80, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, 80, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, - 249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, 249, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, - 187, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, 187, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, - 83, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, 83, 253, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, - 184, 69, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, 184, 69, + 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, - 185, 244, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, 185, 244, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, - 59, 60, 143, 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, 59, 60, 143, + 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, - 152, 151, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, 152, 151, + 255, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, - 135, 243, 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, 135, 243, + 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, - 74, 131, 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, 74, 131, + 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, - 230, 32, 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, 230, 32, + 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 194, 138, 251, 146, 220, 208, 127, 95, 201, - 163, 253, 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 
194, 138, 251, 146, 220, 208, 127, 95, 201, 163, 253, + 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, - 101, 232, 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, 101, 232, + 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, - 131, 120, 232, 198, 249, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, 131, 120, + 232, 198, 249, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, - 199, 30, 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, 199, 30, + 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, - 205, 51, 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, 205, 51, + 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, - 11, 6, 192, 22, 252, 176, 231, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, 11, 6, 192, + 22, 252, 176, 231, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, - 111, 60, 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, 111, 60, + 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 167, 136, 150, - 90, 92, 2, 227, 120, 34, 129, 246, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 167, 136, 150, 90, 92, 2, + 227, 120, 34, 129, 246, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, - 137, 155, 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, 137, 155, + 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, - 98, 19, 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, 98, 19, + 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, - 193, 56, 193, 54, 168, 110, 232, 218, 255, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, 193, 56, + 193, 54, 168, 110, 232, 218, 255, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, - 152, 146, 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, 152, 146, + 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 
128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, - 242, 185, 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, 242, 185, + 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, - 67, 181, 197, 226, 21, 65, 240, 27, 111, 255, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, 67, 181, + 197, 226, 21, 65, 240, 27, 111, 255, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, - 162, 20, 185, 219, 218, 138, 98, 23, 87, 250, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, 162, 20, + 185, 219, 218, 138, 98, 23, 87, 250, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, - 90, 90, 206, 58, 149, 140, 108, 217, 233, 102, 199, 255, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, 90, 90, 206, + 58, 149, 140, 108, 217, 233, 102, 199, 255, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, - 136, 135, 15, 76, 212, 125, 61, 126, 34, 5, 202, 253, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, 136, 135, 15, + 76, 212, 125, 61, 126, 34, 5, 202, 253, ]), i256::from_le_bytes([ - 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, - 84, 75, 155, 248, 74, 234, 102, 238, 88, 51, 228, 233, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, 84, 75, 155, + 248, 74, 234, 102, 238, 88, 51, 228, 233, ]), ]; @@ -758,10 +758,7 @@ pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), Arro /// Validates that the specified `i256` of value can be properly /// interpreted as a Decimal256 number with precision `precision` #[inline] -pub fn validate_decimal256_precision( - value: i256, - precision: u8, -) -> Result<(), ArrowError> { +pub fn validate_decimal256_precision(value: i256, precision: u8) -> Result<(), ArrowError> { if precision > DECIMAL256_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "Max precision of a Decimal256 is {DECIMAL256_MAX_PRECISION}, but got {precision}", diff --git a/arrow-data/src/equal/boolean.rs b/arrow-data/src/equal/boolean.rs index a20ca5ac0bd7..addae936f118 100644 --- a/arrow-data/src/equal/boolean.rs +++ b/arrow-data/src/equal/boolean.rs @@ -78,11 +78,10 @@ pub(super) fn boolean_equal( // get a ref of the null buffer bytes, to use in testing for nullness let lhs_nulls = lhs.nulls().unwrap(); - BitIndexIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len) - .all(|i| { - let lhs_pos = lhs_start + lhs.offset() + i; - let rhs_pos = rhs_start + rhs.offset() + i; - get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) - }) + BitIndexIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len).all(|i| { + let lhs_pos = lhs_start + lhs.offset() + i; + let rhs_pos = rhs_start + rhs.offset() + i; + get_bit(lhs_values, lhs_pos) == get_bit(rhs_values, rhs_pos) + }) } } diff --git a/arrow-data/src/equal/fixed_binary.rs b/arrow-data/src/equal/fixed_binary.rs index 40dacdddd3a0..0778d77e2fdd 100644 --- a/arrow-data/src/equal/fixed_binary.rs +++ b/arrow-data/src/equal/fixed_binary.rs @@ -75,20 +75,15 @@ pub(super) fn fixed_binary_equal( }) } else { let lhs_nulls = 
lhs.nulls().unwrap(); - let lhs_slices_iter = BitSliceIterator::new( - lhs_nulls.validity(), - lhs_start + lhs_nulls.offset(), - len, - ); + let lhs_slices_iter = + BitSliceIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len); let rhs_nulls = rhs.nulls().unwrap(); - let rhs_slices_iter = BitSliceIterator::new( - rhs_nulls.validity(), - rhs_start + rhs_nulls.offset(), - len, - ); + let rhs_slices_iter = + BitSliceIterator::new(rhs_nulls.validity(), rhs_start + rhs_nulls.offset(), len); - lhs_slices_iter.zip(rhs_slices_iter).all( - |((l_start, l_end), (r_start, r_end))| { + lhs_slices_iter + .zip(rhs_slices_iter) + .all(|((l_start, l_end), (r_start, r_end))| { l_start == r_start && l_end == r_end && equal_len( @@ -98,8 +93,7 @@ pub(super) fn fixed_binary_equal( (rhs_start + r_start) * size, (l_end - l_start) * size, ) - }, - ) + }) } } } diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index fbc868d3f5c4..b279546474a0 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -76,24 +76,16 @@ fn equal_values( DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Decimal128(_, _) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Decimal256(_, _) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { + DataType::Decimal128(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Decimal256(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) } DataType::Date64 | DataType::Interval(IntervalUnit::DayTime) | DataType::Time64(_) | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + | DataType::Duration(_) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Interval(IntervalUnit::MonthDayNano) => { primitive_equal::(lhs, rhs, lhs_start, rhs_start, len) } @@ -103,39 +95,21 @@ fn equal_values( DataType::LargeUtf8 | DataType::LargeBinary => { variable_sized_equal::(lhs, rhs, lhs_start, rhs_start, len) } - DataType::FixedSizeBinary(_) => { - fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::FixedSizeBinary(_) => fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::FixedSizeList(_, _) => { - fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Dictionary(data_type, _) => match data_type.as_ref() { DataType::Int8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::Int16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::Int64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - 
DataType::UInt8 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt16 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt32 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } - DataType::UInt64 => { - dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len) - } + DataType::Int16 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int32 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Int64 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt8 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt16 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt32 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::UInt64 => dictionary_equal::(lhs, rhs, lhs_start, rhs_start, len), _ => unreachable!(), }, DataType::Float16 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/equal/primitive.rs b/arrow-data/src/equal/primitive.rs index 7b3cbc9eb949..e92fdd2ba23b 100644 --- a/arrow-data/src/equal/primitive.rs +++ b/arrow-data/src/equal/primitive.rs @@ -73,20 +73,15 @@ pub(super) fn primitive_equal( }) } else { let lhs_nulls = lhs.nulls().unwrap(); - let lhs_slices_iter = BitSliceIterator::new( - lhs_nulls.validity(), - lhs_start + lhs_nulls.offset(), - len, - ); + let lhs_slices_iter = + BitSliceIterator::new(lhs_nulls.validity(), lhs_start + lhs_nulls.offset(), len); let rhs_nulls = rhs.nulls().unwrap(); - let rhs_slices_iter = BitSliceIterator::new( - rhs_nulls.validity(), - rhs_start + rhs_nulls.offset(), - len, - ); + let rhs_slices_iter = + BitSliceIterator::new(rhs_nulls.validity(), rhs_start + rhs_nulls.offset(), len); - lhs_slices_iter.zip(rhs_slices_iter).all( - |((l_start, l_end), (r_start, r_end))| { + lhs_slices_iter + .zip(rhs_slices_iter) + .all(|((l_start, l_end), (r_start, r_end))| { l_start == r_start && l_end == r_end && equal_len( @@ -96,8 +91,7 @@ pub(super) fn primitive_equal( (rhs_start + r_start) * byte_width, (l_end - l_start) * byte_width, ) - }, - ) + }) } } } diff --git a/arrow-data/src/equal/union.rs b/arrow-data/src/equal/union.rs index 5869afc30dbe..62de276e507f 100644 --- a/arrow-data/src/equal/union.rs +++ b/arrow-data/src/equal/union.rs @@ -116,10 +116,7 @@ pub(super) fn union_equal( rhs_fields, ) } - ( - DataType::Union(_, UnionMode::Sparse), - DataType::Union(_, UnionMode::Sparse), - ) => { + (DataType::Union(_, UnionMode::Sparse), DataType::Union(_, UnionMode::Sparse)) => { lhs_type_id_range == rhs_type_id_range && equal_sparse(lhs, rhs, lhs_start, rhs_start, len) } diff --git a/arrow-data/src/equal/utils.rs b/arrow-data/src/equal/utils.rs index fa6211542550..cc81943756d2 100644 --- a/arrow-data/src/equal/utils.rs +++ b/arrow-data/src/equal/utils.rs @@ -73,11 +73,9 @@ pub(super) fn base_equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { let r_value_field = r_fields.get(1).unwrap(); // We don't enforce the equality of field names - let data_type_equal = l_key_field.data_type() - == r_key_field.data_type() + let data_type_equal = l_key_field.data_type() == r_key_field.data_type() && l_value_field.data_type() == r_value_field.data_type(); - let nullability_equal = l_key_field.is_nullable() - == r_key_field.is_nullable() + let nullability_equal = l_key_field.is_nullable() == r_key_field.is_nullable() && l_value_field.is_nullable() == r_value_field.is_nullable(); let metadata_equal = l_key_field.metadata() == r_key_field.metadata() 
&& l_value_field.metadata() == r_value_field.metadata(); diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs index 7623ced043cc..589f7dac6d19 100644 --- a/arrow-data/src/ffi.rs +++ b/arrow-data/src/ffi.rs @@ -168,6 +168,12 @@ impl FFI_ArrowArray { .collect::>(); let n_children = children.len() as i64; + // As in the IPC format, emit null_count = length for Null type + let null_count = match data.data_type() { + DataType::Null => data.len(), + _ => data.null_count(), + }; + // create the private data owning everything. // any other data must be added here, e.g. via a struct, to track lifetime. let mut private_data = Box::new(ArrayPrivateData { @@ -179,7 +185,7 @@ impl FFI_ArrowArray { Self { length: data.len() as i64, - null_count: data.null_count() as i64, + null_count: null_count as i64, offset: data.offset() as i64, n_buffers, n_children, @@ -191,6 +197,22 @@ impl FFI_ArrowArray { } } + /// Takes ownership of the pointed to [`FFI_ArrowArray`] + /// + /// This acts to [move] the data out of `array`, setting the release callback to NULL + /// + /// # Safety + /// + /// * `array` must be [valid] for reads and writes + /// * `array` must be properly aligned + /// * `array` must point to a properly initialized value of [`FFI_ArrowArray`] + /// + /// [move]: https://arrow.apache.org/docs/format/CDataInterface.html#moving-an-array + /// [valid]: https://doc.rust-lang.org/std/ptr/index.html#safety + pub unsafe fn from_raw(array: *mut FFI_ArrowArray) -> Self { + std::ptr::replace(array, Self::empty()) + } + /// create an empty `FFI_ArrowArray`, which can be used to import data into pub fn empty() -> Self { Self { diff --git a/arrow-data/src/transform/list.rs b/arrow-data/src/transform/list.rs index 9d5d8330cb1e..d9a1c62a8e8e 100644 --- a/arrow-data/src/transform/list.rs +++ b/arrow-data/src/transform/list.rs @@ -23,9 +23,7 @@ use crate::ArrayData; use arrow_buffer::ArrowNativeType; use num::{CheckedAdd, Integer}; -pub(super) fn build_extend( - array: &ArrayData, -) -> Extend { +pub(super) fn build_extend(array: &ArrayData) -> Extend { let offsets = array.buffer::(0); Box::new( move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { @@ -35,11 +33,7 @@ pub(super) fn build_extend( let last_offset: T = unsafe { get_last_offset(offset_buffer) }; // offsets - extend_offsets::( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); + extend_offsets::(offset_buffer, last_offset, &offsets[start..start + len + 1]); mutable.child_data[0].extend( index, @@ -50,10 +44,7 @@ pub(super) fn build_extend( ) } -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let offset_buffer = &mut mutable.buffer1; // this is safe due to how offset is built. See details on `get_last_offset` diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index f4b2b46d1723..268cf10f2326 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -173,11 +173,7 @@ impl<'a> std::fmt::Debug for MutableArrayData<'a> { /// Builds an extend that adds `offset` to the source primitive /// Additionally validates that `max` fits into the /// the underlying primitive returning None if not -fn build_extend_dictionary( - array: &ArrayData, - offset: usize, - max: usize, -) -> Option { +fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option { macro_rules! 
validate_and_build { ($dt: ty) => {{ let _: $dt = max.try_into().ok()?; @@ -215,27 +211,19 @@ fn build_extend(array: &ArrayData) -> Extend { DataType::Int64 => primitive::build_extend::(array), DataType::Float32 => primitive::build_extend::(array), DataType::Float64 => primitive::build_extend::(array), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { + DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { primitive::build_extend::(array) } DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - primitive::build_extend::(array) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - primitive::build_extend::(array) - } + | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::(array), + DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::(array), DataType::Decimal128(_, _) => primitive::build_extend::(array), DataType::Decimal256(_, _) => primitive::build_extend::(array), DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_size::build_extend::(array) - } + DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::(array), DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), DataType::LargeList(_) => list::build_extend::(array), DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), @@ -265,9 +253,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::Int64 => primitive::extend_nulls::, DataType::Float32 => primitive::extend_nulls::, DataType::Float64 => primitive::extend_nulls::, - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, + DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { + primitive::extend_nulls:: + } DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) @@ -366,6 +354,14 @@ impl<'a> MutableArrayData<'a> { ) -> Self { let data_type = arrays[0].data_type(); + for a in arrays.iter().skip(1) { + assert_eq!( + data_type, + a.data_type(), + "Arrays with inconsistent types passed to MutableArrayData" + ) + } + // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. 
let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0); @@ -380,10 +376,7 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; preallocate_offset_and_binary_buffer::(*capacity, *value_cap) } - ( - DataType::Utf8 | DataType::Binary, - Capacities::Binary(capacity, Some(value_cap)), - ) => { + (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => { array_capacity = *capacity; preallocate_offset_and_binary_buffer::(*capacity, *value_cap) } @@ -391,10 +384,7 @@ impl<'a> MutableArrayData<'a> { array_capacity = *capacity; new_buffers(data_type, *capacity) } - ( - DataType::List(_) | DataType::LargeList(_), - Capacities::List(capacity, _), - ) => { + (DataType::List(_) | DataType::LargeList(_), Capacities::List(capacity, _)) => { array_capacity = *capacity; new_buffers(data_type, *capacity) } @@ -435,16 +425,15 @@ impl<'a> MutableArrayData<'a> { .map(|array| &array.child_data()[0]) .collect::>(); - let capacities = if let Capacities::List(capacity, ref child_capacities) = - capacities - { - child_capacities - .clone() - .map(|c| *c) - .unwrap_or(Capacities::Array(capacity)) - } else { - Capacities::Array(array_capacity) - }; + let capacities = + if let Capacities::List(capacity, ref child_capacities) = capacities { + child_capacities + .clone() + .map(|c| *c) + .unwrap_or(Capacities::Array(capacity)) + } else { + Capacities::Array(array_capacity) + }; vec![MutableArrayData::with_capacities( children, use_nulls, capacities, @@ -546,8 +535,7 @@ impl<'a> MutableArrayData<'a> { .collect(); let capacity = lengths.iter().sum(); - let mut mutable = - MutableArrayData::new(dictionaries, false, capacity); + let mut mutable = MutableArrayData::new(dictionaries, false, capacity); for (i, len) in lengths.iter().enumerate() { mutable.extend(i, 0, *len) diff --git a/arrow-data/src/transform/primitive.rs b/arrow-data/src/transform/primitive.rs index b5c826438bfc..627dc00de1df 100644 --- a/arrow-data/src/transform/primitive.rs +++ b/arrow-data/src/transform/primitive.rs @@ -47,9 +47,6 @@ where ) } -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { mutable.buffer1.extend_zeros(len * size_of::()); } diff --git a/arrow-data/src/transform/utils.rs b/arrow-data/src/transform/utils.rs index 17bb87e88a5c..5407f68e0d0c 100644 --- a/arrow-data/src/transform/utils.rs +++ b/arrow-data/src/transform/utils.rs @@ -45,9 +45,7 @@ pub(super) fn extend_offsets( } #[inline] -pub(super) unsafe fn get_last_offset( - offset_buffer: &MutableBuffer, -) -> T { +pub(super) unsafe fn get_last_offset(offset_buffer: &MutableBuffer) -> T { // JUSTIFICATION // Benefit // 20% performance improvement extend of variable sized arrays (see bench `mutable_array`) diff --git a/arrow-data/src/transform/variable_size.rs b/arrow-data/src/transform/variable_size.rs index 597a8b2b6645..fa1592d973ed 100644 --- a/arrow-data/src/transform/variable_size.rs +++ b/arrow-data/src/transform/variable_size.rs @@ -39,9 +39,7 @@ fn extend_offset_values>( buffer.extend_from_slice(new_values); } -pub(super) fn build_extend< - T: ArrowNativeType + Integer + CheckedAdd + AsPrimitive, ->( +pub(super) fn build_extend>( array: &ArrayData, ) -> Extend { let offsets = array.buffer::(0); @@ -54,21 +52,14 @@ pub(super) fn build_extend< // this is safe due to how offset is built. 
See details on `get_last_offset` let last_offset = unsafe { get_last_offset(offset_buffer) }; - extend_offsets::<T>( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); + extend_offsets::<T>(offset_buffer, last_offset, &offsets[start..start + len + 1]); // values extend_offset_values::(values_buffer, offsets, values, start, len); }, ) } -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { let offset_buffer = &mut mutable.buffer1; // this is safe due to how offset is built. See details on `get_last_offset` diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1a53dbddb13d..1bea347c3037 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -44,14 +44,15 @@ bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } paste = { version = "1.0" } -prost = { version = "0.11", default-features = false, features = ["prost-derive"] } +prost = { version = "0.12.1", default-features = false, features = ["prost-derive"] } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } -tonic = { version = "0.9", default-features = false, features = ["transport", "codegen", "prost"] } +tonic = { version = "0.10.0", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies -clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } -tracing-log = { version = "0.1", optional = true } -tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "fmt"], optional = true } +anyhow = { version = "1.0", optional = true } +clap = { version = "4.4.6", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage", "wrap_help", "color", "suggestions"], optional = true } +tracing-log = { version = "0.2", optional = true } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"], optional = true } [package.metadata.docs.rs] all-features = true @@ -62,7 +63,7 @@ flight-sql-experimental = ["arrow-arith", "arrow-data", "arrow-ord", "arrow-row" tls = ["tonic/tls"] # Enable CLI tools -cli = ["arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["anyhow", "arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subscriber", "tonic/tls-webpki-roots"] [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 9194b209fe72..b80772ac927e 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -44,5 +44,33 @@ that demonstrate how to build a Flight server implemented with [tonic](https://d ## Feature Flags - `flight-sql-experimental`: Enables experimental support for - [Apache Arrow FlightSQL](https://arrow.apache.org/docs/format/FlightSql.html), - a protocol for interacting with SQL databases. + [Apache Arrow FlightSQL], a protocol for interacting with SQL databases. + +## CLI + +This crate offers a basic [Apache Arrow FlightSQL] command line interface. + +The client can be installed from the repository: + +```console +$ cargo install --features=cli,flight-sql-experimental,tls --bin=flight_sql_client --path=.
--locked +``` + +The client comes with extensive help text: + +```console +$ flight_sql_client help +``` + +A query can be executed using: + +```console +$ flight_sql_client --host example.com statement-query "SELECT 1;" ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ +``` + +[apache arrow flightsql]: https://arrow.apache.org/docs/format/FlightSql.html diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 1e99957390d8..bd94d3c499ca 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use arrow_flight::sql::server::PeekableFlightDataStream; use base64::prelude::BASE64_STANDARD; use base64::Engine; use futures::{stream, Stream, TryStreamExt}; @@ -31,28 +32,26 @@ use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, RecordBatch}; use arrow_flight::encode::FlightDataEncoderBuilder; use arrow_flight::sql::metadata::{ - SqlInfoData, SqlInfoDataBuilder, XdbcTypeInfo, XdbcTypeInfoData, - XdbcTypeInfoDataBuilder, + SqlInfoData, SqlInfoDataBuilder, XdbcTypeInfo, XdbcTypeInfoData, XdbcTypeInfoDataBuilder, }; use arrow_flight::sql::{ server::FlightSqlService, ActionBeginSavepointRequest, ActionBeginSavepointResult, - ActionBeginTransactionRequest, ActionBeginTransactionResult, - ActionCancelQueryRequest, ActionCancelQueryResult, - ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, - ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, - ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, - CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + ActionBeginTransactionRequest, ActionBeginTransactionResult, ActionCancelQueryRequest, + ActionCancelQueryResult, ActionClosePreparedStatementRequest, + ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, + ActionCreatePreparedSubstraitPlanRequest, ActionEndSavepointRequest, + ActionEndTransactionRequest, Any, CommandGetCatalogs, CommandGetCrossReference, + CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, + CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementSubstraitPlan, CommandStatementUpdate, Nullable, ProstMessageExt, - Searchable, SqlInfo, TicketStatementQuery, XdbcDataType, + CommandStatementSubstraitPlan, CommandStatementUpdate, Nullable, ProstMessageExt, Searchable, + SqlInfo, TicketStatementQuery, XdbcDataType, }; use arrow_flight::utils::batches_to_flight_data; use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, Location, SchemaAsIpc, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, Action, + FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, HandshakeResponse, + IpcMessage, Location, SchemaAsIpc, Ticket, }; use arrow_ipc::writer::IpcWriteOptions; use arrow_schema::{ArrowError, DataType, Field, Schema}; @@ -166,8 +165,7 @@ impl FlightSqlService for 
FlightSqlServiceImpl { let bytes = BASE64_STANDARD .decode(base64) .map_err(|e| status!("authorization not decodable", e))?; - let str = String::from_utf8(bytes) - .map_err(|e| status!("authorization not parsable", e))?; + let str = String::from_utf8(bytes).map_err(|e| status!("authorization not parsable", e))?; let parts: Vec<_> = str.split(':').collect(); let (user, pass) = match parts.as_slice() { [user, pass] => (user, pass), @@ -194,8 +192,7 @@ impl FlightSqlService for FlightSqlServiceImpl { _message: Any, ) -> Result::DoGetStream>, Status> { self.check_token(&request)?; - let batch = - Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; + let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; let schema = batch.schema(); let batches = vec![batch]; let flight_data = batches_to_flight_data(schema.as_ref(), batches) @@ -237,8 +234,7 @@ impl FlightSqlService for FlightSqlServiceImpl { self.check_token(&request)?; let handle = std::str::from_utf8(&cmd.prepared_statement_handle) .map_err(|e| status!("Unable to parse handle", e))?; - let batch = - Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; + let batch = Self::fake_result().map_err(|e| status!("Could not fake a result", e))?; let schema = (*batch.schema()).clone(); let num_rows = batch.num_rows(); let num_bytes = batch.get_array_memory_size(); @@ -602,7 +598,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Ok(FAKE_UPDATE_RESULT) } @@ -610,7 +606,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_substrait_plan( &self, _ticket: CommandStatementSubstraitPlan, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_substrait_plan not implemented", @@ -620,7 +616,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Request>, + _request: Request, ) -> Result::DoPutStream>, Status> { Err(Status::unimplemented( "do_put_prepared_statement_query not implemented", @@ -630,7 +626,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_prepared_statement_update not implemented", @@ -735,8 +731,7 @@ async fn main() -> Result<(), Box> { if std::env::var("USE_TLS").ok().is_some() { let cert = std::fs::read_to_string("arrow-flight/examples/data/server.pem")?; let key = std::fs::read_to_string("arrow-flight/examples/data/server.key")?; - let client_ca = - std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; + let client_ca = std::fs::read_to_string("arrow-flight/examples/data/client_ca.pem")?; let tls_config = ServerTlsConfig::new() .identity(Identity::from_pem(&cert, &key)) @@ -788,7 +783,6 @@ mod tests { use arrow_cast::pretty::pretty_format_batches; use arrow_flight::sql::client::FlightSqlServiceClient; - use arrow_flight::utils::flight_data_to_batches; use tonic::transport::server::TcpIncoming; use tonic::transport::{Certificate, Endpoint}; use tower::service_fn; @@ -954,8 +948,7 @@ mod tests { let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); let flight_data = client.do_get(ticket).await.unwrap(); - let flight_data: Vec = flight_data.try_collect().await.unwrap(); - let 
batches = flight_data_to_batches(&flight_data).unwrap(); + let batches: Vec<_> = flight_data.try_collect().await.unwrap(); let res = pretty_format_batches(batches.as_slice()).unwrap(); let expected = r#" diff --git a/arrow-flight/examples/server.rs b/arrow-flight/examples/server.rs index 1ed21acef9b8..85ac4ca1384c 100644 --- a/arrow-flight/examples/server.rs +++ b/arrow-flight/examples/server.rs @@ -20,9 +20,9 @@ use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, Action, + ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, + HandshakeResponse, PutResult, SchemaResult, Ticket, }; #[derive(Clone)] diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 8f889c0a7cb9..4f7a032f51e5 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -32,6 +32,6 @@ publish = false [dependencies] # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = { version = "=1.0.66", default-features = false } -prost-build = { version = "=0.11.9", default-features = false } -tonic-build = { version = "=0.9.2", default-features = false, features = ["transport", "prost"] } +proc-macro2 = { version = "=1.0.70", default-features = false } +prost-build = { version = "=0.12.3", default-features = false } +tonic-build = { version = "=0.10.2", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index 10dc7ace0356..e76013bd7c5f 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -685,7 +685,7 @@ pub mod flight_service_server { #[async_trait] pub trait FlightService: Send + Sync + 'static { /// Server streaming response type for the Handshake method. - type HandshakeStream: futures_core::Stream< + type HandshakeStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -700,7 +700,7 @@ pub mod flight_service_server { request: tonic::Request>, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the ListFlights method. - type ListFlightsStream: futures_core::Stream< + type ListFlightsStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -744,7 +744,7 @@ pub mod flight_service_server { request: tonic::Request, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoGet method. - type DoGetStream: futures_core::Stream< + type DoGetStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -759,7 +759,7 @@ pub mod flight_service_server { request: tonic::Request, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoPut method. - type DoPutStream: futures_core::Stream< + type DoPutStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -776,7 +776,7 @@ pub mod flight_service_server { request: tonic::Request>, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoExchange method. 
- type DoExchangeStream: futures_core::Stream< + type DoExchangeStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -792,7 +792,7 @@ pub mod flight_service_server { request: tonic::Request>, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the DoAction method. - type DoActionStream: futures_core::Stream< + type DoActionStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -809,7 +809,7 @@ pub mod flight_service_server { request: tonic::Request, ) -> std::result::Result, tonic::Status>; /// Server streaming response type for the ListActions method. - type ListActionsStream: futures_core::Stream< + type ListActionsStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, > + Send @@ -930,7 +930,9 @@ pub mod flight_service_server { >, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).handshake(request).await }; + let fut = async move { + ::handshake(&inner, request).await + }; Box::pin(fut) } } @@ -976,7 +978,7 @@ pub mod flight_service_server { ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - (*inner).list_flights(request).await + ::list_flights(&inner, request).await }; Box::pin(fut) } @@ -1022,7 +1024,7 @@ pub mod flight_service_server { ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - (*inner).get_flight_info(request).await + ::get_flight_info(&inner, request).await }; Box::pin(fut) } @@ -1067,7 +1069,9 @@ pub mod flight_service_server { request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).get_schema(request).await }; + let fut = async move { + ::get_schema(&inner, request).await + }; Box::pin(fut) } } @@ -1112,7 +1116,9 @@ pub mod flight_service_server { request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_get(request).await }; + let fut = async move { + ::do_get(&inner, request).await + }; Box::pin(fut) } } @@ -1157,7 +1163,9 @@ pub mod flight_service_server { request: tonic::Request>, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_put(request).await }; + let fut = async move { + ::do_put(&inner, request).await + }; Box::pin(fut) } } @@ -1202,7 +1210,9 @@ pub mod flight_service_server { request: tonic::Request>, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_exchange(request).await }; + let fut = async move { + ::do_exchange(&inner, request).await + }; Box::pin(fut) } } @@ -1247,7 +1257,9 @@ pub mod flight_service_server { request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = async move { (*inner).do_action(request).await }; + let fut = async move { + ::do_action(&inner, request).await + }; Box::pin(fut) } } @@ -1293,7 +1305,7 @@ pub mod flight_service_server { ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - (*inner).list_actions(request).await + ::list_actions(&inner, request).await }; Box::pin(fut) } diff --git a/arrow-flight/src/bin/flight_sql_client.rs b/arrow-flight/src/bin/flight_sql_client.rs index 20c8062f899e..296efc1c308e 100644 --- a/arrow-flight/src/bin/flight_sql_client.rs +++ b/arrow-flight/src/bin/flight_sql_client.rs @@ -17,62 +17,58 @@ use std::{sync::Arc, time::Duration}; -use arrow_array::RecordBatch; -use arrow_cast::pretty::pretty_format_batches; -use arrow_flight::{ - 
sql::client::FlightSqlServiceClient, utils::flight_data_to_batches, FlightData, -}; -use arrow_schema::{ArrowError, Schema}; -use clap::Parser; +use anyhow::{bail, Context, Result}; +use arrow_array::{ArrayRef, Datum, RecordBatch, StringArray}; +use arrow_cast::{cast_with_options, pretty::pretty_format_batches, CastOptions}; +use arrow_flight::{sql::client::FlightSqlServiceClient, FlightInfo}; +use arrow_schema::Schema; +use clap::{Parser, Subcommand}; use futures::TryStreamExt; -use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use tonic::{ + metadata::MetadataMap, + transport::{Channel, ClientTlsConfig, Endpoint}, +}; use tracing_log::log::info; -/// A ':' separated key value pair -#[derive(Debug, Clone)] -struct KeyValue<K, V> { - pub key: K, - pub value: V, -} - -impl<K, V> std::str::FromStr for KeyValue<K, V> -where - K: std::str::FromStr, - V: std::str::FromStr, - K::Err: std::fmt::Display, - V::Err: std::fmt::Display, -{ - type Err = String; - - fn from_str(s: &str) -> std::result::Result<Self, Self::Err> { - let parts = s.splitn(2, ':').collect::<Vec<_>>(); - match parts.as_slice() { - [key, value] => { - let key = K::from_str(key).map_err(|e| e.to_string())?; - let value = V::from_str(value.trim()).map_err(|e| e.to_string())?; - Ok(Self { key, value }) - } - _ => Err(format!( - "Invalid key value pair - expected 'KEY:VALUE' got '{s}'" - )), - } - } +/// Logging CLI config. +#[derive(Debug, Parser)] +pub struct LoggingArgs { + /// Log verbosity. + /// + /// Defaults to "warn". + /// + /// Use `-v` for "info", `-vv` for "debug", `-vvv` for "trace". + /// + /// Note you can also set the logging level using the `RUST_LOG` environment variable: + /// `RUST_LOG=debug`. + #[clap( + short = 'v', + long = "verbose", + action = clap::ArgAction::Count, + )] + log_verbose_count: u8, } #[derive(Debug, Parser)] struct ClientArgs { /// Additional headers. /// - /// Values should be key value pairs separated by ':' - #[clap(long, value_delimiter = ',')] - headers: Vec<KeyValue<String, String>>, + /// Can be given multiple times. Headers and values are separated by '='. + /// + /// Example: `-H foo=bar -H baz=42` + #[clap(long = "header", short = 'H', value_parser = parse_key_val)] + headers: Vec<(String, String)>, - /// Username - #[clap(long)] + /// Username. + /// + /// Optional. If given, `password` must also be set. + #[clap(long, requires = "password")] username: Option<String>, - /// Password - #[clap(long)] + /// Password. + /// + /// Optional. If given, `username` must also be set. + #[clap(long, requires = "username")] password: Option<String>, /// Auth token. @@ -80,78 +76,199 @@ struct ClientArgs { token: Option<String>, /// Use TLS. + /// + /// If not provided, use a cleartext connection. #[clap(long)] tls: bool, /// Server host. + /// + /// Required. #[clap(long)] host: String, /// Server port. + /// + /// Defaults to `443` if `tls` is set, otherwise defaults to `80`. #[clap(long)] port: Option<u16>, } #[derive(Debug, Parser)] struct Args { + /// Logging args. + #[clap(flatten)] + logging_args: LoggingArgs, + /// Client args. #[clap(flatten)] client_args: ClientArgs, - /// SQL query. - query: String, + #[clap(subcommand)] + cmd: Command, +} + +/// Different available commands. +#[derive(Debug, Subcommand)] +enum Command { + /// Execute the given statement. + StatementQuery { + /// SQL query. + /// + /// Required. + query: String, + }, + + /// Prepare the given statement and then execute it. + PreparedStatementQuery { + /// SQL query. + /// + /// Required. + /// + /// Can contain placeholders like `$1`.
+ /// + /// Example: `SELECT * FROM t WHERE x = $1` + query: String, + + /// Additional parameters. + /// + /// Can be given multiple times. Names and values are separated by '='. Values will be + /// converted to the type that the server reported for the prepared statement. + /// + /// Example: `-p $1=42` + #[clap(short, value_parser = parse_key_val)] + params: Vec<(String, String)>, + }, } #[tokio::main] -async fn main() { +async fn main() -> Result<()> { let args = Args::parse(); - setup_logging(); - let mut client = setup_client(args.client_args).await.expect("setup client"); + setup_logging(args.logging_args)?; + let mut client = setup_client(args.client_args) + .await + .context("setup client")?; + + let flight_info = match args.cmd { + Command::StatementQuery { query } => client + .execute(query, None) + .await + .context("execute statement")?, + Command::PreparedStatementQuery { query, params } => { + let mut prepared_stmt = client + .prepare(query, None) + .await + .context("prepare statement")?; + + if !params.is_empty() { + prepared_stmt + .set_parameters( + construct_record_batch_from_params( + ¶ms, + prepared_stmt + .parameter_schema() + .context("get parameter schema")?, + ) + .context("construct parameters")?, + ) + .context("bind parameters")?; + } - let info = client - .execute(args.query, None) + prepared_stmt + .execute() + .await + .context("execute prepared statement")? + } + }; + + let batches = execute_flight(&mut client, flight_info) .await - .expect("prepare statement"); - info!("got flight info"); + .context("read flight data")?; + + let res = pretty_format_batches(batches.as_slice()).context("format results")?; + println!("{res}"); + + Ok(()) +} - let schema = Arc::new(Schema::try_from(info.clone()).expect("valid schema")); +async fn execute_flight( + client: &mut FlightSqlServiceClient, + info: FlightInfo, +) -> Result> { + let schema = Arc::new(Schema::try_from(info.clone()).context("valid schema")?); let mut batches = Vec::with_capacity(info.endpoint.len() + 1); batches.push(RecordBatch::new_empty(schema)); info!("decoded schema"); for endpoint in info.endpoint { let Some(ticket) = &endpoint.ticket else { - panic!("did not get ticket"); + bail!("did not get ticket"); }; - let flight_data = client.do_get(ticket.clone()).await.expect("do get"); - let flight_data: Vec = flight_data + + let mut flight_data = client.do_get(ticket.clone()).await.context("do get")?; + log_metadata(flight_data.headers(), "header"); + + let mut endpoint_batches: Vec<_> = (&mut flight_data) .try_collect() .await - .expect("collect data stream"); - let mut endpoint_batches = flight_data_to_batches(&flight_data) - .expect("convert flight data to record batches"); + .context("collect data stream")?; batches.append(&mut endpoint_batches); + + if let Some(trailers) = flight_data.trailers() { + log_metadata(&trailers, "trailer"); + } } info!("received data"); - let res = pretty_format_batches(batches.as_slice()).expect("format results"); - println!("{res}"); + Ok(batches) +} + +fn construct_record_batch_from_params( + params: &[(String, String)], + parameter_schema: &Schema, +) -> Result { + let mut items = Vec::<(&String, ArrayRef)>::new(); + + for (name, value) in params { + let field = parameter_schema.field_with_name(name)?; + let value_as_array = StringArray::new_scalar(value); + let casted = cast_with_options( + value_as_array.get().0, + field.data_type(), + &CastOptions::default(), + )?; + items.push((name, casted)) + } + + Ok(RecordBatch::try_from_iter(items)?) 
-fn setup_logging() { - tracing_log::LogTracer::init().expect("tracing log init"); - tracing_subscriber::fmt::init(); +fn setup_logging(args: LoggingArgs) -> Result<()> { + use tracing_subscriber::{util::SubscriberInitExt, EnvFilter, FmtSubscriber}; + + tracing_log::LogTracer::init().context("tracing log init")?; + + let filter = match args.log_verbose_count { + 0 => "warn", + 1 => "info", + 2 => "debug", + _ => "trace", + }; + let filter = EnvFilter::try_new(filter).context("set up log env filter")?; + + let subscriber = FmtSubscriber::builder().with_env_filter(filter).finish(); + subscriber.try_init().context("init logging subscriber")?; + + Ok(()) } -async fn setup_client( - args: ClientArgs, -) -> Result<FlightSqlServiceClient<Channel>, ArrowError> { +async fn setup_client(args: ClientArgs) -> Result<FlightSqlServiceClient<Channel>> { let port = args.port.unwrap_or(if args.tls { 443 } else { 80 }); let protocol = if args.tls { "https" } else { "http" }; let mut endpoint = Endpoint::new(format!("{}://{}:{}", protocol, args.host, port)) - .map_err(|_| ArrowError::IpcError("Cannot create endpoint".to_string()))? + .context("create endpoint")? .connect_timeout(Duration::from_secs(20)) .timeout(Duration::from_secs(20)) .tcp_nodelay(true) // Disable Nagle's Algorithm since we don't want packets to wait @@ -162,21 +279,18 @@ async fn setup_client( if args.tls { let tls_config = ClientTlsConfig::new(); - endpoint = endpoint.tls_config(tls_config).map_err(|_| { - ArrowError::IpcError("Cannot create TLS endpoint".to_string()) - })?; + endpoint = endpoint + .tls_config(tls_config) + .context("create TLS endpoint")?; } - let channel = endpoint - .connect() - .await - .map_err(|e| ArrowError::IpcError(format!("Cannot connect to endpoint: {e}")))?; + let channel = endpoint.connect().await.context("connect to endpoint")?; let mut client = FlightSqlServiceClient::new(channel); info!("connected"); - for kv in args.headers { - client.set_header(kv.key, kv.value); + for (k, v) in args.headers { + client.set_header(k, v); } if let Some(token) = args.token { @@ -190,16 +304,48 @@ async fn setup_client( client .handshake(&username, &password) .await - .expect("handshake"); + .context("handshake")?; info!("performed handshake"); } (Some(_), None) => { - panic!("when username is set, you also need to set a password") + bail!("when username is set, you also need to set a password") } (None, Some(_)) => { - panic!("when password is set, you also need to set a username") + bail!("when password is set, you also need to set a username") } } Ok(client) } + +/// Parse a single key-value pair +fn parse_key_val(s: &str) -> Result<(String, String), String> { + let pos = s + .find('=') + .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?; + Ok((s[..pos].to_owned(), s[pos + 1..].to_owned())) +}
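For reference, `parse_key_val` splits only on the first `=`, so header and parameter values may themselves contain `=`. A self-contained check of that behavior (duplicating the function so the snippet runs on its own):

```rust
/// Same splitting rule as `parse_key_val` above: everything before the
/// first `=` is the key, everything after it is the value.
fn parse_key_val(s: &str) -> Result<(String, String), String> {
    let pos = s
        .find('=')
        .ok_or_else(|| format!("invalid KEY=value: no `=` found in `{s}`"))?;
    Ok((s[..pos].to_owned(), s[pos + 1..].to_owned()))
}

fn main() {
    // `-H foo=bar` becomes ("foo", "bar").
    assert_eq!(parse_key_val("foo=bar"), Ok(("foo".into(), "bar".into())));
    // Only the first `=` splits, so values may themselves contain `=`.
    assert_eq!(parse_key_val("k=a=b"), Ok(("k".into(), "a=b".into())));
    // A missing `=` is rejected, which clap surfaces as a usage error.
    assert!(parse_key_val("no-separator").is_err());
}
```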
+/// Log headers/trailers. +fn log_metadata(map: &MetadataMap, what: &'static str) { + for k_v in map.iter() { + match k_v { + tonic::metadata::KeyAndValueRef::Ascii(k, v) => { + info!( + "{}: {}={}", + what, + k.as_str(), + v.to_str().unwrap_or(""), + ); + } + tonic::metadata::KeyAndValueRef::Binary(k, v) => { + info!( + "{}: {}={}", + what, + k.as_str(), + String::from_utf8_lossy(v.as_ref()), + ); + } + } + } +} diff --git a/arrow-flight/src/client.rs b/arrow-flight/src/client.rs index 8793f7834bfb..a264012c82ec 100644 --- a/arrow-flight/src/client.rs +++ b/arrow-flight/src/client.rs @@ -249,10 +249,7 @@ impl FlightClient { /// .expect("error fetching data"); /// # } /// ``` - pub async fn get_flight_info( - &mut self, - descriptor: FlightDescriptor, - ) -> Result<FlightInfo> { + pub async fn get_flight_info(&mut self, descriptor: FlightDescriptor) -> Result<FlightInfo> { let request = self.make_request(descriptor); let response = self.inner.get_flight_info(request).await?.into_inner(); @@ -452,10 +449,7 @@ impl FlightClient { /// .expect("error making request"); /// # } /// ``` - pub async fn get_schema( - &mut self, - flight_descriptor: FlightDescriptor, - ) -> Result<Schema> { + pub async fn get_schema(&mut self, flight_descriptor: FlightDescriptor) -> Result<Schema> { let request = self.make_request(flight_descriptor); let schema_result = self.inner.get_schema(request).await?.into_inner(); @@ -488,9 +482,7 @@ impl FlightClient { /// .expect("error gathering actions"); /// # } /// ``` - pub async fn list_actions( - &mut self, - ) -> Result<BoxStream<'static, Result<ActionType>>> { + pub async fn list_actions(&mut self) -> Result<BoxStream<'static, Result<ActionType>>> { let request = self.make_request(Empty {}); let action_stream = self @@ -528,10 +520,7 @@ impl FlightClient { /// .expect("error gathering action results"); /// # } /// ``` - pub async fn do_action( - &mut self, - action: Action, - ) -> Result<BoxStream<'static, Result<Bytes>>> { + pub async fn do_action(&mut self, action: Action) -> Result<BoxStream<'static, Result<Bytes>>> { let request = self.make_request(action); let result_stream = self diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index dfcdd260602c..95bbe2b46bb2 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -21,9 +21,7 @@ use arrow_buffer::Buffer; use arrow_schema::{Schema, SchemaRef}; use bytes::Bytes; use futures::{ready, stream::BoxStream, Stream, StreamExt}; -use std::{ - collections::HashMap, convert::TryFrom, fmt::Debug, pin::Pin, sync::Arc, task::Poll, -}; +use std::{collections::HashMap, convert::TryFrom, fmt::Debug, pin::Pin, sync::Arc, task::Poll}; use tonic::metadata::MetadataMap; use crate::error::{FlightError, Result}; @@ -270,16 +268,14 @@ impl FlightDataDecoder { /// state as necessary.
fn extract_message(&mut self, data: FlightData) -> Result<Option<DecodedFlightData>> { use arrow_ipc::MessageHeader; - let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|e| { - FlightError::DecodeError(format!("Error decoding root message: {e}")) - })?; + let message = arrow_ipc::root_as_message(&data.data_header[..]) + .map_err(|e| FlightError::DecodeError(format!("Error decoding root message: {e}")))?; match message.header_type() { MessageHeader::NONE => Ok(Some(DecodedFlightData::new_none(data))), MessageHeader::Schema => { - let schema = Schema::try_from(&data).map_err(|e| { - FlightError::DecodeError(format!("Error decoding schema: {e}")) - })?; + let schema = Schema::try_from(&data) + .map_err(|e| FlightError::DecodeError(format!("Error decoding schema: {e}")))?; let schema = Arc::new(schema); let dictionaries_by_field = HashMap::new(); @@ -300,12 +296,11 @@ impl FlightDataDecoder { }; let buffer = Buffer::from_bytes(data.data_body.into()); - let dictionary_batch = - message.header_as_dictionary_batch().ok_or_else(|| { - FlightError::protocol( - "Could not get dictionary batch from DictionaryBatch message", - ) - })?; + let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| { + FlightError::protocol( + "Could not get dictionary batch from DictionaryBatch message", + ) + })?; arrow_ipc::reader::read_dictionary( &buffer, @@ -315,9 +310,7 @@ impl FlightDataDecoder { &message.version(), ) .map_err(|e| { - FlightError::DecodeError(format!( - "Error decoding ipc dictionary: {e}" - )) + FlightError::DecodeError(format!("Error decoding ipc dictionary: {e}")) })?; // Updated internal state, but no decoded message @@ -338,9 +331,7 @@ &state.dictionaries_by_field, ) .map_err(|e| { - FlightError::DecodeError(format!( - "Error decoding ipc RecordBatch: {e}" - )) + FlightError::DecodeError(format!("Error decoding ipc RecordBatch: {e}")) })?; Ok(Some(DecodedFlightData::new_record_batch(data, batch)))
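To see the decode side in context: a consumer does not call `extract_message` directly but drives a `FlightDataDecoder` and keeps only the record batch payloads, since schema and dictionary messages merely update decoder state. A sketch under that assumption, given a stream of raw `FlightData` such as a `do_get` response body:

```rust
use arrow_array::RecordBatch;
use arrow_flight::decode::{DecodedPayload, FlightDataDecoder};
use arrow_flight::error::Result;
use arrow_flight::FlightData;
use futures::{Stream, StreamExt};

// Drain a stream of raw FlightData into record batches.
async fn collect_batches(
    flight_data: impl Stream<Item = Result<FlightData>> + Send + 'static,
) -> Result<Vec<RecordBatch>> {
    let mut decoder = FlightDataDecoder::new(flight_data);
    let mut batches = vec![];
    while let Some(decoded) = decoder.next().await {
        match decoded?.payload {
            // Schema and dictionary messages only update decoder state;
            // dictionary batches surface as `None` payloads.
            DecodedPayload::None | DecodedPayload::Schema(_) => {}
            DecodedPayload::RecordBatch(batch) => batches.push(batch),
        }
    }
    Ok(batches)
}
```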
diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index cd2ee7c02b68..e6ef9994d487 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -30,10 +30,17 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt}; /// This can be used to implement [`FlightService::do_get`] in an /// Arrow Flight implementation; /// +/// This structure encodes a stream of `Result`s rather than `RecordBatch`es to +/// propagate errors from streaming execution, where the generation of the +/// `RecordBatch`es is incremental, and an error may occur even after +/// several have already been successfully produced. +/// /// # Caveats -/// 1. [`DictionaryArray`](arrow_array::array::DictionaryArray)s -/// are converted to their underlying types prior to transport, due to -/// <https://github.com/apache/arrow-rs/issues/3389>. +/// 1. When [`DictionaryHandling`] is [`DictionaryHandling::Hydrate`], [`DictionaryArray`](arrow_array::array::DictionaryArray)s +/// are converted to their underlying types prior to transport. +/// When [`DictionaryHandling`] is [`DictionaryHandling::Resend`], Dictionary [`FlightData`] is sent with every +/// [`RecordBatch`] that contains a [`DictionaryArray`](arrow_array::array::DictionaryArray). +/// See <https://github.com/apache/arrow-rs/issues/3389>. /// /// # Example /// ```no_run /// # use std::sync::Arc; /// # use arrow_array::{ArrayRef, RecordBatch, UInt32Array}; /// # async fn f() { /// # let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); -/// # let record_batch = RecordBatch::try_from_iter(vec![ +/// # let batch = RecordBatch::try_from_iter(vec![ /// # ("a", Arc::new(c1) as ArrayRef) /// # ]) /// # .expect("cannot create record batch"); /// use arrow_flight::encode::FlightDataEncoderBuilder; /// /// // Get an input stream of Result<RecordBatch> -/// let input_stream = futures::stream::iter(vec![Ok(record_batch)]); +/// let input_stream = futures::stream::iter(vec![Ok(batch)]); /// /// // Build a stream of `Result<FlightData>` (e.g. to return for do_get) /// let flight_data_stream = FlightDataEncoderBuilder::new() @@ -59,6 +66,39 @@ /// # } /// ``` /// +/// # Example: Sending `Vec<RecordBatch>` +/// +/// You can create a [`Stream`] to pass to [`Self::build`] from an existing +/// `Vec` of `RecordBatch`es like this: +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, RecordBatch, UInt32Array}; +/// # async fn f() { +/// # fn make_batches() -> Vec<RecordBatch> { +/// # let c1 = UInt32Array::from(vec![1, 2, 3, 4, 5, 6]); +/// # let batch = RecordBatch::try_from_iter(vec![ +/// # ("a", Arc::new(c1) as ArrayRef) +/// # ]) +/// # .expect("cannot create record batch"); +/// # vec![batch.clone(), batch.clone()] +/// # } +/// use arrow_flight::encode::FlightDataEncoderBuilder; +/// +/// // Get batches that you want to send via Flight +/// let batches: Vec<RecordBatch> = make_batches(); +/// +/// // Create an input stream of Result<RecordBatch> +/// let input_stream = futures::stream::iter( +/// batches.into_iter().map(Ok) +/// ); +/// +/// // Build a stream of `Result<FlightData>` (e.g. to return for do_get) +/// let flight_data_stream = FlightDataEncoderBuilder::new() +/// .build(input_stream); +/// # } +/// ``` +/// /// [`FlightService::do_get`]: crate::flight_service_server::FlightService::do_get /// [`FlightError`]: crate::error::FlightError #[derive(Debug)] @@ -74,6 +114,9 @@ pub struct FlightDataEncoderBuilder { schema: Option<SchemaRef>, /// Optional flight descriptor, if known before data. descriptor: Option<FlightDescriptor>, + /// Determines how `DictionaryArray`s are encoded for transport. + /// See [`DictionaryHandling`] for more information. + dictionary_handling: DictionaryHandling, } /// Default target size for encoded [`FlightData`]. @@ -90,6 +133,7 @@ impl Default for FlightDataEncoderBuilder { app_metadata: Bytes::new(), schema: None, descriptor: None, + dictionary_handling: DictionaryHandling::Hydrate, } } } @@ -114,6 +158,12 @@ impl FlightDataEncoderBuilder { self } + /// Set [`DictionaryHandling`] for encoder + pub fn with_dictionary_handling(mut self, dictionary_handling: DictionaryHandling) -> Self { + self.dictionary_handling = dictionary_handling; + self + } + /// Specify application specific metadata included in the /// [`FlightData::app_metadata`] field of the first Schema /// message @@ -138,16 +188,15 @@ impl FlightDataEncoderBuilder { } /// Specify a flight descriptor in the first FlightData message. - pub fn with_flight_descriptor( - mut self, - descriptor: Option<FlightDescriptor>, - ) -> Self { + pub fn with_flight_descriptor(mut self, descriptor: Option<FlightDescriptor>) -> Self { self.descriptor = descriptor; self }
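Because the builder consumes a stream of `Result`s, a failure on the producing side simply flows through as an item; batches already encoded are not lost. A sketch of that propagation (the error text is invented for illustration):

```rust
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_flight::encode::FlightDataEncoderBuilder;
use arrow_flight::error::FlightError;
use futures::StreamExt;
use std::sync::Arc;

#[tokio::main]
async fn main() {
    let batch = RecordBatch::try_from_iter(vec![(
        "a",
        Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
    )])
    .unwrap();

    // The input is a stream of Results: a failure can show up after
    // batches that were already produced, and the encoder forwards it.
    let input = futures::stream::iter(vec![
        Ok(batch),
        Err(FlightError::ProtocolError("source failed mid-stream".into())),
    ]);

    let mut flight_data = FlightDataEncoderBuilder::new().build(input);
    while let Some(item) = flight_data.next().await {
        match item {
            Ok(data) => println!("encoded {} body bytes", data.data_body.len()),
            Err(e) => println!("error propagated: {e}"),
        }
    }
}
```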
- /// Return a [`Stream`] of [`FlightData`], - /// consuming self. More details on [`FlightDataEncoder`] + /// Takes a [`Stream`] of [`Result<RecordBatch>`] and returns a [`Stream`] + /// of [`FlightData`], consuming self. + /// + /// See example on [`Self`] and [`FlightDataEncoder`] for more details pub fn build<S>(self, input: S) -> FlightDataEncoder where S: Stream<Item = Result<RecordBatch>> + Send + 'static, @@ -158,6 +207,7 @@ impl FlightDataEncoderBuilder { app_metadata, schema, descriptor, + dictionary_handling, } = self; FlightDataEncoder::new( @@ -167,6 +217,7 @@ impl FlightDataEncoderBuilder { options, app_metadata, descriptor, + dictionary_handling, ) } } @@ -192,6 +243,9 @@ pub struct FlightDataEncoder { done: bool, /// cleared after the first FlightData message is sent descriptor: Option<FlightDescriptor>, + /// Determines how `DictionaryArray`s are encoded for transport. + /// See [`DictionaryHandling`] for more information. + dictionary_handling: DictionaryHandling, } impl FlightDataEncoder { @@ -202,16 +256,21 @@ impl FlightDataEncoder { options: IpcWriteOptions, app_metadata: Bytes, descriptor: Option<FlightDescriptor>, + dictionary_handling: DictionaryHandling, ) -> Self { let mut encoder = Self { inner, schema: None, max_flight_data_size, - encoder: FlightIpcEncoder::new(options), + encoder: FlightIpcEncoder::new( + options, + dictionary_handling != DictionaryHandling::Resend, + ), app_metadata: Some(app_metadata), queue: VecDeque::new(), done: false, descriptor, + dictionary_handling, }; // If schema is known up front, enqueue it immediately @@ -242,7 +301,8 @@ impl FlightDataEncoder { fn encode_schema(&mut self, schema: &SchemaRef) -> SchemaRef { // The first message is the schema message, and all // batches have the same schema - let schema = Arc::new(prepare_schema_for_flight(schema)); + let send_dictionaries = self.dictionary_handling == DictionaryHandling::Resend; + let schema = Arc::new(prepare_schema_for_flight(schema, send_dictionaries)); let mut schema_flight_data = self.encoder.encode_schema(&schema); // attach any metadata requested @@ -264,11 +324,11 @@ impl FlightDataEncoder { }; // encode the batch - let batch = prepare_batch_for_flight(&batch, schema)?; + let send_dictionaries = self.dictionary_handling == DictionaryHandling::Resend; + let batch = prepare_batch_for_flight(&batch, schema, send_dictionaries)?; for batch in split_batch_for_grpc_response(batch, self.max_flight_data_size) { - let (flight_dictionaries, flight_batch) = - self.encoder.encode_batch(&batch)?; + let (flight_dictionaries, flight_batch) = self.encoder.encode_batch(&batch)?; self.queue_messages(flight_dictionaries); self.queue_message(flight_batch); @@ -325,17 +385,46 @@ impl Stream for FlightDataEncoder { } } +/// Defines how a [`FlightDataEncoder`] encodes [`DictionaryArray`]s +/// +/// [`DictionaryArray`]: arrow_array::DictionaryArray +#[derive(Debug, PartialEq)] +pub enum DictionaryHandling { + /// Expands to the underlying type (default). This likely sends more data + /// over the network but requires less memory (dictionaries are not tracked) + /// and is more compatible with other arrow flight client implementations + /// that may not support `DictionaryEncoding` + /// + /// An IPC response, streaming or otherwise, defines its schema up front + /// which defines the mapping from dictionary IDs. It then sends these + /// dictionaries over the wire.
+ /// + /// This requires identifying the different dictionaries in use, assigning + /// them IDs, and sending new dictionaries, delta or otherwise, when needed + /// + /// See also: + /// * <https://github.com/apache/arrow-rs/issues/3389> + Hydrate, + /// Send dictionary FlightData with every RecordBatch that contains a + /// [`DictionaryArray`]. See [`Self::Hydrate`] for more tradeoffs. No + /// attempt is made to skip sending the same (logical) dictionary values + /// twice. + /// + /// [`DictionaryArray`]: arrow_array::DictionaryArray + Resend, +} + /// Prepare an arrow Schema for transport over the Arrow Flight protocol /// /// Convert dictionary types to underlying types /// /// See hydrate_dictionary for more information -fn prepare_schema_for_flight(schema: &Schema) -> Schema { +fn prepare_schema_for_flight(schema: &Schema, send_dictionaries: bool) -> Schema { let fields: Fields = schema .fields() .iter() .map(|field| match field.data_type() { - DataType::Dictionary(_, value_type) => Field::new( + DataType::Dictionary(_, value_type) if !send_dictionaries => Field::new( field.name(), value_type.as_ref().clone(), field.is_nullable(), @@ -364,9 +453,8 @@ fn split_batch_for_grpc_response( .map(|col| col.get_buffer_memory_size()) .sum::<usize>(); - let n_batches = (size / max_flight_data_size - + usize::from(size % max_flight_data_size != 0)) - .max(1); + let n_batches = + (size / max_flight_data_size + usize::from(size % max_flight_data_size != 0)).max(1); let rows_per_batch = (batch.num_rows() / n_batches).max(1); let mut out = Vec::with_capacity(n_batches + 1); @@ -394,8 +482,7 @@ struct FlightIpcEncoder { } impl FlightIpcEncoder { - fn new(options: IpcWriteOptions) -> Self { - let error_on_replacement = true; + fn new(options: IpcWriteOptions, error_on_replacement: bool) -> Self { Self { options, data_gen: IpcDataGenerator::default(), @@ -410,18 +497,12 @@ impl FlightIpcEncoder { /// Convert a `RecordBatch` to a Vec of `FlightData` representing /// dictionaries and a `FlightData` representing the batch - fn encode_batch( - &mut self, - batch: &RecordBatch, - ) -> Result<(Vec<FlightData>, FlightData)> { - let (encoded_dictionaries, encoded_batch) = self.data_gen.encoded_batch( - batch, - &mut self.dictionary_tracker, - &self.options, - )?; - - let flight_dictionaries = - encoded_dictionaries.into_iter().map(Into::into).collect(); + fn encode_batch(&mut self, batch: &RecordBatch) -> Result<(Vec<FlightData>, FlightData)> { + let (encoded_dictionaries, encoded_batch) = + self.data_gen + .encoded_batch(batch, &mut self.dictionary_tracker, &self.options)?; + + let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); + let flight_batch = encoded_batch.into(); Ok((flight_dictionaries, flight_batch)) @@ -438,12 +519,14 @@ impl FlightIpcEncoder { fn prepare_batch_for_flight( batch: &RecordBatch, schema: SchemaRef, + send_dictionaries: bool, ) -> Result<RecordBatch> { let columns = batch .columns() .iter() - .map(hydrate_dictionary) + .map(|c| hydrate_dictionary(c, send_dictionaries)) .collect::<Result<Vec<_>>>()?; + let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows())); Ok(RecordBatch::try_new_with_options(
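A short sketch of selecting the non-default behavior via the builder; with `Resend`, dictionary `FlightData` accompanies every batch, while the default `Hydrate` would expand the column to plain `Utf8` first:

```rust
use arrow_array::types::Int32Type;
use arrow_array::{ArrayRef, DictionaryArray, RecordBatch};
use arrow_flight::encode::{DictionaryHandling, FlightDataEncoderBuilder};
use std::sync::Arc;

fn main() {
    // A batch with one dictionary-encoded string column.
    let dict: DictionaryArray<Int32Type> = vec!["a", "a", "b"].into_iter().collect();
    let batch =
        RecordBatch::try_from_iter(vec![("tag", Arc::new(dict) as ArrayRef)]).unwrap();

    // Resend keeps the column dictionary-encoded on the wire; the
    // resulting stream can be returned from a do_get implementation.
    let _flight_data = FlightDataEncoderBuilder::new()
        .with_dictionary_handling(DictionaryHandling::Resend)
        .build(futures::stream::iter(vec![Ok(batch)]));
}
```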
@@ -451,35 +534,26 @@ fn prepare_batch_for_flight( )?) } -/// Hydrates a dictionary to its underlying type -/// -/// An IPC response, streaming or otherwise, defines its schema up front -/// which defines the mapping from dictionary IDs. It then sends these -/// dictionaries over the wire. -/// -/// This requires identifying the different dictionaries in use, assigning -/// them IDs, and sending new dictionaries, delta or otherwise, when needed -/// -/// See also: -/// * <https://github.com/apache/arrow-rs/issues/3389> -/// -/// For now we just hydrate the dictionaries to their underlying type -fn hydrate_dictionary(array: &ArrayRef) -> Result<ArrayRef> { - let arr = if let DataType::Dictionary(_, value) = array.data_type() { - arrow_cast::cast(array, value)? - } else { - Arc::clone(array) +/// Hydrates a dictionary to its underlying type if send_dictionaries is false. If send_dictionaries +/// is true, dictionaries are sent with every batch, which is not as optimal as described in [DictionaryHandling::Hydrate] above, +/// but does enable sending `DictionaryArray`s via Flight. +fn hydrate_dictionary(array: &ArrayRef, send_dictionaries: bool) -> Result<ArrayRef> { + let arr = match array.data_type() { + DataType::Dictionary(_, value) if !send_dictionaries => arrow_cast::cast(array, value)?, + _ => Arc::clone(array), }; Ok(arr) } #[cfg(test)] mod tests { - use arrow_array::types::*; use arrow_array::*; + use arrow_array::{cast::downcast_array, types::*}; use arrow_cast::pretty::pretty_format_batches; use std::collections::HashMap; + use crate::decode::{DecodedPayload, FlightDataDecoder}; + use super::*; #[test] @@ -496,11 +570,9 @@ let (_, baseline_flight_batch) = make_flight_data(&batch, &options); let big_batch = batch.slice(0, batch.num_rows() - 1); - let optimized_big_batch = - prepare_batch_for_flight(&big_batch, Arc::clone(&schema)) - .expect("failed to optimize"); - let (_, optimized_big_flight_batch) = - make_flight_data(&optimized_big_batch, &options); + let optimized_big_batch = prepare_batch_for_flight(&big_batch, Arc::clone(&schema), false) + .expect("failed to optimize"); + let (_, optimized_big_flight_batch) = make_flight_data(&optimized_big_batch, &options); assert_eq!( baseline_flight_batch.data_body.len(), @@ -509,25 +581,96 @@ let small_batch = batch.slice(0, 1); let optimized_small_batch = - prepare_batch_for_flight(&small_batch, Arc::clone(&schema)) + prepare_batch_for_flight(&small_batch, Arc::clone(&schema), false) .expect("failed to optimize"); - let (_, optimized_small_flight_batch) = - make_flight_data(&optimized_small_batch, &options); + let (_, optimized_small_flight_batch) = make_flight_data(&optimized_small_batch, &options); assert!( - baseline_flight_batch.data_body.len() - > optimized_small_flight_batch.data_body.len() + baseline_flight_batch.data_body.len() > optimized_small_flight_batch.data_body.len() ); } + #[tokio::test] + async fn test_dictionary_hydration() { + let arr: DictionaryArray<UInt16Type> = vec!["a", "a", "b"].into_iter().collect(); + let schema = Arc::new(Schema::new(vec![Field::new_dictionary( + "dict", + DataType::UInt16, + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap(); + let encoder = + FlightDataEncoderBuilder::default().build(futures::stream::once(async { Ok(batch) })); + let mut decoder = FlightDataDecoder::new(encoder); + let expected_schema = Schema::new(vec![Field::new("dict", DataType::Utf8, false)]); + let expected_schema = Arc::new(expected_schema); + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, expected_schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), expected_schema); + let expected_array = StringArray::from(vec!["a", "a", "b"]); + let actual_array =
b.column_by_name("dict").unwrap(); + let actual_array = downcast_array::<StringArray>(actual_array); + + assert_eq!(actual_array, expected_array); + } + } + } + } + + #[tokio::test] + async fn test_send_dictionaries() { + let schema = Arc::new(Schema::new(vec![Field::new_dictionary( + "dict", + DataType::UInt16, + DataType::Utf8, + false, + )])); + + let arr_one: Arc<DictionaryArray<UInt16Type>> = + Arc::new(vec!["a", "a", "b"].into_iter().collect()); + let arr_two: Arc<DictionaryArray<UInt16Type>> = + Arc::new(vec!["b", "a", "c"].into_iter().collect()); + let batch_one = RecordBatch::try_new(schema.clone(), vec![arr_one.clone()]).unwrap(); + let batch_two = RecordBatch::try_new(schema.clone(), vec![arr_two.clone()]).unwrap(); + + let encoder = FlightDataEncoderBuilder::default() + .with_dictionary_handling(DictionaryHandling::Resend) + .build(futures::stream::iter(vec![Ok(batch_one), Ok(batch_two)])); + + let mut decoder = FlightDataDecoder::new(encoder); + let mut expected_array = arr_one; + while let Some(decoded) = decoder.next().await { + let decoded = decoded.unwrap(); + match decoded.payload { + DecodedPayload::None => {} + DecodedPayload::Schema(s) => assert_eq!(s, schema), + DecodedPayload::RecordBatch(b) => { + assert_eq!(b.schema(), schema); + + let actual_array = Arc::new(downcast_array::<DictionaryArray<UInt16Type>>( + b.column_by_name("dict").unwrap(), + )); + + assert_eq!(actual_array, expected_array); + + expected_array = arr_two.clone(); + } + } + } + } + #[test] fn test_schema_metadata_encoded() { - let schema = - Schema::new(vec![Field::new("data", DataType::Int32, false)]).with_metadata( - HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), - ); + let schema = Schema::new(vec![Field::new("data", DataType::Int32, false)]).with_metadata( + HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), + ); - let got = prepare_schema_for_flight(&schema); + let got = prepare_schema_for_flight(&schema, false); assert!(got.metadata().contains_key("some_key")); } @@ -540,7 +683,7 @@ ) .expect("cannot create record batch"); - prepare_batch_for_flight(&batch, batch.schema()).expect("failed to optimize"); + prepare_batch_for_flight(&batch, batch.schema(), false).expect("failed to optimize"); } pub fn make_flight_data( @@ -554,8 +697,7 @@ .encoded_batch(batch, &mut dictionary_tracker, options) .expect("DictionaryTracker configured above to not error on replacement"); - let flight_dictionaries = - encoded_dictionaries.into_iter().map(Into::into).collect(); + let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); let flight_batch = encoded_batch.into(); (flight_dictionaries, flight_batch) @@ -576,8 +718,7 @@ // split once let n_rows = max_flight_data_size + 1; assert!(n_rows % 2 == 1, "should be an odd number"); - let c = - UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::<Vec<_>>()); + let c = UInt8Array::from((0..n_rows).map(|i| (i % 256) as u8).collect::<Vec<_>>()); let batch = RecordBatch::try_from_iter(vec![("a", Arc::new(c) as ArrayRef)]) .expect("cannot create record batch"); let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size); @@ -624,8 +765,7 @@ let input_rows = batch.num_rows(); - let split = - split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); + let split = split_batch_for_grpc_response(batch.clone(), max_flight_data_size_bytes); let sizes: Vec<_> = split.iter().map(|batch| batch.num_rows()).collect(); let output_rows: usize = sizes.iter().sum(); @@ -638,8 +778,7 @@ #[tokio::test] async fn
flight_data_size_even() { - let s1 = - StringArray::from_iter_values(std::iter::repeat(".10 bytes.").take(1024)); + let s1 = StringArray::from_iter_values(std::iter::repeat(".10 bytes.").take(1024)); let i1 = Int16Array::from_iter_values(0..1024); let s2 = StringArray::from_iter_values(std::iter::repeat("6bytes").take(1024)); let i2 = Int64Array::from_iter_values(0..1024); @@ -659,8 +798,7 @@ async fn flight_data_size_uneven_variable_lengths() { // each row has a longer string than the last with increasing lengths 0 --> 1024 let array = StringArray::from_iter_values((0..1024).map(|i| "*".repeat(i))); - let batch = - RecordBatch::try_from_iter(vec![("data", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("data", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -714,8 +852,7 @@ }) .collect(); - let batch = - RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); verify_encoded_split(batch, 160).await; } @@ -725,11 +862,9 @@ // large dictionary (all distinct values ==> 1024 entries in dictionary) let values: Vec<_> = (1..1024).map(|i| "**".repeat(i)).collect(); - let array: DictionaryArray<Int32Type> = - values.iter().map(|s| Some(s.as_str())).collect(); + let array: DictionaryArray<Int32Type> = values.iter().map(|s| Some(s.as_str())).collect(); - let batch = - RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -743,8 +878,7 @@ let keys = Int32Array::from_iter_values((0..3000).map(|i| (3000 - i) % 1024)); let array = DictionaryArray::new(keys, Arc::new(values)); - let batch = - RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); + let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 @@ -760,12 +894,9 @@ // medium cardinality let values3: Vec<_> = (1..1024).map(|i| "**".repeat(i % 100)).collect(); - let array1: DictionaryArray<Int32Type> = - values1.iter().map(|s| Some(s.as_str())).collect(); - let array2: DictionaryArray<Int32Type> = - values2.iter().map(|s| Some(s.as_str())).collect(); - let array3: DictionaryArray<Int32Type> = - values3.iter().map(|s| Some(s.as_str())).collect(); + let array1: DictionaryArray<Int32Type> = values1.iter().map(|s| Some(s.as_str())).collect(); + let array2: DictionaryArray<Int32Type> = values2.iter().map(|s| Some(s.as_str())).collect(); + let array3: DictionaryArray<Int32Type> = values3.iter().map(|s| Some(s.as_str())).collect(); let batch = RecordBatch::try_from_iter(vec![ ("a1", Arc::new(array1) as _), @@ -785,17 +916,13 @@ .flight_descriptor .as_ref() .map(|descriptor| { - let path_len: usize = - descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); + let path_len: usize = descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); std::mem::size_of_val(descriptor) + descriptor.cmd.len() + path_len }) .unwrap_or(0); - flight_descriptor_size - + d.app_metadata.len() - + d.data_body.len() - + d.data_header.len() + flight_descriptor_size + d.app_metadata.len() + d.data_body.len() + d.data_header.len() } /// Coverage for diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 04edf266389c..8d05f658703a 100644 ---
a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -133,10 +133,7 @@ pub struct IpcMessage(pub Bytes); // Useful conversion functions -fn flight_schema_as_encoded_data( - arrow_schema: &Schema, - options: &IpcWriteOptions, -) -> EncodedData { +fn flight_schema_as_encoded_data(arrow_schema: &Schema, options: &IpcWriteOptions) -> EncodedData { let data_gen = writer::IpcDataGenerator::default(); data_gen.schema_to_bytes(arrow_schema, options) } @@ -316,16 +313,6 @@ impl TryFrom> for SchemaResult { } } -// TryFrom... - -impl TryFrom for DescriptorType { - type Error = ArrowError; - - fn try_from(value: i32) -> ArrowResult { - value.try_into() - } -} - impl TryFrom> for IpcMessage { type Error = ArrowError; diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index b2137d8543d3..c7c23311e61e 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -1077,10 +1077,10 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported commands. /// /// For instance: - /// - return 0 (\b0) => [] (GROUP BY is unsupported); + /// - return 0 (\b0) => \[\] (GROUP BY is unsupported); /// - return 1 (\b1) => \[SQL_GROUP_BY_UNRELATED\]; /// - return 2 (\b10) => \[SQL_GROUP_BY_BEYOND_SELECT\]; - /// - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. + /// - return 3 (\b11) => \[SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT\]. /// Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlSupportedGroupBy = 522, /// @@ -1104,14 +1104,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported grammar levels. /// /// For instance: - /// - return 0 (\b0) => [] (SQL grammar is unsupported); + /// - return 0 (\b0) => \[\] (SQL grammar is unsupported); /// - return 1 (\b1) => \[SQL_MINIMUM_GRAMMAR\]; /// - return 2 (\b10) => \[SQL_CORE_GRAMMAR\]; - /// - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; + /// - return 3 (\b11) => \[SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR\]; /// - return 4 (\b100) => \[SQL_EXTENDED_GRAMMAR\]; - /// - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. + /// - return 5 (\b101) => \[SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR\]; + /// - return 6 (\b110) => \[SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR\]; + /// - return 7 (\b111) => \[SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR\]. /// Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlSupportedGrammar = 525, /// @@ -1121,14 +1121,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported commands. 
/// /// For instance: - /// - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); + /// - return 0 (\b0) => \[\] (ANSI92 SQL grammar is unsupported); /// - return 1 (\b1) => \[ANSI92_ENTRY_SQL\]; /// - return 2 (\b10) => \[ANSI92_INTERMEDIATE_SQL\]; - /// - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; + /// - return 3 (\b11) => \[ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL\]; /// - return 4 (\b100) => \[ANSI92_FULL_SQL\]; - /// - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; - /// - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; - /// - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. + /// - return 5 (\b101) => \[ANSI92_ENTRY_SQL, ANSI92_FULL_SQL\]; + /// - return 6 (\b110) => \[ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL\]; + /// - return 7 (\b111) => \[ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL\]. /// Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlAnsi92SupportedLevel = 526, /// @@ -1165,14 +1165,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. /// /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL schema); + /// - return 0 (\b0) => \[\] (no supported actions for SQL schema); /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 3 (\b11) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// - return 5 (\b101) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 6 (\b110) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 7 (\b111) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]. /// Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlSchemasSupportedActions = 533, /// @@ -1182,14 +1182,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL catalog); + /// - return 0 (\b0) => \[\] (no supported actions for SQL catalog); /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 3 (\b11) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// - return 5 (\b101) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 6 (\b110) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 7 (\b111) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]. /// Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlCatalogsSupportedActions = 534, /// @@ -1199,10 +1199,10 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 0 (\b0) => \[\] (no supported SQL positioned commands); /// - return 1 (\b1) => \[SQL_POSITIONED_DELETE\]; /// - return 2 (\b10) => \[SQL_POSITIONED_UPDATE\]; - /// - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. + /// - return 3 (\b11) => \[SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE\]. /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. SqlSupportedPositionedCommands = 535, /// @@ -1227,22 +1227,22 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL subqueries); + /// - return 0 (\b0) => \[\] (no supported SQL subqueries); /// - return 1 (\b1) => \[SQL_SUBQUERIES_IN_COMPARISONS\]; /// - return 2 (\b10) => \[SQL_SUBQUERIES_IN_EXISTS\]; - /// - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; + /// - return 3 (\b11) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS\]; /// - return 4 (\b100) => \[SQL_SUBQUERIES_IN_INS\]; - /// - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; - /// - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; - /// - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; + /// - return 5 (\b101) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS\]; + /// - return 6 (\b110) => \[SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS\]; + /// - return 7 (\b111) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS\]; /// - return 8 (\b1000) => \[SQL_SUBQUERIES_IN_QUANTIFIEDS\]; - /// - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 9 (\b1001) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 10 (\b1010) => \[SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 11 (\b1011) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 12 (\b1100) => \[SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 13 (\b1101) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 14 (\b1110) => \[SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 15 (\b1111) => \[SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS\]; /// - ... /// Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlSupportedSubqueries = 538, @@ -1260,10 +1260,10 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 0 (\b0) => \[\] (no supported SQL positioned commands); /// - return 1 (\b1) => \[SQL_UNION\]; /// - return 2 (\b10) => \[SQL_UNION_ALL\]; - /// - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. + /// - return 3 (\b11) => \[SQL_UNION, SQL_UNION_ALL\]. /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. SqlSupportedUnions = 540, /// Retrieves a int64 value representing the maximum number of hex characters allowed in an inline binary literal. 
@@ -1341,22 +1341,22 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. /// /// For instance: - /// - return 0 (\b0) => [] (no supported SQL transactions isolation levels); + /// - return 0 (\b0) => \[\] (no supported SQL transactions isolation levels); /// - return 1 (\b1) => \[SQL_TRANSACTION_NONE\]; /// - return 2 (\b10) => \[SQL_TRANSACTION_READ_UNCOMMITTED\]; - /// - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; + /// - return 3 (\b11) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED\]; /// - return 4 (\b100) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 5 (\b101) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 6 (\b110) => \[SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 7 (\b111) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; /// - return 8 (\b1000) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 9 (\b1001) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 10 (\b1010) => \[SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 11 (\b1011) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 12 (\b1100) => \[SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 13 (\b1101) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 14 (\b1110) => \[SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 15 (\b1111) => \[SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ\]; /// - return 16 (\b10000) => \[SQL_TRANSACTION_SERIALIZABLE\]; /// - ... /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. @@ -1381,14 +1381,14 @@ pub enum SqlInfo { /// The returned bitmask should be parsed in order to retrieve the supported result set types. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported result set types); + /// - return 0 (\b0) => \[\] (no supported result set types); /// - return 1 (\b1) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED\]; /// - return 2 (\b10) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; - /// - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + /// - return 3 (\b11) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; /// - return 4 (\b100) => \[SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; - /// - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 5 (\b101) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; + /// - return 6 (\b110) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; + /// - return 7 (\b111) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; /// - return 8 (\b1000) => \[SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE\]; /// - ... /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. @@ -1398,14 +1398,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. /// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetUnspecified = 568, /// @@ -1413,14 +1413,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetForwardOnly = 569, /// @@ -1428,14 +1428,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. /// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollSensitive = 570, /// @@ -1443,14 +1443,14 @@ pub enum SqlInfo { /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. 
/// /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 0 (\b0) => \[\] (no supported concurrency types for this result set type) /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 3 (\b11) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 5 (\b101) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 6 (\b110) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 7 (\b111) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollInsensitive = 571, /// diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index 4b1f38ebcbb7..133df5b044cf 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -24,20 +24,23 @@ use std::collections::HashMap; use std::str::FromStr; use tonic::metadata::AsciiMetadataKey; +use crate::decode::FlightRecordBatchStream; +use crate::encode::FlightDataEncoderBuilder; +use crate::error::FlightError; use crate::flight_service_client::FlightServiceClient; use crate::sql::server::{CLOSE_PREPARED_STATEMENT, CREATE_PREPARED_STATEMENT}; use crate::sql::{ ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, - ActionCreatePreparedStatementResult, Any, CommandGetCatalogs, - CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandStatementQuery, CommandStatementUpdate, - DoPutUpdateResult, ProstMessageExt, SqlInfo, + ActionCreatePreparedStatementResult, Any, CommandGetCatalogs, CommandGetCrossReference, + CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, + CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, + CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, + CommandStatementUpdate, DoPutUpdateResult, ProstMessageExt, SqlInfo, }; +use crate::trailers::extract_lazy_trailers; use crate::{ - Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, PutResult, Ticket, + Action, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, + IpcMessage, PutResult, Ticket, }; use arrow_array::RecordBatch; use arrow_buffer::Buffer; @@ -130,11 +133,7 @@ impl FlightSqlServiceClient { /// Perform a `handshake` with the server, passing credentials and establishing a session /// Returns arbitrary auth/handshake info binary blob - pub 
async fn handshake( - &mut self, - username: &str, - password: &str, - ) -> Result<Bytes, ArrowError> { + pub async fn handshake(&mut self, username: &str, password: &str) -> Result<Bytes, ArrowError> { let cmd = HandshakeRequest { protocol_version: 0, payload: Default::default(), @@ -152,9 +151,9 @@ impl FlightSqlServiceClient<Channel> { .await .map_err(|e| ArrowError::IpcError(format!("Can't handshake {e}")))?; if let Some(auth) = resp.metadata().get("authorization") { - let auth = auth.to_str().map_err(|_| { - ArrowError::ParseError("Can't read auth header".to_string()) - })?; + let auth = auth + .to_str() + .map_err(|_| ArrowError::ParseError("Can't read auth header".to_string()))?; let bearer = "Bearer "; if !auth.starts_with(bearer) { Err(ArrowError::ParseError("Invalid auth header!".to_string()))?; @@ -162,10 +161,11 @@ impl FlightSqlServiceClient<Channel> { let auth = auth[bearer.len()..].to_string(); self.token = Some(auth); } - let responses: Vec<HandshakeResponse> = - resp.into_inner().try_collect().await.map_err(|_| { - ArrowError::ParseError("Can't collect responses".to_string()) - })?; + let responses: Vec<HandshakeResponse> = resp + .into_inner() + .try_collect() + .await + .map_err(|_| ArrowError::ParseError("Can't collect responses".to_string()))?; let resp = match responses.as_slice() { [resp] => resp.payload.clone(), [] => Bytes::new(), @@ -205,8 +205,7 @@ impl FlightSqlServiceClient<Channel> { .await .map_err(status_to_arrow_error)? .unwrap(); - let any = - Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; + let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; let result: DoPutUpdateResult = any.unpack()?.unwrap(); Ok(result.record_count) } @@ -229,14 +228,22 @@ impl FlightSqlServiceClient<Channel> { pub async fn do_get( &mut self, ticket: impl IntoRequest<Ticket>, - ) -> Result<Streaming<FlightData>, ArrowError> { + ) -> Result<FlightRecordBatchStream, ArrowError> { let req = self.set_request_headers(ticket.into_request())?; - Ok(self + + let (md, response_stream, _ext) = self .flight_client .do_get(req) .await .map_err(status_to_arrow_error)? - .into_inner()) + .into_parts(); + let (response_stream, trailers) = extract_lazy_trailers(response_stream); + + Ok(FlightRecordBatchStream::new_from_flight_data( + response_stream.map_err(FlightError::Tonic), + ) + .with_headers(md) + .with_trailers(trailers)) } /// Push a stream to the flight service associated with a particular flight stream. @@ -393,17 +400,13 @@ impl FlightSqlServiceClient<Channel> { ArrowError::ParseError(format!("Cannot convert header key \"{k}\": {e}")) })?; let v = v.parse().map_err(|e| { - ArrowError::ParseError(format!( - "Cannot convert header value \"{v}\": {e}" - )) + ArrowError::ParseError(format!("Cannot convert header value \"{v}\": {e}")) })?; req.metadata_mut().insert(k, v); } if let Some(token) = &self.token { let val = format!("Bearer {token}").parse().map_err(|e| { - ArrowError::ParseError(format!( - "Cannot convert token to header value: {e}" - )) + ArrowError::ParseError(format!("Cannot convert token to header value: {e}")) })?; req.metadata_mut().insert("authorization", val); } @@ -439,9 +442,12 @@ impl PreparedStatement<Channel> { /// Executes the prepared statement query on the server. pub async fn execute(&mut self) -> Result<FlightInfo, ArrowError> { + self.write_bind_params().await?; + let cmd = CommandPreparedStatementQuery { prepared_statement_handle: self.handle.clone(), }; + let result = self .flight_sql_client .get_flight_info_for_command(cmd)
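A sketch of consuming the new `do_get` return type (connection and ticket acquisition are elided, and the boxed error type is only to keep the sketch short): headers are readable immediately, while trailers only materialize once the stream has been fully drained.

```rust
use arrow_flight::sql::client::FlightSqlServiceClient;
use arrow_flight::Ticket;
use futures::TryStreamExt;
use tonic::transport::Channel;

async fn fetch(
    client: &mut FlightSqlServiceClient<Channel>,
    ticket: Ticket,
) -> Result<(), Box<dyn std::error::Error>> {
    let mut stream = client.do_get(ticket).await?;

    // Response headers are available as soon as the call returns...
    println!("headers: {:?}", stream.headers());

    // ...the stream itself yields decoded RecordBatches...
    let batches: Vec<_> = (&mut stream).try_collect().await?;
    println!("read {} batches", batches.len());

    // ...and trailers appear only after the stream is exhausted.
    if let Some(trailers) = stream.trailers() {
        println!("trailers: {trailers:?}");
    }
    Ok(())
}
```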
pub async fn execute_update(&mut self) -> Result { - let cmd = CommandPreparedStatementQuery { + self.write_bind_params().await?; + + let cmd = CommandPreparedStatementUpdate { prepared_statement_handle: self.handle.clone(), }; let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); @@ -467,8 +475,7 @@ .await .map_err(status_to_arrow_error)? .unwrap(); - let any = - Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; + let any = Any::decode(&*result.app_metadata).map_err(decode_error_to_arrow_error)?; let result: DoPutUpdateResult = any.unpack()?.unwrap(); Ok(result.record_count) } @@ -484,14 +491,41 @@ } /// Set a RecordBatch that contains the parameters that will be bound. - pub fn set_parameters( - &mut self, - parameter_binding: RecordBatch, - ) -> Result<(), ArrowError> { + pub fn set_parameters(&mut self, parameter_binding: RecordBatch) -> Result<(), ArrowError> { self.parameter_binding = Some(parameter_binding); Ok(()) } + /// Submit parameters to the server, if any have been set on this prepared statement instance + async fn write_bind_params(&mut self) -> Result<(), ArrowError> { + if let Some(ref params_batch) = self.parameter_binding { + let cmd = CommandPreparedStatementQuery { + prepared_statement_handle: self.handle.clone(), + }; + + let descriptor = FlightDescriptor::new_cmd(cmd.as_any().encode_to_vec()); + let flight_stream_builder = FlightDataEncoderBuilder::new() + .with_flight_descriptor(Some(descriptor)) + .with_schema(params_batch.schema()); + let flight_data = flight_stream_builder + .build(futures::stream::iter( + self.parameter_binding.clone().map(Ok), + )) + .try_collect::>() + .await + .map_err(flight_error_to_arrow_error)?; + + self.flight_sql_client + .do_put(stream::iter(flight_data)) + .await? + .try_collect::>() + .await + .map_err(status_to_arrow_error)?; + } + + Ok(()) + } + /// Close the prepared statement, so that this PreparedStatement can no longer be used /// and the server can free up any resources.
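The new `write_bind_params` means bound parameters are actually shipped to the server (as a `do_put` of an encoded parameter batch) before the query or update command is issued, and `execute_update` now sends `CommandPreparedStatementUpdate` rather than a query command. A sketch of the caller-side flow, assuming the `Channel`-parameterized types and a `($1: Utf8, $2: Int64)` parameter schema like the one in the CLI test further down:

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray};
use arrow_flight::sql::client::PreparedStatement;
use arrow_schema::ArrowError;
use tonic::transport::Channel;

async fn bind_and_run(stmt: &mut PreparedStatement<Channel>) -> Result<i64, ArrowError> {
    // One row of parameter values; the batch schema must line up with the
    // parameter schema the server reported for this statement.
    let params = RecordBatch::try_from_iter(vec![
        ("$1", Arc::new(StringArray::from(vec!["Hello"])) as ArrayRef),
        ("$2", Arc::new(Int64Array::from(vec![42])) as ArrayRef),
    ])?;
    stmt.set_parameters(params)?;

    // write_bind_params() uploads the bound batch via do_put before
    // CommandPreparedStatementUpdate is issued.
    stmt.execute_update().await
}
```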
pub async fn close(mut self) -> Result<(), ArrowError> { @@ -515,6 +549,13 @@ fn status_to_arrow_error(status: tonic::Status) -> ArrowError { ArrowError::IpcError(format!("{status:?}")) } +fn flight_error_to_arrow_error(err: FlightError) -> ArrowError { + match err { + FlightError::Arrow(e) => e, + e => ArrowError::ExternalError(Box::new(e)), + } +} + // A polymorphic structure to natively represent different types of data contained in `FlightData` pub enum ArrowFlightData { RecordBatch(RecordBatch), @@ -526,19 +567,16 @@ pub fn arrow_data_from_flight_data( flight_data: FlightData, arrow_schema_ref: &SchemaRef, ) -> Result { - let ipc_message = root_as_message(&flight_data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) - })?; + let ipc_message = root_as_message(&flight_data.data_header[..]) + .map_err(|err| ArrowError::ParseError(format!("Unable to get root as message: {err:?}")))?; match ipc_message.header_type() { MessageHeader::RecordBatch => { - let ipc_record_batch = - ipc_message.header_as_record_batch().ok_or_else(|| { - ArrowError::ComputeError( - "Unable to convert flight data header to a record batch" - .to_string(), - ) - })?; + let ipc_record_batch = ipc_message.header_as_record_batch().ok_or_else(|| { + ArrowError::ComputeError( + "Unable to convert flight data header to a record batch".to_string(), + ) + })?; let dictionaries_by_field = HashMap::new(); let record_batch = read_record_batch( @@ -564,13 +602,11 @@ pub fn arrow_data_from_flight_data( MessageHeader::DictionaryBatch => { let _ = ipc_message.header_as_dictionary_batch().ok_or_else(|| { ArrowError::ComputeError( - "Unable to convert flight data header to a dictionary batch" - .to_string(), + "Unable to convert flight data header to a dictionary batch".to_string(), ) })?; Err(ArrowError::NotYetImplemented( - "no idea on how to convert an ipc dictionary batch to an arrow type" - .to_string(), + "no idea on how to convert an ipc dictionary batch to an arrow type".to_string(), )) } MessageHeader::Tensor => { @@ -590,8 +626,7 @@ pub fn arrow_data_from_flight_data( ) })?; Err(ArrowError::NotYetImplemented( - "no idea on how to convert an ipc sparse tensor to an arrow type" - .to_string(), + "no idea on how to convert an ipc sparse tensor to an arrow type".to_string(), )) } _ => Err(ArrowError::ComputeError(format!( diff --git a/arrow-flight/src/sql/metadata/db_schemas.rs b/arrow-flight/src/sql/metadata/db_schemas.rs index 642802b058d5..303d11cd74ca 100644 --- a/arrow-flight/src/sql/metadata/db_schemas.rs +++ b/arrow-flight/src/sql/metadata/db_schemas.rs @@ -95,11 +95,7 @@ impl GetDbSchemasBuilder { /// Append a row /// /// In case the catalog should be considered as empty, pass in an empty string '""'. 
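For context on the `append` signature being reflowed here: `GetDbSchemasBuilder` is the server-side helper for answering `CommandGetDbSchemas`. A sketch of typical usage, assuming an `into_builder` convenience on the command (as with the other metadata builders in this module); the catalog and schema names are placeholders:

```rust
use arrow_array::RecordBatch;
use arrow_flight::sql::metadata::GetDbSchemasBuilder;
use arrow_flight::sql::CommandGetDbSchemas;

// Build the response batch for a CommandGetDbSchemas request; `build`
// applies the command's catalog/schema filters and sorts the rows.
fn db_schemas_response(cmd: CommandGetDbSchemas) -> RecordBatch {
    let mut builder: GetDbSchemasBuilder = cmd.into_builder();
    builder.append("a_catalog", "a_schema");
    builder.append("", "b_schema"); // "" marks a row whose catalog is empty
    builder.build().expect("valid metadata batch")
}
```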
- pub fn append( - &mut self, - catalog_name: impl AsRef, - schema_name: impl AsRef, - ) { + pub fn append(&mut self, catalog_name: impl AsRef, schema_name: impl AsRef) { self.catalog_name.append_value(catalog_name); self.db_schema_name.append_value(schema_name); } diff --git a/arrow-flight/src/sql/metadata/mod.rs b/arrow-flight/src/sql/metadata/mod.rs index 71551f1849ae..1e9881ffa70e 100644 --- a/arrow-flight/src/sql/metadata/mod.rs +++ b/arrow-flight/src/sql/metadata/mod.rs @@ -53,7 +53,7 @@ fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array { .iter() .map(|a| SortField::new(a.data_type().clone())) .collect(); - let mut converter = RowConverter::new(fields).unwrap(); + let converter = RowConverter::new(fields).unwrap(); let rows = converter.convert_columns(arrays).unwrap(); let mut sort: Vec<_> = rows.iter().enumerate().collect(); sort.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); diff --git a/arrow-flight/src/sql/metadata/sql_info.rs b/arrow-flight/src/sql/metadata/sql_info.rs index 88c97227814d..d4584f4a6827 100644 --- a/arrow-flight/src/sql/metadata/sql_info.rs +++ b/arrow-flight/src/sql/metadata/sql_info.rs @@ -30,8 +30,8 @@ use std::sync::Arc; use arrow_arith::boolean::or; use arrow_array::array::{Array, UInt32Array, UnionArray}; use arrow_array::builder::{ - ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - MapBuilder, StringBuilder, UInt32Builder, + ArrayBuilder, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, MapBuilder, + StringBuilder, UInt32Builder, }; use arrow_array::{RecordBatch, Scalar}; use arrow_data::ArrayData; @@ -184,11 +184,7 @@ static UNION_TYPE: Lazy = Lazy::new(|| { Field::new("keys", DataType::Int32, false), Field::new( "values", - DataType::List(Arc::new(Field::new( - "item", - DataType::Int32, - true, - ))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), ])), @@ -420,10 +416,7 @@ pub struct SqlInfoData { impl SqlInfoData { /// Return a [`RecordBatch`] containing only the requested `u32`, if any /// from [`CommandGetSqlInfo`] - pub fn record_batch( - &self, - info: impl IntoIterator, - ) -> Result { + pub fn record_batch(&self, info: impl IntoIterator) -> Result { let arr = self.batch.column(0); let type_filter = info .into_iter() @@ -493,9 +486,7 @@ mod tests { use super::SqlInfoDataBuilder; use crate::sql::metadata::tests::assert_batches_eq; - use crate::sql::{ - SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert, - }; + use crate::sql::{SqlInfo, SqlNullOrdering, SqlSupportedTransaction, SqlSupportsConvert}; #[test] fn test_sql_infos() { diff --git a/arrow-flight/src/sql/metadata/tables.rs b/arrow-flight/src/sql/metadata/tables.rs index 00502a76db53..7ffb76fa1d5f 100644 --- a/arrow-flight/src/sql/metadata/tables.rs +++ b/arrow-flight/src/sql/metadata/tables.rs @@ -329,12 +329,12 @@ mod tests { "b_catalog", ])) as ArrayRef, Arc::new(StringArray::from(vec![ - "a_schema", "a_schema", "b_schema", "b_schema", "a_schema", - "a_schema", "b_schema", "b_schema", + "a_schema", "a_schema", "b_schema", "b_schema", "a_schema", "a_schema", + "b_schema", "b_schema", ])) as ArrayRef, Arc::new(StringArray::from(vec![ - "a_table", "b_table", "a_table", "b_table", "a_table", "a_table", - "b_table", "b_table", + "a_table", "b_table", "a_table", "b_table", "a_table", "a_table", "b_table", + "b_table", ])) as ArrayRef, Arc::new(StringArray::from(vec![ "TABLE", "TABLE", "TABLE", "TABLE", "TABLE", "VIEW", "TABLE", "VIEW", diff --git 
a/arrow-flight/src/sql/metadata/xdbc_info.rs b/arrow-flight/src/sql/metadata/xdbc_info.rs index 8212c847a4fa..2e635d3037bc 100644 --- a/arrow-flight/src/sql/metadata/xdbc_info.rs +++ b/arrow-flight/src/sql/metadata/xdbc_info.rs @@ -36,9 +36,7 @@ use once_cell::sync::Lazy; use super::lexsort_to_indices; use crate::error::*; -use crate::sql::{ - CommandGetXdbcTypeInfo, Nullable, Searchable, XdbcDataType, XdbcDatetimeSubcode, -}; +use crate::sql::{CommandGetXdbcTypeInfo, Nullable, Searchable, XdbcDataType, XdbcDatetimeSubcode}; /// Data structure representing type information for xdbc types. #[derive(Debug, Clone, Default)] @@ -201,8 +199,7 @@ impl XdbcTypeInfoDataBuilder { minimum_scale_builder.append_option(info.minimum_scale); maximum_scale_builder.append_option(info.maximum_scale); sql_data_type_builder.append_value(info.sql_data_type as i32); - datetime_subcode_builder - .append_option(info.datetime_subcode.map(|code| code as i32)); + datetime_subcode_builder.append_option(info.datetime_subcode.map(|code| code as i32)); num_prec_radix_builder.append_option(info.num_prec_radix); interval_precision_builder.append_option(info.interval_precision); }); @@ -215,8 +212,7 @@ impl XdbcTypeInfoDataBuilder { let (field, offsets, values, nulls) = create_params_builder.finish().into_parts(); // Re-defined the field to be non-nullable let new_field = Arc::new(field.as_ref().clone().with_nullable(false)); - let create_params = - Arc::new(ListArray::new(new_field, offsets, values, nulls)) as ArrayRef; + let create_params = Arc::new(ListArray::new(new_field, offsets, values, nulls)) as ArrayRef; let nullable = Arc::new(nullable_builder.finish()); let case_sensitive = Arc::new(case_sensitive_builder.finish()); let searchable = Arc::new(searchable_builder.finish()); diff --git a/arrow-flight/src/sql/mod.rs b/arrow-flight/src/sql/mod.rs index 4bb8ce8b36e5..97645ae7840d 100644 --- a/arrow-flight/src/sql/mod.rs +++ b/arrow-flight/src/sql/mod.rs @@ -93,6 +93,7 @@ pub use gen::SqlSupportedTransactions; pub use gen::SqlSupportedUnions; pub use gen::SqlSupportsConvert; pub use gen::SqlTransactionIsolationLevel; +pub use gen::SubstraitPlan; pub use gen::SupportedSqlGrammar; pub use gen::TicketStatementQuery; pub use gen::UpdateDeleteRules; @@ -294,9 +295,8 @@ impl Any { if !self.is::() { return Ok(None); } - let m = Message::decode(&*self.value).map_err(|err| { - ArrowError::ParseError(format!("Unable to decode Any value: {err}")) - })?; + let m = Message::decode(&*self.value) + .map_err(|err| ArrowError::ParseError(format!("Unable to decode Any value: {err}")))?; Ok(Some(m)) } diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 102d97105a2e..f1656aca882a 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -19,28 +19,26 @@ use std::pin::Pin; -use futures::Stream; +use futures::{stream::Peekable, Stream, StreamExt}; use prost::Message; use tonic::{Request, Response, Status, Streaming}; use super::{ - ActionBeginSavepointRequest, ActionBeginSavepointResult, - ActionBeginTransactionRequest, ActionBeginTransactionResult, - ActionCancelQueryRequest, ActionCancelQueryResult, + ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, + ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, - ActionEndSavepointRequest, ActionEndTransactionRequest, Any, 
Command, - CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, - CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, - CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, CommandStatementQuery, - CommandStatementSubstraitPlan, CommandStatementUpdate, DoPutUpdateResult, - ProstMessageExt, SqlInfo, TicketStatementQuery, + ActionEndSavepointRequest, ActionEndTransactionRequest, Any, Command, CommandGetCatalogs, + CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, + CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, + CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, + CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, + DoPutUpdateResult, ProstMessageExt, SqlInfo, TicketStatementQuery, }; use crate::{ - flight_service_server::FlightService, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, - PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, + Ticket, }; pub(crate) static CREATE_PREPARED_STATEMENT: &str = "CreatePreparedStatement"; @@ -227,6 +225,18 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { )) } + /// Implementors may override to handle additional calls to get_flight_info() + async fn get_flight_info_fallback( + &self, + cmd: Command, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented(format!( + "get_flight_info: The defined request is invalid: {}", + cmd.type_url() + ))) + } + // do_get /// Get a FlightDataStream containing the query results. 
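The new `get_flight_info_fallback` hook lets implementations handle command types the built-in dispatch does not recognize, instead of the previous hard-coded `unimplemented` error. A minimal sketch of overriding it; only `register_sql_info` is otherwise required since every other handler has a default, and a real service would implement those too:

```rust
use arrow_flight::sql::server::FlightSqlService;
use arrow_flight::sql::{Command, SqlInfo};
use arrow_flight::{FlightDescriptor, FlightInfo};
use tonic::{Request, Response, Status};

#[derive(Clone)]
struct MyFlightSqlService {}

#[tonic::async_trait]
impl FlightSqlService for MyFlightSqlService {
    type FlightService = MyFlightSqlService;

    // Called for any Command the get_flight_info dispatch table does not
    // recognize; previously such commands always failed with `unimplemented`.
    async fn get_flight_info_fallback(
        &self,
        cmd: Command,
        _request: Request<FlightDescriptor>,
    ) -> Result<Response<FlightInfo>, Status> {
        Err(Status::invalid_argument(format!(
            "this server has no handler for command {}",
            cmd.type_url()
        )))
    }

    async fn register_sql_info(&self, _id: i32, _result: &SqlInfo) {}
}
```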
@@ -366,7 +376,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { /// Implementors may override to handle additional calls to do_put() async fn do_put_fallback( &self, - _request: Request>, + _request: Request, message: Any, ) -> Result::DoPutStream>, Status> { Err(Status::unimplemented(format!( @@ -379,7 +389,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_statement_update has no default implementation", @@ -390,7 +400,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Request>, + _request: Request, ) -> Result::DoPutStream>, Status> { Err(Status::unimplemented( "do_put_prepared_statement_query has no default implementation", @@ -401,7 +411,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_prepared_statement_update has no default implementation", @@ -412,7 +422,7 @@ pub trait FlightSqlService: Sync + Send + Sized + 'static { async fn do_put_substrait_plan( &self, _query: CommandStatementSubstraitPlan, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_substrait_plan has no default implementation", @@ -549,13 +559,10 @@ where Pin> + Send + 'static>>; type ListFlightsStream = Pin> + Send + 'static>>; - type DoGetStream = - Pin> + Send + 'static>>; - type DoPutStream = - Pin> + Send + 'static>>; - type DoActionStream = Pin< - Box> + Send + 'static>, - >; + type DoGetStream = Pin> + Send + 'static>>; + type DoPutStream = Pin> + Send + 'static>>; + type DoActionStream = + Pin> + Send + 'static>>; type ListActionsStream = Pin> + Send + 'static>>; type DoExchangeStream = @@ -580,8 +587,7 @@ where &self, request: Request, ) -> Result, Status> { - let message = - Any::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; + let message = Any::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; match Command::try_from(message).map_err(arrow_error_to_status)? { Command::CommandStatementQuery(token) => { @@ -600,9 +606,7 @@ where Command::CommandGetDbSchemas(token) => { return self.get_flight_info_schemas(token, request).await } - Command::CommandGetTables(token) => { - self.get_flight_info_tables(token, request).await - } + Command::CommandGetTables(token) => self.get_flight_info_tables(token, request).await, Command::CommandGetTableTypes(token) => { self.get_flight_info_table_types(token, request).await } @@ -624,10 +628,7 @@ where Command::CommandGetXdbcTypeInfo(token) => { self.get_flight_info_xdbc_type_info(token, request).await } - cmd => Err(Status::unimplemented(format!( - "get_flight_info: The defined request is invalid: {}", - cmd.type_url() - ))), + cmd => self.get_flight_info_fallback(cmd, request).await, } } @@ -642,31 +643,21 @@ where &self, request: Request, ) -> Result, Status> { - let msg: Any = Message::decode(&*request.get_ref().ticket) - .map_err(decode_error_to_status)?; + let msg: Any = + Message::decode(&*request.get_ref().ticket).map_err(decode_error_to_status)?; match Command::try_from(msg).map_err(arrow_error_to_status)? 
{ - Command::TicketStatementQuery(command) => { - self.do_get_statement(command, request).await - } + Command::TicketStatementQuery(command) => self.do_get_statement(command, request).await, Command::CommandPreparedStatementQuery(command) => { self.do_get_prepared_statement(command, request).await } - Command::CommandGetCatalogs(command) => { - self.do_get_catalogs(command, request).await - } - Command::CommandGetDbSchemas(command) => { - self.do_get_schemas(command, request).await - } - Command::CommandGetTables(command) => { - self.do_get_tables(command, request).await - } + Command::CommandGetCatalogs(command) => self.do_get_catalogs(command, request).await, + Command::CommandGetDbSchemas(command) => self.do_get_schemas(command, request).await, + Command::CommandGetTables(command) => self.do_get_tables(command, request).await, Command::CommandGetTableTypes(command) => { self.do_get_table_types(command, request).await } - Command::CommandGetSqlInfo(command) => { - self.do_get_sql_info(command, request).await - } + Command::CommandGetSqlInfo(command) => self.do_get_sql_info(command, request).await, Command::CommandGetPrimaryKeys(command) => { self.do_get_primary_keys(command, request).await } @@ -688,11 +679,19 @@ where async fn do_put( &self, - mut request: Request>, + request: Request>, ) -> Result, Status> { - let cmd = request.get_mut().message().await?.unwrap(); - let message = Any::decode(&*cmd.flight_descriptor.unwrap().cmd) - .map_err(decode_error_to_status)?; + // See issue #4658: https://github.com/apache/arrow-rs/issues/4658 + // To dispatch to the correct `do_put` method, we cannot discard the first message, + // as it may contain the Arrow schema, which the `do_put` handler may need. + // To allow the first message to be reused by the `do_put` handler, + // we wrap this stream in a `Peekable` one, which allows us to peek at + // the first message without discarding it. + let mut request = request.map(PeekableFlightDataStream::new); + let cmd = Pin::new(request.get_mut()).peek().await.unwrap().clone()?; + + let message = + Any::decode(&*cmd.flight_descriptor.unwrap().cmd).map_err(decode_error_to_status)?; match Command::try_from(message).map_err(arrow_error_to_status)? 
{ Command::CommandStatementUpdate(command) => { let record_count = self.do_put_statement_update(command, request).await?; @@ -747,11 +746,10 @@ where }; let create_prepared_substrait_plan_action_type = ActionType { r#type: CREATE_PREPARED_SUBSTRAIT_PLAN.to_string(), - description: - "Creates a reusable prepared substrait plan resource on the server.\n + description: "Creates a reusable prepared substrait plan resource on the server.\n Request Message: ActionCreatePreparedSubstraitPlanRequest\n Response Message: ActionCreatePreparedStatementResult" - .into(), + .into(), }; let begin_transaction_action_type = ActionType { r#type: BEGIN_TRANSACTION.to_string(), @@ -812,8 +810,7 @@ where request: Request, ) -> Result, Status> { if request.get_ref().r#type == CREATE_PREPARED_STATEMENT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedStatementRequest = any .unpack() @@ -831,8 +828,7 @@ where })]); return Ok(Response::new(Box::pin(output))); } else if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionClosePreparedStatementRequest = any .unpack() @@ -846,8 +842,7 @@ where .await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == CREATE_PREPARED_SUBSTRAIT_PLAN { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedSubstraitPlanRequest = any .unpack() @@ -861,47 +856,38 @@ where .await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == BEGIN_TRANSACTION { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionBeginTransactionRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionBeginTransactionRequest.", - ) - })?; + Status::invalid_argument("Unable to unpack ActionBeginTransactionRequest.") + })?; let stmt = self.do_action_begin_transaction(cmd, request).await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { body: stmt.as_any().encode_to_vec().into(), })]); return Ok(Response::new(Box::pin(output))); } else if request.get_ref().r#type == END_TRANSACTION { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionEndTransactionRequest = any .unpack() .map_err(arrow_error_to_status)? 
.ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionEndTransactionRequest.", - ) + Status::invalid_argument("Unable to unpack ActionEndTransactionRequest.") })?; self.do_action_end_transaction(cmd, request).await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == BEGIN_SAVEPOINT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionBeginSavepointRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionBeginSavepointRequest.", - ) + Status::invalid_argument("Unable to unpack ActionBeginSavepointRequest.") })?; let stmt = self.do_action_begin_savepoint(cmd, request).await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { @@ -909,22 +895,18 @@ where })]); return Ok(Response::new(Box::pin(output))); } else if request.get_ref().r#type == END_SAVEPOINT { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionEndSavepointRequest = any .unpack() .map_err(arrow_error_to_status)? .ok_or_else(|| { - Status::invalid_argument( - "Unable to unpack ActionEndSavepointRequest.", - ) + Status::invalid_argument("Unable to unpack ActionEndSavepointRequest.") })?; self.do_action_end_savepoint(cmd, request).await?; return Ok(Response::new(Box::pin(futures::stream::empty()))); } else if request.get_ref().r#type == CANCEL_QUERY { - let any = - Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; + let any = Any::decode(&*request.get_ref().body).map_err(decode_error_to_status)?; let cmd: ActionCancelQueryRequest = any .unpack() @@ -957,3 +939,89 @@ fn decode_error_to_status(err: prost::DecodeError) -> Status { fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status { Status::internal(format!("{err:?}")) } + +/// A wrapper around [`Streaming`] that allows "peeking" at the +/// message at the front of the stream without consuming it. +/// This is needed because sometimes the first message in the stream will contain +/// a [`FlightDescriptor`] in addition to potentially any data, and the dispatch logic +/// must inspect this information. +/// +/// # Example +/// +/// [`PeekableFlightDataStream::peek`] can be used to peek at the first message without +/// discarding it; otherwise, `PeekableFlightDataStream` can be used as a regular stream. +/// See the following example: +/// +/// ```no_run +/// use arrow_array::RecordBatch; +/// use arrow_flight::decode::FlightRecordBatchStream; +/// use arrow_flight::FlightDescriptor; +/// use arrow_flight::error::FlightError; +/// use arrow_flight::sql::server::PeekableFlightDataStream; +/// use tonic::{Request, Status}; +/// use futures::TryStreamExt; +/// +/// #[tokio::main] +/// async fn main() -> Result<(), Status> { +/// let request: Request = todo!(); +/// let stream: PeekableFlightDataStream = request.into_inner(); +/// +/// // The first message contains the flight descriptor and the schema. +/// // Read the flight descriptor without discarding the schema: +/// let flight_descriptor: FlightDescriptor = stream +/// .peek() +/// .await +/// .cloned() +/// .transpose()? 
+/// .and_then(|data| data.flight_descriptor) +/// .expect("first message should contain flight descriptor"); +/// +/// // Pass the stream through a decoder +/// let batches: Vec = FlightRecordBatchStream::new_from_flight_data( +/// request.into_inner().map_err(|e| e.into()), +/// ) +/// .try_collect() +/// .await?; +/// } +/// ``` +pub struct PeekableFlightDataStream { + inner: Peekable>, +} + +impl PeekableFlightDataStream { + fn new(stream: Streaming) -> Self { + Self { + inner: stream.peekable(), + } + } + + /// Convert this stream into a `Streaming`. + /// Any messages observed through [`Self::peek`] will be lost + /// after the conversion. + pub fn into_inner(self) -> Streaming { + self.inner.into_inner() + } + + /// Convert this stream into a `Peekable>`. + /// Preserves the state of the stream, so that calls to [`Self::peek`] + /// and [`Self::poll_next`] are the same. + pub fn into_peekable(self) -> Peekable> { + self.inner + } + + /// Peek at the head of this stream without advancing it. + pub async fn peek(&mut self) -> Option<&Result> { + Pin::new(&mut self.inner).peek().await + } +} + +impl Stream for PeekableFlightDataStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.inner.poll_next_unpin(cx) + } +} diff --git a/arrow-flight/src/trailers.rs b/arrow-flight/src/trailers.rs index d652542da779..73136379d69f 100644 --- a/arrow-flight/src/trailers.rs +++ b/arrow-flight/src/trailers.rs @@ -28,9 +28,7 @@ use tonic::{metadata::MetadataMap, Status, Streaming}; /// /// Note that [`LazyTrailers`] has inner mutability and will only hold actual data after [`ExtractTrailersStream`] is /// fully consumed (dropping it is not required though). -pub fn extract_lazy_trailers( - s: Streaming, -) -> (ExtractTrailersStream, LazyTrailers) { +pub fn extract_lazy_trailers(s: Streaming) -> (ExtractTrailersStream, LazyTrailers) { let trailers: SharedTrailers = Default::default(); let stream = ExtractTrailersStream { inner: s, @@ -54,10 +52,7 @@ pub struct ExtractTrailersStream { impl Stream for ExtractTrailersStream { type Item = Result; - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let res = ready!(self.inner.poll_next_unpin(cx)); if res.is_none() { diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 145626b6608f..b75d61d200cb 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -52,26 +52,23 @@ pub fn flight_data_from_arrow_batch( } /// Convert a slice of wire protocol `FlightData`s into a vector of `RecordBatch`es -pub fn flight_data_to_batches( - flight_data: &[FlightData], -) -> Result, ArrowError> { +pub fn flight_data_to_batches(flight_data: &[FlightData]) -> Result, ArrowError> { let schema = flight_data.get(0).ok_or_else(|| { ArrowError::CastError("Need at least one FlightData for schema".to_string()) })?; let message = root_as_message(&schema.data_header[..]) .map_err(|_| ArrowError::CastError("Cannot get root as message".to_string()))?; - let ipc_schema: arrow_ipc::Schema = message.header_as_schema().ok_or_else(|| { - ArrowError::CastError("Cannot get header as Schema".to_string()) - })?; + let ipc_schema: arrow_ipc::Schema = message + .header_as_schema() + .ok_or_else(|| ArrowError::CastError("Cannot get header as Schema".to_string()))?; let schema = fb_to_schema(ipc_schema); let schema = Arc::new(schema); let mut batches = vec![]; let 
dictionaries_by_id = HashMap::new(); for datum in flight_data[1..].iter() { - let batch = - flight_data_to_arrow_batch(datum, schema.clone(), &dictionaries_by_id)?; + let batch = flight_data_to_arrow_batch(datum, schema.clone(), &dictionaries_by_id)?; batches.push(batch); } Ok(batches) @@ -84,9 +81,8 @@ pub fn flight_data_to_arrow_batch( dictionaries_by_id: &HashMap, ) -> Result { // check that the data_header is a record batch message - let message = arrow_ipc::root_as_message(&data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {err:?}")) - })?; + let message = arrow_ipc::root_as_message(&data.data_header[..]) + .map_err(|err| ArrowError::ParseError(format!("Unable to get root as message: {err:?}")))?; message .header_as_record_batch() @@ -124,10 +120,7 @@ pub fn flight_schema_from_arrow_schema( since = "4.4.0", note = "Use From trait, e.g.: SchemaAsIpc::new(schema, options).into()" )] -pub fn flight_data_from_arrow_schema( - schema: &Schema, - options: &IpcWriteOptions, -) -> FlightData { +pub fn flight_data_from_arrow_schema(schema: &Schema, options: &IpcWriteOptions) -> FlightData { SchemaAsIpc::new(schema, options).into() } diff --git a/arrow-flight/tests/client.rs b/arrow-flight/tests/client.rs index 1b9891e121fa..3ad9ee7a45ca 100644 --- a/arrow-flight/tests/client.rs +++ b/arrow-flight/tests/client.rs @@ -23,9 +23,9 @@ mod common { } use arrow_array::{RecordBatch, UInt64Array}; use arrow_flight::{ - decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, - error::FlightError, Action, ActionType, Criteria, Empty, FlightClient, FlightData, - FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, Ticket, + decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, error::FlightError, Action, + ActionType, Criteria, Empty, FlightClient, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, Ticket, }; use arrow_schema::{DataType, Field, Schema}; use bytes::Bytes; @@ -271,8 +271,7 @@ async fn test_do_put() { }, ]; - test_server - .set_do_put_response(expected_response.clone().into_iter().map(Ok).collect()); + test_server.set_do_put_response(expected_response.clone().into_iter().map(Ok).collect()); let input_stream = futures::stream::iter(input_flight_data.clone()).map(Ok); @@ -446,9 +445,8 @@ async fn test_do_exchange() { let input_flight_data = test_flight_data().await; let output_flight_data = test_flight_data2().await; - test_server.set_do_exchange_response( - output_flight_data.clone().into_iter().map(Ok).collect(), - ); + test_server + .set_do_exchange_response(output_flight_data.clone().into_iter().map(Ok).collect()); let response_stream = client .do_exchange(futures::stream::iter(input_flight_data.clone())) diff --git a/arrow-flight/tests/common/server.rs b/arrow-flight/tests/common/server.rs index c575d12bbf52..8b162d398c4b 100644 --- a/arrow-flight/tests/common/server.rs +++ b/arrow-flight/tests/common/server.rs @@ -174,10 +174,7 @@ impl TestFlightServer { } /// Specify the response returned from the next call to `do_action` - pub fn set_do_action_response( - &self, - response: Vec>, - ) { + pub fn set_do_action_response(&self, response: Vec>) { let mut state = self.state.lock().expect("mutex not poisoned"); state.do_action_response.replace(response); } @@ -278,9 +275,10 @@ impl FlightService for TestFlightServer { let mut state = self.state.lock().expect("mutex not poisoned"); state.handshake_request = Some(handshake_request); - let response = 
state.handshake_response.take().unwrap_or_else(|| { - Err(Status::internal("No handshake response configured")) - })?; + let response = state + .handshake_response + .take() + .unwrap_or_else(|| Err(Status::internal("No handshake response configured")))?; // turn into a streaming response let output = futures::stream::iter(std::iter::once(Ok(response))); @@ -313,9 +311,10 @@ impl FlightService for TestFlightServer { self.save_metadata(&request); let mut state = self.state.lock().expect("mutex not poisoned"); state.get_flight_info_request = Some(request.into_inner()); - let response = state.get_flight_info_response.take().unwrap_or_else(|| { - Err(Status::internal("No get_flight_info response configured")) - })?; + let response = state + .get_flight_info_response + .take() + .unwrap_or_else(|| Err(Status::internal("No get_flight_info response configured")))?; Ok(Response::new(response)) } @@ -326,9 +325,10 @@ impl FlightService for TestFlightServer { self.save_metadata(&request); let mut state = self.state.lock().expect("mutex not poisoned"); state.get_schema_request = Some(request.into_inner()); - let schema = state.get_schema_response.take().unwrap_or_else(|| { - Err(Status::internal("No get_schema response configured")) - })?; + let schema = state + .get_schema_response + .take() + .unwrap_or_else(|| Err(Status::internal("No get_schema response configured")))?; // encode the schema let options = arrow_ipc::writer::IpcWriteOptions::default(); diff --git a/arrow-flight/tests/common/trailers_layer.rs b/arrow-flight/tests/common/trailers_layer.rs index 9e6be0dcf0da..b2ab74f7d925 100644 --- a/arrow-flight/tests/common/trailers_layer.rs +++ b/arrow-flight/tests/common/trailers_layer.rs @@ -81,9 +81,7 @@ where ready!(self.as_mut().project().inner.poll(cx)); match result { - Ok(response) => { - Poll::Ready(Ok(response.map(|body| WrappedBody { inner: body }))) - } + Ok(response) => Poll::Ready(Ok(response.map(|body| WrappedBody { inner: body }))), Err(e) => Poll::Ready(Err(e)), } } diff --git a/arrow-flight/tests/encode_decode.rs b/arrow-flight/tests/encode_decode.rs index 71bcf4e0521a..f4741d743e57 100644 --- a/arrow-flight/tests/encode_decode.rs +++ b/arrow-flight/tests/encode_decode.rs @@ -195,8 +195,7 @@ async fn test_app_metadata() { let encode_stream = encoder.build(input_batch_stream); // use lower level stream to get access to app metadata - let decode_stream = - FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); let mut messages: Vec<_> = decode_stream.try_collect().await.expect("encode fails"); @@ -225,8 +224,7 @@ async fn test_max_message_size() { let encode_stream = encoder.build(input_batch_stream); // use lower level stream to get access to app metadata - let decode_stream = - FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); + let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream).into_inner(); let messages: Vec<_> = decode_stream.try_collect().await.expect("encode fails"); @@ -254,8 +252,8 @@ async fn test_max_message_size_fuzz() { ]; for max_message_size_bytes in [10, 1024, 2048, 6400, 3211212] { - let encoder = FlightDataEncoderBuilder::default() - .with_max_flight_data_size(max_message_size_bytes); + let encoder = + FlightDataEncoderBuilder::default().with_max_flight_data_size(max_message_size_bytes); let input_batch_stream = futures::stream::iter(input.clone()).map(Ok); @@ -299,10 +297,10 @@ async fn 
test_chained_streams_batch_decoder() { let batch2 = make_dictionary_batch(3); // Model sending two flight streams back to back, with different schemas - let encode_stream1 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch1.clone())])); - let encode_stream2 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch2.clone())])); + let encode_stream1 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch1.clone())])); + let encode_stream2 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch2.clone())])); // append the two streams (so they will have two different schema messages) let encode_stream = encode_stream1.chain(encode_stream2); @@ -324,10 +322,10 @@ async fn test_chained_streams_data_decoder() { let batch2 = make_dictionary_batch(3); // Model sending two flight streams back to back, with different schemas - let encode_stream1 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch1.clone())])); - let encode_stream2 = FlightDataEncoderBuilder::default() - .build(futures::stream::iter(vec![Ok(batch2.clone())])); + let encode_stream1 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch1.clone())])); + let encode_stream2 = + FlightDataEncoderBuilder::default().build(futures::stream::iter(vec![Ok(batch2.clone())])); // append the two streams (so they will have two different schema messages) let encode_stream = encode_stream1.chain(encode_stream2); @@ -335,8 +333,7 @@ async fn test_chained_streams_data_decoder() { // lower level decode stream can handle multiple schema messages let decode_stream = FlightDataDecoder::new(encode_stream); - let decoded_data: Vec<_> = - decode_stream.try_collect().await.expect("encode / decode"); + let decoded_data: Vec<_> = decode_stream.try_collect().await.expect("encode / decode"); println!("decoded data: {decoded_data:#?}"); @@ -425,8 +422,7 @@ fn make_primitive_batch(num_rows: usize) -> RecordBatch { }) .collect(); - RecordBatch::try_from_iter(vec![("i", Arc::new(i) as ArrayRef), ("f", Arc::new(f))]) - .unwrap() + RecordBatch::try_from_iter(vec![("i", Arc::new(i) as ArrayRef), ("f", Arc::new(f))]).unwrap() } /// Make a dictionary batch for testing @@ -459,8 +455,7 @@ fn make_dictionary_batch(num_rows: usize) -> RecordBatch { /// match the input. 
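The reflowed test helpers above all exercise the same encode/decode round trip; for reference, a self-contained version of that round trip (assuming a tokio runtime):

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_flight::decode::FlightRecordBatchStream;
use arrow_flight::encode::FlightDataEncoderBuilder;
use futures::TryStreamExt;

#[tokio::main]
async fn main() {
    let batch = RecordBatch::try_from_iter(vec![(
        "i",
        Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
    )])
    .unwrap();

    // Encode one batch into a stream of FlightData messages...
    let encode_stream = FlightDataEncoderBuilder::default()
        .build(futures::stream::iter(vec![Ok(batch.clone())]));

    // ...then decode it back; schema handling happens inside the decoder.
    let decoded: Vec<RecordBatch> =
        FlightRecordBatchStream::new_from_flight_data(encode_stream)
            .try_collect()
            .await
            .expect("encode / decode");

    assert_eq!(decoded, vec![batch]);
}
```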
async fn roundtrip(input: Vec) { let expected_output = input.clone(); - roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output) - .await + roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output).await } /// Encodes input as a FlightData stream, and then decodes it using @@ -475,8 +470,7 @@ async fn roundtrip_dictionary(input: Vec) { .iter() .map(|batch| prepare_batch_for_flight(batch, schema.clone()).unwrap()) .collect(); - roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output) - .await + roundtrip_with_encoder(FlightDataEncoderBuilder::default(), input, expected_output).await } async fn roundtrip_with_encoder( @@ -491,8 +485,7 @@ async fn roundtrip_with_encoder( let encode_stream = encoder.build(input_batch_stream); let decode_stream = FlightRecordBatchStream::new_from_flight_data(encode_stream); - let output_batches: Vec<_> = - decode_stream.try_collect().await.expect("encode / decode"); + let output_batches: Vec<_> = decode_stream.try_collect().await.expect("encode / decode"); // remove any empty batches from input as they are not transmitted let expected_batches: Vec<_> = expected_batches diff --git a/arrow-flight/tests/flight_sql_client_cli.rs b/arrow-flight/tests/flight_sql_client_cli.rs index 912bcc75a9df..a28080450bc2 100644 --- a/arrow-flight/tests/flight_sql_client_cli.rs +++ b/arrow-flight/tests/flight_sql_client_cli.rs @@ -19,35 +19,37 @@ use std::{net::SocketAddr, pin::Pin, sync::Arc, time::Duration}; use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray}; use arrow_flight::{ + decode::FlightRecordBatchStream, flight_service_server::{FlightService, FlightServiceServer}, sql::{ - server::FlightSqlService, ActionBeginSavepointRequest, - ActionBeginSavepointResult, ActionBeginTransactionRequest, + server::{FlightSqlService, PeekableFlightDataStream}, + ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, ActionEndSavepointRequest, ActionEndTransactionRequest, Any, CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, - CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, - CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, - CommandPreparedStatementQuery, CommandPreparedStatementUpdate, - CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, - ProstMessageExt, SqlInfo, TicketStatementQuery, + CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, + CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, + CommandPreparedStatementUpdate, CommandStatementQuery, CommandStatementSubstraitPlan, + CommandStatementUpdate, ProstMessageExt, SqlInfo, TicketStatementQuery, }, utils::batches_to_flight_data, Action, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, Ticket, + HandshakeResponse, IpcMessage, PutResult, SchemaAsIpc, Ticket, }; +use arrow_ipc::writer::IpcWriteOptions; use arrow_schema::{ArrowError, DataType, Field, Schema}; use assert_cmd::Command; -use futures::Stream; +use bytes::Bytes; +use futures::{Stream, StreamExt, TryStreamExt}; use prost::Message; use tokio::{net::TcpListener, task::JoinHandle}; use tonic::{Request, Response, 
Status, Streaming}; const QUERY: &str = "SELECT * FROM table;"; -#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[tokio::test] async fn test_simple() { let test_server = FlightSqlServiceImpl {}; let fixture = TestFixture::new(&test_server).await; @@ -63,6 +65,7 @@ async fn test_simple() { .arg(addr.ip().to_string()) .arg("--port") .arg(addr.port().to_string()) + .arg("statement-query") .arg(QUERY) .assert() .success() @@ -87,10 +90,56 @@ async fn test_simple() { ); } +const PREPARED_QUERY: &str = "SELECT * FROM table WHERE field = $1"; +const PREPARED_STATEMENT_HANDLE: &str = "prepared_statement_handle"; + +#[tokio::test] +async fn test_do_put_prepared_statement() { + let test_server = FlightSqlServiceImpl {}; + let fixture = TestFixture::new(&test_server).await; + let addr = fixture.addr; + + let stdout = tokio::task::spawn_blocking(move || { + Command::cargo_bin("flight_sql_client") + .unwrap() + .env_clear() + .env("RUST_BACKTRACE", "1") + .env("RUST_LOG", "warn") + .arg("--host") + .arg(addr.ip().to_string()) + .arg("--port") + .arg(addr.port().to_string()) + .arg("prepared-statement-query") + .arg(PREPARED_QUERY) + .args(["-p", "$1=string"]) + .args(["-p", "$2=64"]) + .assert() + .success() + .get_output() + .stdout + .clone() + }) + .await + .unwrap(); + + fixture.shutdown_and_wait().await; + + assert_eq!( + std::str::from_utf8(&stdout).unwrap().trim(), + "+--------------+-----------+\ + \n| field_string | field_int |\ + \n+--------------+-----------+\ + \n| Hello | 42 |\ + \n| lovely | |\ + \n| FlightSQL! | 1337 |\ + \n+--------------+-----------+", + ); +} + /// All tests must complete within this many seconds or else the test server is shutdown const DEFAULT_TIMEOUT_SECONDS: u64 = 30; -#[derive(Clone)] +#[derive(Clone, Default)] pub struct FlightSqlServiceImpl {} impl FlightSqlServiceImpl { @@ -116,6 +165,58 @@ impl FlightSqlServiceImpl { ]; RecordBatch::try_new(Arc::new(schema), cols) } + + fn create_fake_prepared_stmt() -> Result { + let handle = PREPARED_STATEMENT_HANDLE.to_string(); + let schema = Schema::new(vec![ + Field::new("field_string", DataType::Utf8, false), + Field::new("field_int", DataType::Int64, true), + ]); + + let parameter_schema = Schema::new(vec![ + Field::new("$1", DataType::Utf8, false), + Field::new("$2", DataType::Int64, true), + ]); + + Ok(ActionCreatePreparedStatementResult { + prepared_statement_handle: handle.into(), + dataset_schema: serialize_schema(&schema)?, + parameter_schema: serialize_schema(¶meter_schema)?, + }) + } + + fn fake_flight_info(&self) -> Result { + let batch = Self::fake_result()?; + + Ok(FlightInfo::new() + .try_with_schema(&batch.schema()) + .expect("encoding schema") + .with_endpoint( + FlightEndpoint::new().with_ticket(Ticket::new( + FetchResults { + handle: String::from("part_1"), + } + .as_any() + .encode_to_vec(), + )), + ) + .with_endpoint( + FlightEndpoint::new().with_ticket(Ticket::new( + FetchResults { + handle: String::from("part_2"), + } + .as_any() + .encode_to_vec(), + )), + ) + .with_total_records(batch.num_rows() as i64) + .with_total_bytes(batch.get_array_memory_size() as i64) + .with_ordered(false)) + } +} + +fn serialize_schema(schema: &Schema) -> Result { + Ok(IpcMessage::try_from(SchemaAsIpc::new(schema, &IpcWriteOptions::default()))?.0) } #[tonic::async_trait] @@ -164,45 +265,21 @@ impl FlightSqlService for FlightSqlServiceImpl { ) -> Result, Status> { assert_eq!(query.query, QUERY); - let batch = Self::fake_result().unwrap(); - - let info = FlightInfo::new() - .try_with_schema(&batch.schema()) 
- .expect("encoding schema") - .with_endpoint( - FlightEndpoint::new().with_ticket(Ticket::new( - FetchResults { - handle: String::from("part_1"), - } - .as_any() - .encode_to_vec(), - )), - ) - .with_endpoint( - FlightEndpoint::new().with_ticket(Ticket::new( - FetchResults { - handle: String::from("part_2"), - } - .as_any() - .encode_to_vec(), - )), - ) - .with_total_records(batch.num_rows() as i64) - .with_total_bytes(batch.get_array_memory_size() as i64) - .with_ordered(false); - - let resp = Response::new(info); + let resp = Response::new(self.fake_flight_info().unwrap()); Ok(resp) } async fn get_flight_info_prepared_statement( &self, - _cmd: CommandPreparedStatementQuery, + cmd: CommandPreparedStatementQuery, _request: Request, ) -> Result, Status> { - Err(Status::unimplemented( - "get_flight_info_prepared_statement not implemented", - )) + assert_eq!( + cmd.prepared_statement_handle, + PREPARED_STATEMENT_HANDLE.as_bytes() + ); + let resp = Response::new(self.fake_flight_info().unwrap()); + Ok(resp) } async fn get_flight_info_substrait_plan( @@ -426,7 +503,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_statement_update( &self, _ticket: CommandStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_statement_update not implemented", @@ -436,7 +513,7 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_substrait_plan( &self, _ticket: CommandStatementSubstraitPlan, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_substrait_plan not implemented", @@ -446,17 +523,36 @@ impl FlightSqlService for FlightSqlServiceImpl { async fn do_put_prepared_statement_query( &self, _query: CommandPreparedStatementQuery, - _request: Request>, + request: Request, ) -> Result::DoPutStream>, Status> { - Err(Status::unimplemented( - "do_put_prepared_statement_query not implemented", + // just make sure decoding the parameters works + let parameters = FlightRecordBatchStream::new_from_flight_data( + request.into_inner().map_err(|e| e.into()), + ) + .try_collect::>() + .await?; + + for (left, right) in parameters[0].schema().all_fields().iter().zip(vec![ + Field::new("$1", DataType::Utf8, false), + Field::new("$2", DataType::Int64, true), + ]) { + if left.name() != right.name() || left.data_type() != right.data_type() { + return Err(Status::invalid_argument(format!( + "Parameters did not match parameter schema\ngot {}", + parameters[0].schema(), + ))); + } + } + + Ok(Response::new( + futures::stream::once(async { Ok(PutResult::default()) }).boxed(), )) } async fn do_put_prepared_statement_update( &self, _query: CommandPreparedStatementUpdate, - _request: Request>, + _request: Request, ) -> Result { Err(Status::unimplemented( "do_put_prepared_statement_update not implemented", @@ -468,9 +564,8 @@ impl FlightSqlService for FlightSqlServiceImpl { _query: ActionCreatePreparedStatementRequest, _request: Request, ) -> Result { - Err(Status::unimplemented( - "do_action_create_prepared_statement not implemented", - )) + Self::create_fake_prepared_stmt() + .map_err(|e| Status::internal(format!("Unable to serialize schema: {e}"))) } async fn do_action_close_prepared_statement( diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index 47bacc7cc74b..42ac71fbbd7e 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -124,26 +124,16 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { } Some(s) 
if s == "duration" => match map.get("unit") { Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), - Some(p) if p == "MILLISECOND" => { - Ok(DataType::Duration(TimeUnit::Millisecond)) - } - Some(p) if p == "MICROSECOND" => { - Ok(DataType::Duration(TimeUnit::Microsecond)) - } - Some(p) if p == "NANOSECOND" => { - Ok(DataType::Duration(TimeUnit::Nanosecond)) - } + Some(p) if p == "MILLISECOND" => Ok(DataType::Duration(TimeUnit::Millisecond)), + Some(p) if p == "MICROSECOND" => Ok(DataType::Duration(TimeUnit::Microsecond)), + Some(p) if p == "NANOSECOND" => Ok(DataType::Duration(TimeUnit::Nanosecond)), _ => Err(ArrowError::ParseError( "time unit missing or invalid".to_string(), )), }, Some(s) if s == "interval" => match map.get("unit") { - Some(p) if p == "DAY_TIME" => { - Ok(DataType::Interval(IntervalUnit::DayTime)) - } - Some(p) if p == "YEAR_MONTH" => { - Ok(DataType::Interval(IntervalUnit::YearMonth)) - } + Some(p) if p == "DAY_TIME" => Ok(DataType::Interval(IntervalUnit::DayTime)), + Some(p) if p == "YEAR_MONTH" => Ok(DataType::Interval(IntervalUnit::YearMonth)), Some(p) if p == "MONTH_DAY_NANO" => { Ok(DataType::Interval(IntervalUnit::MonthDayNano)) } diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index f59314ca02db..32edc4165938 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -63,18 +63,17 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { "Field 'metadata' must have exact two entries for each key-value map".to_string(), )); } - if let (Some(k), Some(v)) = - (map.get("key"), map.get("value")) - { - if let (Some(k_str), Some(v_str)) = - (k.as_str(), v.as_str()) - { + if let (Some(k), Some(v)) = (map.get("key"), map.get("value")) { + if let (Some(k_str), Some(v_str)) = (k.as_str(), v.as_str()) { res.insert( k_str.to_string().clone(), v_str.to_string().clone(), ); } else { - return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); + return Err(ArrowError::ParseError( + "Field 'metadata' must have map value of string type" + .to_string(), + )); } } else { return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); @@ -115,46 +114,47 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { // if data_type is a struct or list, get its children let data_type = match data_type { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { + DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) => { + match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 1 { + return Err(ArrowError::ParseError( + "Field 'children' must have one element for a list data type" + .to_string(), + )); + } + match data_type { + DataType::List(_) => { + DataType::List(Arc::new(field_from_json(&values[0])?)) + } + DataType::LargeList(_) => { + DataType::LargeList(Arc::new(field_from_json(&values[0])?)) + } + DataType::FixedSizeList(_, int) => DataType::FixedSizeList( + Arc::new(field_from_json(&values[0])?), + int, + ), + _ => unreachable!( + "Data type should be a list, largelist or fixedsizelist" + ), + } + } + Some(_) => { return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type".to_string(), - )); + "Field 'children' must be an array".to_string(), + )) } - match data_type { - DataType::List(_) => { - 
DataType::List(Arc::new(field_from_json(&values[0])?)) - } - DataType::LargeList(_) => DataType::LargeList(Arc::new( - field_from_json(&values[0])?, - )), - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Arc::new(field_from_json(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); } } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, + } DataType::Struct(_) => match map.get("children") { - Some(Value::Array(values)) => DataType::Struct( - values.iter().map(field_from_json).collect::>()?, - ), + Some(Value::Array(values)) => { + DataType::Struct(values.iter().map(field_from_json).collect::>()?) + } Some(_) => { return Err(ArrowError::ParseError( "Field 'children' must be an array".to_string(), @@ -175,17 +175,16 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { DataType::Struct(map_fields) if map_fields.len() == 2 => { DataType::Map(Arc::new(child), keys_sorted) } - t => { - return Err(ArrowError::ParseError( - format!("Map children should be a struct with 2 fields, found {t:?}") - )) + t => { + return Err(ArrowError::ParseError(format!( + "Map children should be a struct with 2 fields, found {t:?}" + ))) } } } Some(_) => { return Err(ArrowError::ParseError( - "Field 'children' must be an array with 1 element" - .to_string(), + "Field 'children' must be an array with 1 element".to_string(), )) } None => { @@ -200,9 +199,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { let fields = fields .iter() .zip(values) - .map(|((id, _), value)| { - Ok((id, Arc::new(field_from_json(value)?))) - }) + .map(|((id, _), value)| Ok((id, Arc::new(field_from_json(value)?)))) .collect::>()?; DataType::Union(fields, mode) @@ -255,8 +252,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { _ => data_type, }; - let mut field = - Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); + let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) } @@ -269,9 +265,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { /// Generate a JSON representation of the `Field`. 
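For orientation in these reflowed `field_from_json` / `field_to_json` hunks, a small round trip through the integration-test JSON format for a primitive field; the exact JSON shape follows this crate's conventions, and asserting full symmetry is an assumption that holds for simple fields without metadata or dictionaries:

```rust
use arrow_integration_test::{field_from_json, field_to_json};
use arrow_schema::{DataType, Field};

fn main() {
    let json = serde_json::json!({
        "name": "id",
        "nullable": false,
        "type": {"name": "int", "bitWidth": 32, "isSigned": true},
        "children": []
    });

    // JSON -> Field
    let field = field_from_json(&json).unwrap();
    assert_eq!(field, Field::new("id", DataType::Int32, false));

    // Field -> JSON recovers the same value
    assert_eq!(field_to_json(&field), json);
}
```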
pub fn field_to_json(field: &Field) -> serde_json::Value { let children: Vec = match field.data_type() { - DataType::Struct(fields) => { - fields.iter().map(|x| field_to_json(x.as_ref())).collect() - } + DataType::Struct(fields) => fields.iter().map(|x| field_to_json(x.as_ref())).collect(), DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 04bbcf3f6f23..7b797aa07061 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -183,7 +183,8 @@ impl ArrowJson { return Ok(false); } } - _ => return Ok(false), + Some(Err(e)) => return Err(e), + None => return Ok(false), } } @@ -260,9 +261,7 @@ impl ArrowJsonField { true } Err(e) => { - eprintln!( - "Encountered error while converting JSON field to Arrow field: {e:?}" - ); + eprintln!("Encountered error while converting JSON field to Arrow field: {e:?}"); false } } @@ -272,8 +271,8 @@ impl ArrowJsonField { /// TODO: convert to use an Into fn to_arrow_field(&self) -> Result { // a bit regressive, but we have to convert the field to JSON in order to convert it - let field = serde_json::to_value(self) - .map_err(|error| ArrowError::JsonError(error.to_string()))?; + let field = + serde_json::to_value(self).map_err(|error| ArrowError::JsonError(error.to_string()))?; field_from_json(&field) } } @@ -388,12 +387,9 @@ pub fn array_from_json( match is_valid { 1 => b.append_value(match value { Value::Number(n) => n.as_i64().unwrap(), - Value::String(s) => { - s.parse().expect("Unable to parse string as i64") - } + Value::String(s) => s.parse().expect("Unable to parse string as i64"), Value::Object(ref map) - if map.contains_key("days") - && map.contains_key("milliseconds") => + if map.contains_key("days") && map.contains_key("milliseconds") => { match field.data_type() { DataType::Interval(IntervalUnit::DayTime) => { @@ -403,23 +399,19 @@ pub fn array_from_json( match (days, milliseconds) { (Value::Number(d), Value::Number(m)) => { let mut bytes = [0_u8; 8]; - let m = (m.as_i64().unwrap() as i32) - .to_le_bytes(); - let d = (d.as_i64().unwrap() as i32) - .to_le_bytes(); + let m = (m.as_i64().unwrap() as i32).to_le_bytes(); + let d = (d.as_i64().unwrap() as i32).to_le_bytes(); let c = [d, m].concat(); bytes.copy_from_slice(c.as_slice()); i64::from_le_bytes(bytes) } - _ => panic!( - "Unable to parse {value:?} as interval daytime" - ), + _ => { + panic!("Unable to parse {value:?} as interval daytime") + } } } - _ => panic!( - "Unable to parse {value:?} as interval daytime" - ), + _ => panic!("Unable to parse {value:?} as interval daytime"), } } _ => panic!("Unable to parse {value:?} as number"), @@ -498,9 +490,7 @@ pub fn array_from_json( .expect("Unable to parse string as u64"), ) } else if value.is_number() { - b.append_value( - value.as_u64().expect("Unable to read number as u64"), - ) + b.append_value(value.as_u64().expect("Unable to read number as u64")) } else { panic!("Unable to parse value {value:?} as u64") } @@ -534,11 +524,10 @@ pub fn array_from_json( let months = months.as_i64().unwrap() as i32; let days = days.as_i64().unwrap() as i32; let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = ((nanoseconds as i128) - & 0xFFFFFFFFFFFFFFFF) - << 64 - | ((days as i128) & 0xFFFFFFFF) << 32 - | ((months as i128) & 0xFFFFFFFF); + let months_days_ns: i128 = + ((nanoseconds as i128) & 0xFFFFFFFFFFFFFFFF) << 64 + | ((days as i128) & 0xFFFFFFFF) << 32 + | ((months as i128) & 
0xFFFFFFFF); months_days_ns } (_, _, _) => { @@ -677,11 +666,8 @@ pub fn array_from_json( DataType::List(child_field) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let offsets: Vec = json_col .offset .unwrap() @@ -701,11 +687,8 @@ pub fn array_from_json( DataType::LargeList(child_field) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let offsets: Vec = json_col .offset .unwrap() @@ -728,11 +711,8 @@ pub fn array_from_json( } DataType::FixedSizeList(child_field, _) => { let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let null_buf = create_null_buf(&json_col); let list_data = ArrayData::builder(field.data_type().clone()) .len(json_col.count) @@ -759,9 +739,7 @@ pub fn array_from_json( } DataType::Dictionary(key_type, value_type) => { let dict_id = field.dict_id().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to find dict_id for field {field:?}" - )) + ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) })?; // find dictionary let dictionary = dictionaries @@ -822,8 +800,7 @@ pub fn array_from_json( } else { [255_u8; 32] }; - bytes[0..integer_bytes.len()] - .copy_from_slice(integer_bytes.as_slice()); + bytes[0..integer_bytes.len()].copy_from_slice(integer_bytes.as_slice()); b.append_value(i256::from_le_bytes(bytes)); } _ => b.append_null(), @@ -836,11 +813,8 @@ pub fn array_from_json( DataType::Map(child_field, _) => { let null_buf = create_null_buf(&json_col); let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; + let child_array = + array_from_json(child_field, children.get(0).unwrap().clone(), dictionaries)?; let offsets: Vec = json_col .offset .unwrap() @@ -945,9 +919,7 @@ pub fn dictionary_array_from_json( .unwrap(); let array = match dict_key { - DataType::Int8 => { - Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef - } + DataType::Int8 => Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef, DataType::Int16 => Arc::new(Int16DictionaryArray::from(dict_data)), DataType::Int32 => Arc::new(Int32DictionaryArray::from(dict_data)), DataType::Int64 => Arc::new(Int64DictionaryArray::from(dict_data)), @@ -1098,11 +1070,7 @@ mod tests { Field::new("c3", DataType::Utf8, true), Field::new( "c4", - DataType::List(Arc::new(Field::new( - "custom_item", - DataType::Int32, - false, - ))), + DataType::List(Arc::new(Field::new("custom_item", DataType::Int32, false))), true, ), ]); @@ -1198,10 +1166,8 @@ mod tests { ), ]); - let bools_with_metadata_map = - BooleanArray::from(vec![Some(true), None, Some(false)]); - let bools_with_metadata_vec = - BooleanArray::from(vec![Some(true), None, Some(false)]); + let bools_with_metadata_map = BooleanArray::from(vec![Some(true), None, Some(false)]); + let bools_with_metadata_vec = 
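// A worked, standalone sketch of the MonthDayNano packing used above: months,
// days and nanoseconds are masked and shifted into a single i128, matching the
// layout arrow exposes via IntervalMonthDayNanoType::make_value. All names here
// are local to the sketch.
fn pack_month_day_nano(months: i32, days: i32, nanoseconds: i64) -> i128 {
    ((nanoseconds as i128) & 0xFFFF_FFFF_FFFF_FFFF) << 64
        | ((days as i128) & 0xFFFF_FFFF) << 32
        | ((months as i128) & 0xFFFF_FFFF)
}

fn unpack_month_day_nano(value: i128) -> (i32, i32, i64) {
    let months = value as i32; // low 32 bits
    let days = (value >> 32) as i32; // next 32 bits
    let nanoseconds = (value >> 64) as i64; // high 64 bits
    (months, days, nanoseconds)
}

#[test]
fn month_day_nano_round_trips() {
    let packed = pack_month_day_nano(-1, 15, 123_456_789);
    assert_eq!(unpack_month_day_nano(packed), (-1, 15, 123_456_789));
}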
BooleanArray::from(vec![Some(true), None, Some(false)]); let bools = BooleanArray::from(vec![Some(true), None, Some(false)]); let int8s = Int8Array::from(vec![Some(1), None, Some(3)]); let int16s = Int16Array::from(vec![Some(1), None, Some(3)]); @@ -1219,39 +1185,24 @@ mod tests { Some(29923997007884), Some(30612271819236), ]); - let time_secs = - Time32SecondArray::from(vec![Some(27974), Some(78592), Some(43207)]); - let time_millis = Time32MillisecondArray::from(vec![ - Some(6613125), - Some(74667230), - Some(52260079), - ]); - let time_micros = - Time64MicrosecondArray::from(vec![Some(62522958593), None, None]); - let time_nanos = Time64NanosecondArray::from(vec![ - Some(73380123595985), - None, - Some(16584393546415), - ]); + let time_secs = Time32SecondArray::from(vec![Some(27974), Some(78592), Some(43207)]); + let time_millis = + Time32MillisecondArray::from(vec![Some(6613125), Some(74667230), Some(52260079)]); + let time_micros = Time64MicrosecondArray::from(vec![Some(62522958593), None, None]); + let time_nanos = + Time64NanosecondArray::from(vec![Some(73380123595985), None, Some(16584393546415)]); let ts_secs = TimestampSecondArray::from(vec![None, Some(193438817552), None]); - let ts_millis = TimestampMillisecondArray::from(vec![ - None, - Some(38606916383008), - Some(58113709376587), - ]); + let ts_millis = + TimestampMillisecondArray::from(vec![None, Some(38606916383008), Some(58113709376587)]); let ts_micros = TimestampMicrosecondArray::from(vec![None, None, None]); - let ts_nanos = - TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]); + let ts_nanos = TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]); let ts_secs_tz = TimestampSecondArray::from(vec![None, Some(193438817552), None]) .with_timezone_opt(secs_tz); - let ts_millis_tz = TimestampMillisecondArray::from(vec![ - None, - Some(38606916383008), - Some(58113709376587), - ]) - .with_timezone_opt(millis_tz); - let ts_micros_tz = TimestampMicrosecondArray::from(vec![None, None, None]) - .with_timezone_opt(micros_tz); + let ts_millis_tz = + TimestampMillisecondArray::from(vec![None, Some(38606916383008), Some(58113709376587)]) + .with_timezone_opt(millis_tz); + let ts_micros_tz = + TimestampMicrosecondArray::from(vec![None, None, None]).with_timezone_opt(micros_tz); let ts_nanos_tz = TimestampNanosecondArray::from(vec![None, None, Some(-6473623571954960143)]) .with_timezone_opt(nanos_tz); @@ -1259,8 +1210,7 @@ mod tests { let value_data = Int32Array::from(vec![None, Some(2), None, None]); let value_offsets = Buffer::from_slice_ref([0, 3, 4, 4]); - let list_data_type = - DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + let list_data_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); let list_data = ArrayData::builder(list_data_type) .len(3) .add_buffer(value_offsets) diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index 6e143c2838d9..b5f6c5e86b38 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -65,11 +65,9 @@ fn from_metadata(json: &serde_json::Value) -> Result> { match json { Value::Array(_) => { let mut hashmap = HashMap::new(); - let values: Vec = serde_json::from_value(json.clone()) - .map_err(|_| { - ArrowError::JsonError( - "Unable to parse object into key-value pair".to_string(), - ) + let values: Vec = + serde_json::from_value(json.clone()).map_err(|_| { + ArrowError::JsonError("Unable to parse object into key-value 
pair".to_string()) })?; for meta in values { hashmap.insert(meta.key.clone(), meta.value); @@ -110,11 +108,10 @@ mod tests { #[test] fn schema_json() { // Add some custom metadata - let metadata: HashMap = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); + let metadata: HashMap = [("Key".to_string(), "Value".to_string())] + .iter() + .cloned() + .collect(); let schema = Schema::new_with_metadata( vec![ @@ -140,10 +137,7 @@ mod tests { ), Field::new( "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".into()), - ), + DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())), false, ), Field::new( @@ -197,10 +191,7 @@ mod tests { Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( "c33", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, 123, true, diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 7f78cf50a9d7..c29860f09d64 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -27,11 +27,14 @@ edition = { workspace = true } publish = false rust-version = { workspace = true } +[lib] +crate-type = ["lib", "cdylib"] + [features] logging = ["tracing-subscriber"] [dependencies] -arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json"] } +arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } @@ -39,11 +42,11 @@ async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } hex = { version = "0.4", default-features = false, features = ["std"] } -prost = { version = "0.11", default-features = false } +prost = { version = "0.12", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } tokio = { version = "1.0", default-features = false } -tonic = { version = "0.9", default-features = false } +tonic = { version = "0.10", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } diff --git a/arrow-integration-testing/README.md b/arrow-integration-testing/README.md index e82591e6b139..dcf39c27fbc5 100644 --- a/arrow-integration-testing/README.md +++ b/arrow-integration-testing/README.md @@ -48,7 +48,7 @@ ln -s arrow/rust ```shell cd arrow -pip install -e dev/archery[docker] +pip install -e dev/archery[integration] ``` ### Build the C++ binaries: diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index 2c36e8d9b8ae..9f1abb16a668 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ 
b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,16 +15,13 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::{DataType, Field}; -use arrow::datatypes::{Fields, Schema}; use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; use arrow_integration_test::*; -use arrow_integration_testing::read_json_file; +use arrow_integration_testing::{canonicalize_schema, open_json_file}; use clap::Parser; use std::fs::File; -use std::sync::Arc; #[derive(clap::ValueEnum, Debug, Clone)] #[clap(rename_all = "SCREAMING_SNAKE_CASE")] @@ -66,12 +63,12 @@ fn json_to_arrow(json_name: &str, arrow_name: &str, verbose: bool) -> Result<()> eprintln!("Converting {json_name} to {arrow_name}"); } - let json_file = read_json_file(json_name)?; + let json_file = open_json_file(json_name)?; let arrow_file = File::create(arrow_name)?; let mut writer = FileWriter::try_new(arrow_file, &json_file.schema)?; - for b in json_file.batches { + for b in json_file.read_batches()? { writer.write(&b)?; } @@ -113,55 +110,13 @@ fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> Ok(()) } -fn canonicalize_schema(schema: &Schema) -> Schema { - let fields = schema - .fields() - .iter() - .map(|field| match field.data_type() { - DataType::Map(child_field, sorted) => match child_field.data_type() { - DataType::Struct(fields) if fields.len() == 2 => { - let first_field = fields.get(0).unwrap(); - let key_field = Arc::new(Field::new( - "key", - first_field.data_type().clone(), - first_field.is_nullable(), - )); - let second_field = fields.get(1).unwrap(); - let value_field = Arc::new(Field::new( - "value", - second_field.data_type().clone(), - second_field.is_nullable(), - )); - - let fields = Fields::from([key_field, value_field]); - let struct_type = DataType::Struct(fields); - let child_field = - Field::new("entries", struct_type, child_field.is_nullable()); - - Arc::new(Field::new( - field.name().as_str(), - DataType::Map(Arc::new(child_field), *sorted), - field.is_nullable(), - )) - } - _ => panic!( - "The child field of Map type should be Struct type with 2 fields." 
- ), - }, - _ => field.clone(), - }) - .collect::(); - - Schema::new(fields).with_metadata(schema.metadata().clone()) -} - fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { if verbose { eprintln!("Validating {arrow_name} and {json_name}"); } // open JSON file - let json_file = read_json_file(json_name)?; + let json_file = open_json_file(json_name)?; // open Arrow file let arrow_file = File::open(arrow_name)?; @@ -176,7 +131,7 @@ fn validate(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> { ))); } - let json_batches = &json_file.batches; + let json_batches = json_file.read_batches()?; // compare number of batches assert!( diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index d46b4fac759e..b8bbb952837b 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -62,8 +62,7 @@ async fn main() -> Result { } None => { let path = args.path.expect("No path is given"); - flight_client_scenarios::integration_test::run_scenario(&host, port, &path) - .await?; + flight_client_scenarios::integration_test::run_scenario(&host, port, &path).await?; } } diff --git a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs index 9f66abf50106..376e31e15553 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/auth_basic_proto.rs @@ -17,9 +17,7 @@ use crate::{AUTH_PASSWORD, AUTH_USERNAME}; -use arrow_flight::{ - flight_service_client::FlightServiceClient, BasicAuth, HandshakeRequest, -}; +use arrow_flight::{flight_service_client::FlightServiceClient, BasicAuth, HandshakeRequest}; use futures::{stream, StreamExt}; use prost::Message; use tonic::{metadata::MetadataValue, Request, Status}; @@ -78,11 +76,7 @@ pub async fn run_scenario(host: &str, port: u16) -> Result { Ok(()) } -async fn authenticate( - client: &mut Client, - username: &str, - password: &str, -) -> Result { +async fn authenticate(client: &mut Client, username: &str, password: &str) -> Result { let auth = BasicAuth { username: username.into(), password: password.into(), diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index a55c2dec0580..c6b5a72ca6e2 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
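// A minimal sketch of the open_json_file flow that replaces read_json_file in
// the JSON integration binary above and in this flight client scenario: the
// schema and dictionaries are parsed once, and record batches stay as JSON
// until decoded on demand. ArrowFile and its read_batches method are introduced
// in arrow-integration-testing/src/lib.rs later in this diff; the function name
// here is illustrative.
fn load_and_count(json_name: &str) -> arrow::error::Result<usize> {
    let json_file = arrow_integration_testing::open_json_file(json_name)?;
    // Decode every batch held in the JSON document.
    let batches = json_file.read_batches()?;
    Ok(batches.iter().map(|b| b.num_rows()).sum())
}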
-use crate::{read_json_file, ArrowFile}; +use crate::open_json_file; use std::collections::HashMap; use arrow::{ @@ -27,8 +27,7 @@ use arrow::{ }; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, - utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, Location, - SchemaAsIpc, Ticket, + utils::flight_data_to_arrow_batch, FlightData, FlightDescriptor, Location, SchemaAsIpc, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, stream, StreamExt}; use tonic::{Request, Streaming}; @@ -46,23 +45,16 @@ pub async fn run_scenario(host: &str, port: u16, path: &str) -> Result { let client = FlightServiceClient::connect(url).await?; - let ArrowFile { - schema, batches, .. - } = read_json_file(path)?; + let json_file = open_json_file(path)?; - let schema = Arc::new(schema); + let batches = json_file.read_batches()?; + let schema = Arc::new(json_file.schema); let mut descriptor = FlightDescriptor::default(); descriptor.set_type(DescriptorType::Path); descriptor.path = vec![path.to_string()]; - upload_data( - client.clone(), - schema.clone(), - descriptor.clone(), - batches.clone(), - ) - .await?; + upload_data(client.clone(), schema, descriptor.clone(), batches.clone()).await?; verify_data(client, descriptor, &batches).await?; Ok(()) @@ -203,19 +195,16 @@ async fn consume_flight_location( let mut dictionaries_by_id = HashMap::new(); for (counter, expected_batch) in expected_data.iter().enumerate() { - let data = receive_batch_flight_data( - &mut resp, - actual_schema.clone(), - &mut dictionaries_by_id, - ) - .await - .unwrap_or_else(|| { - panic!( - "Got fewer batches than expected, received so far: {} expected: {}", - counter, - expected_data.len(), - ) - }); + let data = + receive_batch_flight_data(&mut resp, actual_schema.clone(), &mut dictionaries_by_id) + .await + .unwrap_or_else(|| { + panic!( + "Got fewer batches than expected, received so far: {} expected: {}", + counter, + expected_data.len(), + ) + }); let metadata = counter.to_string().into_bytes(); assert_eq!(metadata, data.app_metadata); @@ -250,8 +239,8 @@ async fn consume_flight_location( async fn receive_schema_flight_data(resp: &mut Streaming) -> Option { let data = resp.next().await?.ok()?; - let message = arrow::ipc::root_as_message(&data.data_header[..]) - .expect("Error parsing message"); + let message = + arrow::ipc::root_as_message(&data.data_header[..]).expect("Error parsing message"); // message header is a Schema, so read it let ipc_schema: ipc::Schema = message @@ -268,8 +257,8 @@ async fn receive_batch_flight_data( dictionaries_by_id: &mut HashMap, ) -> Option { let mut data = resp.next().await?.ok()?; - let mut message = arrow::ipc::root_as_message(&data.data_header[..]) - .expect("Error parsing first message"); + let mut message = + arrow::ipc::root_as_message(&data.data_header[..]).expect("Error parsing first message"); while message.header_type() == ipc::MessageHeader::DictionaryBatch { reader::read_dictionary( @@ -284,8 +273,8 @@ async fn receive_batch_flight_data( .expect("Error reading dictionary"); data = resp.next().await?.ok()?; - message = arrow::ipc::root_as_message(&data.data_header[..]) - .expect("Error parsing message"); + message = + arrow::ipc::root_as_message(&data.data_header[..]).expect("Error parsing message"); } Some(data) diff --git a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs index 773919ff72af..3b71edf446a3 100644 --- 
a/arrow-integration-testing/src/flight_client_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/middleware.rs @@ -16,8 +16,7 @@ // under the License. use arrow_flight::{ - flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, - FlightDescriptor, + flight_descriptor::DescriptorType, flight_service_client::FlightServiceClient, FlightDescriptor, }; use prost::bytes::Bytes; use tonic::{Request, Status}; diff --git a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs index 72d47b1391ee..ff4fc12f2523 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/auth_basic_proto.rs @@ -19,15 +19,13 @@ use std::pin::Pin; use std::sync::Arc; use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, BasicAuth, Criteria, Empty, FlightData, FlightDescriptor, - FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, + flight_service_server::FlightService, flight_service_server::FlightServiceServer, Action, + ActionType, BasicAuth, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, Stream, StreamExt}; use tokio::sync::Mutex; -use tonic::{ - metadata::MetadataMap, transport::Server, Request, Response, Status, Streaming, -}; +use tonic::{metadata::MetadataMap, transport::Server, Request, Response, Status, Streaming}; type TonicStream = Pin + Send + Sync + 'static>>; type Error = Box; @@ -63,10 +61,7 @@ pub struct AuthBasicProtoScenarioImpl { } impl AuthBasicProtoScenarioImpl { - async fn check_auth( - &self, - metadata: &MetadataMap, - ) -> Result { + async fn check_auth(&self, metadata: &MetadataMap) -> Result { let token = metadata .get_bin("auth-token-bin") .and_then(|v| v.to_bytes().ok()) @@ -74,10 +69,7 @@ impl AuthBasicProtoScenarioImpl { self.is_valid(token).await } - async fn is_valid( - &self, - token: Option, - ) -> Result { + async fn is_valid(&self, token: Option) -> Result { match token { Some(t) if t == *self.username => Ok(GrpcServerCallContext { peer_identity: self.username.to_string(), @@ -142,12 +134,10 @@ impl FlightService for AuthBasicProtoScenarioImpl { let req = req.expect("Error reading handshake request"); let HandshakeRequest { payload, .. 
} = req; - let auth = BasicAuth::decode(&*payload) - .expect("Error parsing handshake request"); + let auth = + BasicAuth::decode(&*payload).expect("Error parsing handshake request"); - let resp = if *auth.username == *username - && *auth.password == *password - { + let resp = if *auth.username == *username && *auth.password == *password { Ok(HandshakeResponse { payload: username.as_bytes().to_vec().into(), ..HandshakeResponse::default() diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index e2c4cb5d88f3..2011031e921a 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -30,9 +30,9 @@ use arrow::{ }; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_server::FlightService, - flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, - HandshakeResponse, IpcMessage, PutResult, SchemaAsIpc, SchemaResult, Ticket, + flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage, + PutResult, SchemaAsIpc, SchemaResult, Ticket, }; use futures::{channel::mpsc, sink::SinkExt, Stream, StreamExt}; use std::convert::TryInto; @@ -113,8 +113,7 @@ impl FlightService for FlightServiceImpl { let options = arrow::ipc::writer::IpcWriteOptions::default(); - let schema = - std::iter::once(Ok(SchemaAsIpc::new(&flight.schema, &options).into())); + let schema = std::iter::once(Ok(SchemaAsIpc::new(&flight.schema, &options).into())); let batches = flight .chunks @@ -126,12 +125,9 @@ impl FlightService for FlightServiceImpl { let (encoded_dictionaries, encoded_batch) = data_gen .encoded_batch(batch, &mut dictionary_tracker, &options) - .expect( - "DictionaryTracker configured above to not error on replacement", - ); + .expect("DictionaryTracker configured above to not error on replacement"); - let dictionary_flight_data = - encoded_dictionaries.into_iter().map(Into::into); + let dictionary_flight_data = encoded_dictionaries.into_iter().map(Into::into); let mut batch_flight_data: FlightData = encoded_batch.into(); // Only the record batch's FlightData gets app_metadata @@ -182,8 +178,7 @@ impl FlightService for FlightServiceImpl { let endpoint = self.endpoint_from_path(&path[0]); - let total_records: usize = - flight.chunks.iter().map(|chunk| chunk.num_rows()).sum(); + let total_records: usize = flight.chunks.iter().map(|chunk| chunk.num_rows()).sum(); let options = arrow::ipc::writer::IpcWriteOptions::default(); let message = SchemaAsIpc::new(&flight.schema, &options) @@ -224,8 +219,7 @@ impl FlightService for FlightServiceImpl { .clone() .ok_or_else(|| Status::invalid_argument("Must have a descriptor"))?; - if descriptor.r#type != DescriptorType::Path as i32 || descriptor.path.is_empty() - { + if descriptor.r#type != DescriptorType::Path as i32 || descriptor.path.is_empty() { return Err(Status::invalid_argument("Must specify a path")); } @@ -297,9 +291,9 @@ async fn record_batch_from_message( schema_ref: SchemaRef, dictionaries_by_id: &HashMap, ) -> Result { - let ipc_batch = message.header_as_record_batch().ok_or_else(|| { - Status::internal("Could not parse message header as record batch") - })?; + let ipc_batch = message + 
.header_as_record_batch() + .ok_or_else(|| Status::internal("Could not parse message header as record batch"))?; let arrow_batch_result = reader::read_record_batch( data_body, @@ -320,9 +314,9 @@ async fn dictionary_from_message( schema_ref: SchemaRef, dictionaries_by_id: &mut HashMap, ) -> Result<(), Status> { - let ipc_batch = message.header_as_dictionary_batch().ok_or_else(|| { - Status::internal("Could not parse message header as dictionary batch") - })?; + let ipc_batch = message + .header_as_dictionary_batch() + .ok_or_else(|| Status::internal("Could not parse message header as dictionary batch"))?; let dictionary_batch_result = reader::read_dictionary( data_body, diff --git a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs index 9b1c84b57119..68d871b528a6 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/middleware.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/middleware.rs @@ -19,9 +19,9 @@ use std::pin::Pin; use arrow_flight::{ flight_descriptor::DescriptorType, flight_service_server::FlightService, - flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, - FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, - PutResult, SchemaResult, Ticket, + flight_service_server::FlightServiceServer, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, + Ticket, }; use futures::Stream; use tonic::{transport::Server, Request, Response, Status, Streaming}; @@ -93,8 +93,7 @@ impl FlightService for MiddlewareScenarioImpl { let descriptor = request.into_inner(); - if descriptor.r#type == DescriptorType::Cmd as i32 - && descriptor.cmd.as_ref() == b"success" + if descriptor.r#type == DescriptorType::Cmd as i32 && descriptor.cmd.as_ref() == b"success" { // Return a fake location - the test doesn't read it let endpoint = super::endpoint("foo", "grpc+tcp://localhost:10010"); diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index fe0cc68a4205..553e69b0a1a0 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -19,14 +19,20 @@ use serde_json::Value; -use arrow::datatypes::Schema; -use arrow::error::Result; +use arrow::array::{Array, StructArray}; +use arrow::datatypes::{DataType, Field, Fields, Schema}; +use arrow::error::{ArrowError, Result}; +use arrow::ffi::{from_ffi_and_data_type, FFI_ArrowArray, FFI_ArrowSchema}; use arrow::record_batch::RecordBatch; use arrow::util::test_util::arrow_test_data; use arrow_integration_test::*; use std::collections::HashMap; +use std::ffi::{c_int, CStr, CString}; use std::fs::File; use std::io::BufReader; +use std::iter::zip; +use std::ptr; +use std::sync::Arc; /// The expected username for the basic auth integration test. 
pub const AUTH_USERNAME: &str = "arrow"; @@ -40,11 +46,68 @@ pub struct ArrowFile { pub schema: Schema, // we can evolve this into a concrete Arrow type // this is temporarily not being read from - pub _dictionaries: HashMap, - pub batches: Vec, + dictionaries: HashMap, + arrow_json: Value, } -pub fn read_json_file(json_name: &str) -> Result { +impl ArrowFile { + pub fn read_batch(&self, batch_num: usize) -> Result { + let b = self.arrow_json["batches"].get(batch_num).unwrap(); + let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); + record_batch_from_json(&self.schema, json_batch, Some(&self.dictionaries)) + } + + pub fn read_batches(&self) -> Result> { + self.arrow_json["batches"] + .as_array() + .unwrap() + .iter() + .map(|b| { + let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); + record_batch_from_json(&self.schema, json_batch, Some(&self.dictionaries)) + }) + .collect() + } +} + +// Canonicalize the names of map fields in a schema +pub fn canonicalize_schema(schema: &Schema) -> Schema { + let fields = schema + .fields() + .iter() + .map(|field| match field.data_type() { + DataType::Map(child_field, sorted) => match child_field.data_type() { + DataType::Struct(fields) if fields.len() == 2 => { + let first_field = fields.get(0).unwrap(); + let key_field = + Arc::new(Field::new("key", first_field.data_type().clone(), false)); + let second_field = fields.get(1).unwrap(); + let value_field = Arc::new(Field::new( + "value", + second_field.data_type().clone(), + second_field.is_nullable(), + )); + + let fields = Fields::from([key_field, value_field]); + let struct_type = DataType::Struct(fields); + let child_field = Field::new("entries", struct_type, false); + + Arc::new(Field::new( + field.name().as_str(), + DataType::Map(Arc::new(child_field), *sorted), + field.is_nullable(), + )) + } + _ => panic!("The child field of Map type should be Struct type with 2 fields."), + }, + _ => field.clone(), + }) + .collect::(); + + Schema::new(fields).with_metadata(schema.metadata().clone()) +} + +pub fn open_json_file(json_name: &str) -> Result { let json_file = File::open(json_name)?; let reader = BufReader::new(json_file); let arrow_json: Value = serde_json::from_reader(reader).unwrap(); @@ -56,23 +119,16 @@ pub fn read_json_file(json_name: &str) -> Result { .as_array() .expect("Unable to get dictionaries as array") { - let json_dict: ArrowJsonDictionaryBatch = serde_json::from_value(d.clone()) - .expect("Unable to get dictionary from JSON"); + let json_dict: ArrowJsonDictionaryBatch = + serde_json::from_value(d.clone()).expect("Unable to get dictionary from JSON"); // TODO: convert to a concrete Arrow type dictionaries.insert(json_dict.id, json_dict); } } - - let mut batches = vec![]; - for b in arrow_json["batches"].as_array().unwrap() { - let json_batch: ArrowJsonBatch = serde_json::from_value(b.clone()).unwrap(); - let batch = record_batch_from_json(&schema, json_batch, Some(&dictionaries))?; - batches.push(batch); - } Ok(ArrowFile { schema, - _dictionaries: dictionaries, - batches, + dictionaries, + arrow_json, }) } @@ -100,3 +156,147 @@ pub fn read_gzip_json(version: &str, path: &str) -> ArrowJson { let arrow_json: ArrowJson = serde_json::from_str(&s).unwrap(); arrow_json } + +// +// C Data Integration entrypoints +// + +fn cdata_integration_export_schema_from_json( + c_json_name: *const i8, + out: *mut FFI_ArrowSchema, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let f = open_json_file(json_name.to_str()?)?; + 
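// FFI_ArrowSchema::try_from(&Schema) builds a C-ABI ArrowSchema that carries
// its own release callback; the ptr::write just below moves it into the
// caller-provided slot without dropping whatever uninitialized memory sits
// behind `out`, so ownership transfers across the FFI boundary and releasing
// the schema becomes the caller's responsibility.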
let c_schema = FFI_ArrowSchema::try_from(&f.schema)?; + // Move exported schema into output struct + unsafe { ptr::write(out, c_schema) }; + Ok(()) +} + +fn cdata_integration_export_batch_from_json( + c_json_name: *const i8, + batch_num: c_int, + out: *mut FFI_ArrowArray, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let b = open_json_file(json_name.to_str()?)?.read_batch(batch_num.try_into().unwrap())?; + let a = StructArray::from(b).into_data(); + let c_array = FFI_ArrowArray::new(&a); + // Move exported array into output struct + unsafe { ptr::write(out, c_array) }; + Ok(()) +} + +fn cdata_integration_import_schema_and_compare_to_json( + c_json_name: *const i8, + c_schema: *mut FFI_ArrowSchema, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let json_schema = open_json_file(json_name.to_str()?)?.schema; + + // The source ArrowSchema will be released when this is dropped + let imported_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema) }; + let imported_schema = Schema::try_from(&imported_schema)?; + + // compare schemas + if canonicalize_schema(&json_schema) != canonicalize_schema(&imported_schema) { + return Err(ArrowError::ComputeError(format!( + "Schemas do not match.\n- JSON: {:?}\n- Imported: {:?}", + json_schema, imported_schema + ))); + } + Ok(()) +} + +fn compare_batches(a: &RecordBatch, b: &RecordBatch) -> Result<()> { + if a.num_columns() != b.num_columns() { + return Err(ArrowError::InvalidArgumentError( + "batches do not have the same number of columns".to_string(), + )); + } + for (a_column, b_column) in zip(a.columns(), b.columns()) { + if a_column != b_column { + return Err(ArrowError::InvalidArgumentError( + "batch columns are not the same".to_string(), + )); + } + } + Ok(()) +} + +fn cdata_integration_import_batch_and_compare_to_json( + c_json_name: *const i8, + batch_num: c_int, + c_array: *mut FFI_ArrowArray, +) -> Result<()> { + let json_name = unsafe { CStr::from_ptr(c_json_name) }; + let json_batch = + open_json_file(json_name.to_str()?)?.read_batch(batch_num.try_into().unwrap())?; + let schema = json_batch.schema(); + + let data_type_for_import = DataType::Struct(schema.fields.clone()); + let imported_array = unsafe { FFI_ArrowArray::from_raw(c_array) }; + let imported_array = unsafe { from_ffi_and_data_type(imported_array, data_type_for_import) }?; + imported_array.validate_full()?; + let imported_batch = RecordBatch::from(StructArray::from(imported_array)); + + compare_batches(&json_batch, &imported_batch) +} + +// If Result is an error, then export a const char* to its string display, otherwise NULL +fn result_to_c_error(result: &std::result::Result) -> *mut i8 { + match result { + Ok(_) => ptr::null_mut(), + Err(e) => CString::new(format!("{}", e)).unwrap().into_raw(), + } +} + +/// Release a const char* exported by result_to_c_error() +/// +/// # Safety +/// +/// The pointer is assumed to have been obtained using CString::into_raw. 
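// A hypothetical Rust-side round trip through the exported entrypoints below:
// each arrow_rs_* function returns NULL on success, or a heap-allocated error
// string that must be handed back to arrow_rs_free_error exactly once. The
// function name and path handling are illustrative.
fn schema_round_trip_sketch(json_path: &str) {
    let c_path = std::ffi::CString::new(json_path).unwrap();
    let mut c_schema = FFI_ArrowSchema::empty();
    let err = arrow_rs_cdata_integration_export_schema_from_json(
        c_path.as_ptr() as *const i8,
        &mut c_schema,
    );
    assert!(err.is_null(), "export failed");
    // Importing consumes the exported schema (from_raw leaves it empty) and
    // compares it against the JSON definition again.
    let err = arrow_rs_cdata_integration_import_schema_and_compare_to_json(
        c_path.as_ptr() as *const i8,
        &mut c_schema,
    );
    assert!(err.is_null(), "schemas did not match");
}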
+#[no_mangle] +pub unsafe extern "C" fn arrow_rs_free_error(c_error: *mut i8) { + if !c_error.is_null() { + drop(unsafe { CString::from_raw(c_error) }); + } +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_export_schema_from_json( + c_json_name: *const i8, + out: *mut FFI_ArrowSchema, +) -> *mut i8 { + let r = cdata_integration_export_schema_from_json(c_json_name, out); + result_to_c_error(&r) +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_import_schema_and_compare_to_json( + c_json_name: *const i8, + c_schema: *mut FFI_ArrowSchema, +) -> *mut i8 { + let r = cdata_integration_import_schema_and_compare_to_json(c_json_name, c_schema); + result_to_c_error(&r) +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_export_batch_from_json( + c_json_name: *const i8, + batch_num: c_int, + out: *mut FFI_ArrowArray, +) -> *mut i8 { + let r = cdata_integration_export_batch_from_json(c_json_name, batch_num, out); + result_to_c_error(&r) +} + +#[no_mangle] +pub extern "C" fn arrow_rs_cdata_integration_import_batch_and_compare_to_json( + c_json_name: *const i8, + batch_num: c_int, + c_array: *mut FFI_ArrowArray, +) -> *mut i8 { + let r = cdata_integration_import_batch_and_compare_to_json(c_json_name, batch_num, c_array); + result_to_c_error(&r) +} diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index 696ab6e6053a..11b8fa84534e 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -63,9 +63,7 @@ fn read_1_0_0_bigendian_decimal_should_panic() { } #[test] -#[should_panic( - expected = "Last offset 687865856 of Utf8 is larger than values length 41" -)] +#[should_panic(expected = "Last offset 687865856 of Utf8 is larger than values length 41")] fn read_1_0_0_bigendian_dictionary_should_panic() { // The offsets are not translated for big-endian files // https://github.com/apache/arrow-rs/issues/859 @@ -160,8 +158,7 @@ fn read_2_0_0_compression() { /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn verify_arrow_file(testdata: &str, version: &str, path: &str) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); println!("Verifying {filename}"); // Compare contents to the expected output format in JSON @@ -197,8 +194,7 @@ fn verify_arrow_file(testdata: &str, version: &str, path: &str) { /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn verify_arrow_stream(testdata: &str, version: &str, path: &str) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); println!("Verifying {filename}"); // Compare contents to the expected output format in JSON diff --git a/arrow-integration-testing/tests/ipc_writer.rs b/arrow-integration-testing/tests/ipc_writer.rs index 11707d935540..d780eb2ee0b5 100644 --- a/arrow-integration-testing/tests/ipc_writer.rs +++ b/arrow-integration-testing/tests/ipc_writer.rs @@ -113,12 +113,7 @@ fn write_2_0_0_compression() { for options in &all_options { println!("Using options {options:?}"); roundtrip_arrow_file_with_options(&testdata, version, path, options.clone()); - roundtrip_arrow_stream_with_options( - &testdata, - version, - path, - options.clone(), - ); + roundtrip_arrow_stream_with_options(&testdata, 
version, path, options.clone()); } }); } @@ -143,8 +138,7 @@ fn roundtrip_arrow_file_with_options( path: &str, options: IpcWriteOptions, ) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.arrow_file"); println!("Verifying {filename}"); let mut tempfile = tempfile::tempfile().unwrap(); @@ -156,12 +150,8 @@ fn roundtrip_arrow_file_with_options( // read and rewrite the file to a temp location { - let mut writer = FileWriter::try_new_with_options( - &mut tempfile, - &reader.schema(), - options, - ) - .unwrap(); + let mut writer = + FileWriter::try_new_with_options(&mut tempfile, &reader.schema(), options).unwrap(); while let Some(Ok(batch)) = reader.next() { writer.write(&batch).unwrap(); } @@ -207,12 +197,7 @@ fn roundtrip_arrow_file_with_options( /// Verification json file /// `arrow-ipc-stream/integration//.json.gz fn roundtrip_arrow_stream(testdata: &str, version: &str, path: &str) { - roundtrip_arrow_stream_with_options( - testdata, - version, - path, - IpcWriteOptions::default(), - ) + roundtrip_arrow_stream_with_options(testdata, version, path, IpcWriteOptions::default()) } fn roundtrip_arrow_stream_with_options( @@ -221,8 +206,7 @@ fn roundtrip_arrow_stream_with_options( path: &str, options: IpcWriteOptions, ) { - let filename = - format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); + let filename = format!("{testdata}/arrow-ipc-stream/integration/{version}/{path}.stream"); println!("Verifying {filename}"); let mut tempfile = tempfile::tempfile().unwrap(); @@ -234,12 +218,9 @@ fn roundtrip_arrow_stream_with_options( // read and rewrite the file to a temp location { - let mut writer = StreamWriter::try_new_with_options( - &mut tempfile, - &reader.schema(), - options, - ) - .unwrap(); + let mut writer = + StreamWriter::try_new_with_options(&mut tempfile, &reader.schema(), options) + .unwrap(); while let Some(Ok(batch)) = reader.next() { writer.write(&batch).unwrap(); } diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index a03f53d6641c..83ad044d25e7 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -40,8 +40,12 @@ arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } flatbuffers = { version = "23.1.21", default-features = false } -lz4 = { version = "1.23", default-features = false, optional = true } -zstd = { version = "0.12.0", default-features = false, optional = true } +lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } +zstd = { version = "0.13.0", default-features = false, optional = true } + +[features] +default = [] +lz4 = ["lz4_flex"] [dev-dependencies] tempfile = "3.3" diff --git a/arrow-ipc/src/compression.rs b/arrow-ipc/src/compression.rs index db05e9a6a6c6..0d8b7b4c1bd4 100644 --- a/arrow-ipc/src/compression.rs +++ b/arrow-ipc/src/compression.rs @@ -90,10 +90,7 @@ impl CompressionCodec { /// [8 bytes]: uncompressed length /// [remaining bytes]: compressed data stream /// ``` - pub(crate) fn decompress_to_buffer( - &self, - input: &Buffer, - ) -> Result { + pub(crate) fn decompress_to_buffer(&self, input: &Buffer) -> Result { // read the first 8 bytes to determine if the data is // compressed let decompressed_length = read_uncompressed_size(input); @@ -103,13 +100,15 @@ impl CompressionCodec { } else if decompressed_length == LENGTH_NO_COMPRESSED_DATA { // no compression 
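// (Recap of the prefix convention handled in this branch and the ones around
// it: every IPC compressed buffer begins with a little-endian i64 where
// LENGTH_NO_COMPRESSED_DATA (-1) marks a body stored uncompressed, 0 marks an
// empty buffer, and any other value is the decompressed byte length that the
// new usize::try_from guard below validates before allocating.)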
             input.slice(LENGTH_OF_PREFIX_DATA as usize)
-        } else {
+        } else if let Ok(decompressed_length) = usize::try_from(decompressed_length) {
             // decompress data using the codec
-            let mut uncompressed_buffer =
-                Vec::with_capacity(decompressed_length as usize);
             let input_data = &input[(LENGTH_OF_PREFIX_DATA as usize)..];
-            self.decompress(input_data, &mut uncompressed_buffer)?;
-            Buffer::from(uncompressed_buffer)
+            self.decompress(input_data, decompressed_length as _)?
+                .into()
+        } else {
+            return Err(ArrowError::IpcError(format!(
+                "Invalid uncompressed length: {decompressed_length}"
+            )));
         };
         Ok(buffer)
     }
@@ -125,24 +124,29 @@ impl CompressionCodec {
     /// Decompress the data in input buffer and write to output buffer
     /// using the specified compression
-    fn decompress(
-        &self,
-        input: &[u8],
-        output: &mut Vec<u8>,
-    ) -> Result<usize, ArrowError> {
-        match self {
-            CompressionCodec::Lz4Frame => decompress_lz4(input, output),
-            CompressionCodec::Zstd => decompress_zstd(input, output),
+    fn decompress(&self, input: &[u8], decompressed_size: usize) -> Result<Vec<u8>, ArrowError> {
+        let ret = match self {
+            CompressionCodec::Lz4Frame => decompress_lz4(input, decompressed_size)?,
+            CompressionCodec::Zstd => decompress_zstd(input, decompressed_size)?,
+        };
+        if ret.len() != decompressed_size {
+            return Err(ArrowError::IpcError(format!(
+                "Expected decompressed length of {decompressed_size} got {}",
+                ret.len()
+            )));
         }
+        Ok(ret)
     }
 }

 #[cfg(feature = "lz4")]
 fn compress_lz4(input: &[u8], output: &mut Vec<u8>) -> Result<(), ArrowError> {
     use std::io::Write;
-    let mut encoder = lz4::EncoderBuilder::new().build(output)?;
+    let mut encoder = lz4_flex::frame::FrameEncoder::new(output);
     encoder.write_all(input)?;
-    encoder.finish().1?;
+    encoder
+        .finish()
+        .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
     Ok(())
 }
@@ -155,14 +159,16 @@ fn compress_lz4(_input: &[u8], _output: &mut Vec<u8>) -> Result<(), ArrowError>
 }

 #[cfg(feature = "lz4")]
-fn decompress_lz4(input: &[u8], output: &mut Vec<u8>) -> Result<usize, ArrowError> {
+fn decompress_lz4(input: &[u8], decompressed_size: usize) -> Result<Vec<u8>, ArrowError> {
     use std::io::Read;
-    Ok(lz4::Decoder::new(input)?.read_to_end(output)?)
+    let mut output = Vec::with_capacity(decompressed_size);
+    lz4_flex::frame::FrameDecoder::new(input).read_to_end(&mut output)?;
+    Ok(output)
 }

 #[cfg(not(feature = "lz4"))]
 #[allow(clippy::ptr_arg)]
-fn decompress_lz4(_input: &[u8], _output: &mut Vec<u8>) -> Result<usize, ArrowError> {
+fn decompress_lz4(_input: &[u8], _decompressed_size: usize) -> Result<Vec<u8>, ArrowError> {
     Err(ArrowError::InvalidArgumentError(
         "lz4 IPC decompression requires the lz4 feature".to_string(),
     ))
@@ -186,14 +192,16 @@ fn compress_zstd(_input: &[u8], _output: &mut Vec<u8>) -> Result<(), ArrowError>
 }

 #[cfg(feature = "zstd")]
-fn decompress_zstd(input: &[u8], output: &mut Vec<u8>) -> Result<usize, ArrowError> {
+fn decompress_zstd(input: &[u8], decompressed_size: usize) -> Result<Vec<u8>, ArrowError> {
     use std::io::Read;
-    Ok(zstd::Decoder::new(input)?.read_to_end(output)?)
+ let mut output = Vec::with_capacity(decompressed_size); + zstd::Decoder::with_buffer(input)?.read_to_end(&mut output)?; + Ok(output) } #[cfg(not(feature = "zstd"))] #[allow(clippy::ptr_arg)] -fn decompress_zstd(_input: &[u8], _output: &mut Vec) -> Result { +fn decompress_zstd(_input: &[u8], _decompressed_size: usize) -> Result, ArrowError> { Err(ArrowError::InvalidArgumentError( "zstd IPC decompression requires the zstd feature".to_string(), )) @@ -216,28 +224,26 @@ mod tests { #[test] #[cfg(feature = "lz4")] fn test_lz4_compression() { - let input_bytes = "hello lz4".as_bytes(); + let input_bytes = b"hello lz4"; let codec = super::CompressionCodec::Lz4Frame; let mut output_bytes: Vec = Vec::new(); codec.compress(input_bytes, &mut output_bytes).unwrap(); - let mut result_output_bytes: Vec = Vec::new(); - codec - .decompress(output_bytes.as_slice(), &mut result_output_bytes) + let result = codec + .decompress(output_bytes.as_slice(), input_bytes.len()) .unwrap(); - assert_eq!(input_bytes, result_output_bytes.as_slice()); + assert_eq!(input_bytes, result.as_slice()); } #[test] #[cfg(feature = "zstd")] fn test_zstd_compression() { - let input_bytes = "hello zstd".as_bytes(); + let input_bytes = b"hello zstd"; let codec = super::CompressionCodec::Zstd; let mut output_bytes: Vec = Vec::new(); codec.compress(input_bytes, &mut output_bytes).unwrap(); - let mut result_output_bytes: Vec = Vec::new(); - codec - .decompress(output_bytes.as_slice(), &mut result_output_bytes) + let result = codec + .decompress(output_bytes.as_slice(), input_bytes.len()) .unwrap(); - assert_eq!(input_bytes, result_output_bytes.as_slice()); + assert_eq!(input_bytes, result.as_slice()); } } diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index a78ccde6e169..b290a09acf5d 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -18,9 +18,7 @@ //! 
Utilities for converting between IPC types and native Arrow types use arrow_schema::*; -use flatbuffers::{ - FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, -}; +use flatbuffers::{FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset}; use std::collections::HashMap; use std::sync::Arc; @@ -186,16 +184,11 @@ pub fn try_schema_from_ipc_buffer(buffer: &[u8]) -> Result { // buffer 0 }; - let msg = - size_prefixed_root_as_message(&buffer[begin_offset..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to convert flight info to a message: {err}" - )) - })?; + let msg = size_prefixed_root_as_message(&buffer[begin_offset..]).map_err(|err| { + ArrowError::ParseError(format!("Unable to convert flight info to a message: {err}")) + })?; let ipc_schema = msg.header_as_schema().ok_or_else(|| { - ArrowError::ParseError( - "Unable to convert flight info to a schema".to_string(), - ) + ArrowError::ParseError("Unable to convert flight info to a schema".to_string()) })?; Ok(fb_to_schema(ipc_schema)) } else { @@ -277,15 +270,9 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat let time = field.type_as_time().unwrap(); match (time.bitWidth(), time.unit()) { (32, crate::TimeUnit::SECOND) => DataType::Time32(TimeUnit::Second), - (32, crate::TimeUnit::MILLISECOND) => { - DataType::Time32(TimeUnit::Millisecond) - } - (64, crate::TimeUnit::MICROSECOND) => { - DataType::Time64(TimeUnit::Microsecond) - } - (64, crate::TimeUnit::NANOSECOND) => { - DataType::Time64(TimeUnit::Nanosecond) - } + (32, crate::TimeUnit::MILLISECOND) => DataType::Time32(TimeUnit::Millisecond), + (64, crate::TimeUnit::MICROSECOND) => DataType::Time64(TimeUnit::Microsecond), + (64, crate::TimeUnit::NANOSECOND) => DataType::Time64(TimeUnit::Nanosecond), z => panic!( "Time type with bit width of {} and unit of {:?} not supported", z.0, z.1 @@ -296,30 +283,22 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat let timestamp = field.type_as_timestamp().unwrap(); let timezone: Option<_> = timestamp.timezone().map(|tz| tz.into()); match timestamp.unit() { - crate::TimeUnit::SECOND => { - DataType::Timestamp(TimeUnit::Second, timezone) - } + crate::TimeUnit::SECOND => DataType::Timestamp(TimeUnit::Second, timezone), crate::TimeUnit::MILLISECOND => { DataType::Timestamp(TimeUnit::Millisecond, timezone) } crate::TimeUnit::MICROSECOND => { DataType::Timestamp(TimeUnit::Microsecond, timezone) } - crate::TimeUnit::NANOSECOND => { - DataType::Timestamp(TimeUnit::Nanosecond, timezone) - } + crate::TimeUnit::NANOSECOND => DataType::Timestamp(TimeUnit::Nanosecond, timezone), z => panic!("Timestamp type with unit of {z:?} not supported"), } } crate::Type::Interval => { let interval = field.type_as_interval().unwrap(); match interval.unit() { - crate::IntervalUnit::YEAR_MONTH => { - DataType::Interval(IntervalUnit::YearMonth) - } - crate::IntervalUnit::DAY_TIME => { - DataType::Interval(IntervalUnit::DayTime) - } + crate::IntervalUnit::YEAR_MONTH => DataType::Interval(IntervalUnit::YearMonth), + crate::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime), crate::IntervalUnit::MONTH_DAY_NANO => { DataType::Interval(IntervalUnit::MonthDayNano) } @@ -775,8 +754,8 @@ pub(crate) fn get_fb_field_type<'a>( UnionMode::Dense => crate::UnionMode::Dense, }; - let fbb_type_ids = fbb - .create_vector(&fields.iter().map(|(t, _)| t as i32).collect::>()); + let fbb_type_ids = + fbb.create_vector(&fields.iter().map(|(t, _)| t as i32).collect::>()); 
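// (The type-id vector is created before UnionBuilder::new opens the Union
// table because flatbuffers forbids creating vectors while a table is under
// construction; nested objects must be finished before their parent starts.)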
let mut builder = crate::UnionBuilder::new(fbb); builder.add_mode(union_mode); builder.add_typeIds(fbb_type_ids); @@ -872,10 +851,7 @@ mod tests { ), Field::new( "timestamp[us]", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".into()), - ), + DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())), false, ), Field::new( @@ -900,11 +876,7 @@ mod tests { ), Field::new("utf8", DataType::Utf8, false), Field::new("binary", DataType::Binary, false), - Field::new_list( - "list[u8]", - Field::new("item", DataType::UInt8, false), - true, - ), + Field::new_list("list[u8]", Field::new("item", DataType::UInt8, false), true), Field::new_list( "list[struct]", Field::new_struct( @@ -1013,20 +985,14 @@ mod tests { ), Field::new_dict( "dictionary", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), true, 123, true, ), Field::new_dict( "dictionary", - DataType::Dictionary( - Box::new(DataType::UInt8), - Box::new(DataType::UInt32), - ), + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), true, 123, true, @@ -1056,20 +1022,18 @@ mod tests { // # stripping continuation & length prefix & suffix bytes to get only schema bytes // [x for x in sink.getvalue().to_pybytes()][8:-8] let bytes: Vec = vec![ - 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 4, 0, - 12, 0, 0, 0, 8, 0, 8, 0, 0, 0, 4, 0, 8, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 20, - 0, 0, 0, 16, 0, 20, 0, 8, 0, 0, 0, 7, 0, 12, 0, 0, 0, 16, 0, 16, 0, 0, 0, 0, - 0, 0, 2, 16, 0, 0, 0, 32, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 102, - 105, 101, 108, 100, 49, 0, 0, 0, 0, 6, 0, 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, - 0, + 16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 4, 0, 12, 0, 0, + 0, 8, 0, 8, 0, 0, 0, 4, 0, 8, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 20, 0, 0, 0, 16, 0, 20, + 0, 8, 0, 0, 0, 7, 0, 12, 0, 0, 0, 16, 0, 16, 0, 0, 0, 0, 0, 0, 2, 16, 0, 0, 0, 32, 0, + 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, 0, 0, 0, 0, 6, + 0, 8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, ]; let ipc = crate::root_as_message(&bytes).unwrap(); let schema = ipc.header_as_schema().unwrap(); // generate same message with Rust let data_gen = crate::writer::IpcDataGenerator::default(); - let arrow_schema = - Schema::new(vec![Field::new("field1", DataType::UInt32, false)]); + let arrow_schema = Schema::new(vec![Field::new("field1", DataType::UInt32, false)]); let bytes = data_gen .schema_to_bytes(&arrow_schema, &crate::writer::IpcWriteOptions::default()) .ipc_message; diff --git a/arrow-ipc/src/gen/File.rs b/arrow-ipc/src/gen/File.rs index 0e9427813788..c0c2fb183237 100644 --- a/arrow-ipc/src/gen/File.rs +++ b/arrow-ipc/src/gen/File.rs @@ -61,10 +61,7 @@ impl<'b> flatbuffers::Push for Block { type Output = Block; #[inline] unsafe fn push(&self, dst: &mut [u8], _written_len: usize) { - let src = ::core::slice::from_raw_parts( - self as *const Block as *const u8, - Self::size(), - ); + let src = ::core::slice::from_raw_parts(self as *const Block as *const u8, Self::size()); dst.copy_from_slice(src); } } @@ -307,11 +304,7 @@ impl flatbuffers::Verifiable for Footer<'_> { use flatbuffers::Verifiable; v.visit_table(pos)? .visit_field::("version", Self::VT_VERSION, false)? - .visit_field::>( - "schema", - Self::VT_SCHEMA, - false, - )? + .visit_field::>("schema", Self::VT_SCHEMA, false)? 
.visit_field::>>( "dictionaries", Self::VT_DICTIONARIES, @@ -335,9 +328,7 @@ pub struct FooterArgs<'a> { pub dictionaries: Option>>, pub recordBatches: Option>>, pub custom_metadata: Option< - flatbuffers::WIPOffset< - flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset>>, - >, + flatbuffers::WIPOffset>>>, >, } impl<'a> Default for FooterArgs<'a> { @@ -360,39 +351,29 @@ pub struct FooterBuilder<'a: 'b, 'b> { impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { #[inline] pub fn add_version(&mut self, version: MetadataVersion) { - self.fbb_.push_slot::( - Footer::VT_VERSION, - version, - MetadataVersion::V1, - ); + self.fbb_ + .push_slot::(Footer::VT_VERSION, version, MetadataVersion::V1); } #[inline] pub fn add_schema(&mut self, schema: flatbuffers::WIPOffset>) { self.fbb_ - .push_slot_always::>( - Footer::VT_SCHEMA, - schema, - ); + .push_slot_always::>(Footer::VT_SCHEMA, schema); } #[inline] pub fn add_dictionaries( &mut self, dictionaries: flatbuffers::WIPOffset>, ) { - self.fbb_.push_slot_always::>( - Footer::VT_DICTIONARIES, - dictionaries, - ); + self.fbb_ + .push_slot_always::>(Footer::VT_DICTIONARIES, dictionaries); } #[inline] pub fn add_recordBatches( &mut self, recordBatches: flatbuffers::WIPOffset>, ) { - self.fbb_.push_slot_always::>( - Footer::VT_RECORDBATCHES, - recordBatches, - ); + self.fbb_ + .push_slot_always::>(Footer::VT_RECORDBATCHES, recordBatches); } #[inline] pub fn add_custom_metadata( @@ -407,9 +388,7 @@ impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { ); } #[inline] - pub fn new( - _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>, - ) -> FooterBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FooterBuilder<'a, 'b> { let start = _fbb.start_table(); FooterBuilder { fbb_: _fbb, @@ -451,9 +430,7 @@ pub fn root_as_footer(buf: &[u8]) -> Result Result { +pub fn size_prefixed_root_as_footer(buf: &[u8]) -> Result { flatbuffers::size_prefixed_root::
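// A hypothetical sketch of feeding root_as_footer above from the bytes of an
// Arrow IPC file: the Footer flatbuffer sits immediately before the trailing
// 10 bytes (a little-endian i32 footer length followed by the 6-byte "ARROW1"
// magic). Bounds checks are omitted for brevity.
fn footer_from_file_bytes(
    file_bytes: &[u8],
) -> Result<crate::Footer<'_>, flatbuffers::InvalidFlatbuffer> {
    let trailer = file_bytes.len() - 10;
    let len_bytes: [u8; 4] = file_bytes[trailer..trailer + 4].try_into().unwrap();
    let footer_len = u32::from_le_bytes(len_bytes) as usize;
    crate::root_as_footer(&file_bytes[trailer - footer_len..trailer])
}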