Skip to content

Commit

Permalink
Merge branch 'main' into pg-like-operators
Browse files Browse the repository at this point in the history
  • Loading branch information
gruuya committed Jan 26, 2024
2 parents d2717ba + 8a4bad4 commit 1de01f2
Show file tree
Hide file tree
Showing 102 changed files with 3,442 additions and 2,033 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ jobs:
# Verify MSRV for the crates which are directly used by other projects.
msrv:
name: Verify MSRV
name: Verify MSRV (Min Supported Rust Version)
runs-on: ubuntu-latest
container:
image: amd64/rust
Expand All @@ -500,7 +500,13 @@ jobs:
run: cargo install cargo-msrv
- name: Check datafusion
working-directory: datafusion/core
run: cargo msrv verify
run: |
# If you encounter an error with any of the commands below
# it means some crate in your dependency tree has a higher
# MSRV (Min Supported Rust Version) than the one specified
# in the `rust-version` key of `Cargo.toml`. Check your
# dependencies or update the version in `Cargo.toml`
cargo msrv verify
- name: Check datafusion-substrait
working-directory: datafusion/substrait
run: cargo msrv verify
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ parquet = { version = "50.0.0", default-features = false, features = ["arrow", "
rand = "0.8"
rstest = "0.18.0"
serde_json = "1"
sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs", rev = "c7d2903c6d1d1a98e6e6203c5cfd54acb9aeca16", features = ["visitor"] }
sqlparser = { version = "0.43.0", features = ["visitor"] }
tempfile = "3"
thiserror = "1.0.44"
url = "2.2"
Expand Down
177 changes: 167 additions & 10 deletions benchmarks/queries/clickbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,180 @@ ClickBench is focused on aggregation and filtering performance (though it has no
[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql

## "Extended" Queries
The "extended" queries are not part of the official ClickBench benchmark.
Instead they are used to test other DataFusion features that are not
covered by the standard benchmark

Each description below is for the corresponding line in `extended.sql` (line 1
is `Q0`, line 2 is `Q1`, etc.)
The "extended" queries are not part of the official ClickBench benchmark.
Instead they are used to test other DataFusion features that are not covered by
the standard benchmark Each description below is for the corresponding line in
`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.)

### Q0: Data Exploration

**Question**: "How many distinct searches, mobile phones, and mobile phone models are there in the dataset?"

**Important Query Properties**: multiple `COUNT DISTINCT`s, with low and high cardinality
distinct string columns.

```sql
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel")
FROM hits;
```


### Q1: Data Exploration

**Question**: "How many distinct "hit color", "browser country" and "language" are there in the dataset?"

**Important Query Properties**: multiple `COUNT DISTINCT`s. All three are small strings (length either 1 or 2).

### Q0
Models initial Data exploration, to understand some statistics of data.
Import Query Properties: multiple `COUNT DISTINCT` on strings

```sql
SELECT
COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel")
SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage")
FROM hits;
```

### Q2: Top 10 anaylsis

**Question**: "Find the top 10 "browser country" by number of distinct "social network"s,
including the distinct counts of "hit color", "browser language",
and "social action"."

**Important Query Properties**: GROUP BY short, string, multiple `COUNT DISTINCT`s. There are several small strings (length either 1 or 2).

```sql
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction")
FROM hits
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
```


## Data Notes

Here are some interesting statistics about the data used in the queries
Max length of `"SearchPhrase"` is 1113 characters
```sql
select min(length("SearchPhrase")) as "SearchPhrase_len_min", max(length("SearchPhrase")) "SearchPhrase_len_max" from 'hits.parquet' limit 10;
+----------------------+----------------------+
| SearchPhrase_len_min | SearchPhrase_len_max |
+----------------------+----------------------+
| 0 | 1113 |
+----------------------+----------------------+
```


Here is the schema of the data
```sql
❯ describe 'hits.parquet';
+-----------------------+-----------+-------------+
| column_name | data_type | is_nullable |
+-----------------------+-----------+-------------+
| WatchID | Int64 | NO |
| JavaEnable | Int16 | NO |
| Title | Utf8 | NO |
| GoodEvent | Int16 | NO |
| EventTime | Int64 | NO |
| EventDate | UInt16 | NO |
| CounterID | Int32 | NO |
| ClientIP | Int32 | NO |
| RegionID | Int32 | NO |
| UserID | Int64 | NO |
| CounterClass | Int16 | NO |
| OS | Int16 | NO |
| UserAgent | Int16 | NO |
| URL | Utf8 | NO |
| Referer | Utf8 | NO |
| IsRefresh | Int16 | NO |
| RefererCategoryID | Int16 | NO |
| RefererRegionID | Int32 | NO |
| URLCategoryID | Int16 | NO |
| URLRegionID | Int32 | NO |
| ResolutionWidth | Int16 | NO |
| ResolutionHeight | Int16 | NO |
| ResolutionDepth | Int16 | NO |
| FlashMajor | Int16 | NO |
| FlashMinor | Int16 | NO |
| FlashMinor2 | Utf8 | NO |
| NetMajor | Int16 | NO |
| NetMinor | Int16 | NO |
| UserAgentMajor | Int16 | NO |
| UserAgentMinor | Utf8 | NO |
| CookieEnable | Int16 | NO |
| JavascriptEnable | Int16 | NO |
| IsMobile | Int16 | NO |
| MobilePhone | Int16 | NO |
| MobilePhoneModel | Utf8 | NO |
| Params | Utf8 | NO |
| IPNetworkID | Int32 | NO |
| TraficSourceID | Int16 | NO |
| SearchEngineID | Int16 | NO |
| SearchPhrase | Utf8 | NO |
| AdvEngineID | Int16 | NO |
| IsArtifical | Int16 | NO |
| WindowClientWidth | Int16 | NO |
| WindowClientHeight | Int16 | NO |
| ClientTimeZone | Int16 | NO |
| ClientEventTime | Int64 | NO |
| SilverlightVersion1 | Int16 | NO |
| SilverlightVersion2 | Int16 | NO |
| SilverlightVersion3 | Int32 | NO |
| SilverlightVersion4 | Int16 | NO |
| PageCharset | Utf8 | NO |
| CodeVersion | Int32 | NO |
| IsLink | Int16 | NO |
| IsDownload | Int16 | NO |
| IsNotBounce | Int16 | NO |
| FUniqID | Int64 | NO |
| OriginalURL | Utf8 | NO |
| HID | Int32 | NO |
| IsOldCounter | Int16 | NO |
| IsEvent | Int16 | NO |
| IsParameter | Int16 | NO |
| DontCountHits | Int16 | NO |
| WithHash | Int16 | NO |
| HitColor | Utf8 | NO |
| LocalEventTime | Int64 | NO |
| Age | Int16 | NO |
| Sex | Int16 | NO |
| Income | Int16 | NO |
| Interests | Int16 | NO |
| Robotness | Int16 | NO |
| RemoteIP | Int32 | NO |
| WindowName | Int32 | NO |
| OpenerName | Int32 | NO |
| HistoryLength | Int16 | NO |
| BrowserLanguage | Utf8 | NO |
| BrowserCountry | Utf8 | NO |
| SocialNetwork | Utf8 | NO |
| SocialAction | Utf8 | NO |
| HTTPError | Int16 | NO |
| SendTiming | Int32 | NO |
| DNSTiming | Int32 | NO |
| ConnectTiming | Int32 | NO |
| ResponseStartTiming | Int32 | NO |
| ResponseEndTiming | Int32 | NO |
| FetchTiming | Int32 | NO |
| SocialSourceNetworkID | Int16 | NO |
| SocialSourcePage | Utf8 | NO |
| ParamPrice | Int64 | NO |
| ParamOrderID | Utf8 | NO |
| ParamCurrency | Utf8 | NO |
| ParamCurrencyID | Int16 | NO |
| OpenstatServiceName | Utf8 | NO |
| OpenstatCampaignID | Utf8 | NO |
| OpenstatAdID | Utf8 | NO |
| OpenstatSourceID | Utf8 | NO |
| UTMSource | Utf8 | NO |
| UTMMedium | Utf8 | NO |
| UTMCampaign | Utf8 | NO |
| UTMContent | Utf8 | NO |
| UTMTerm | Utf8 | NO |
| FromTag | Utf8 | NO |
| HasGCLID | Int16 | NO |
| RefererHash | Int64 | NO |
| URLHash | Int64 | NO |
| CLID | Int32 | NO |
+-----------------------+-----------+-------------+
105 rows in set. Query took 0.034 seconds.

```
4 changes: 3 additions & 1 deletion benchmarks/queries/clickbench/extended.sql
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits;
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
21 changes: 11 additions & 10 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ cargo run --example csv_sql
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF)
- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using the to_timestamp functions
- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)

## Distributed
Expand Down
4 changes: 2 additions & 2 deletions datafusion-examples/examples/advanced_udaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ impl Accumulator for GeometricMean {
// This function serializes our state to `ScalarValue`, which DataFusion uses
// to pass this state between execution stages.
// Note that this can be arbitrary data.
fn state(&self) -> Result<Vec<ScalarValue>> {
fn state(&mut self) -> Result<Vec<ScalarValue>> {
Ok(vec![
ScalarValue::from(self.prod),
ScalarValue::from(self.n),
Expand All @@ -134,7 +134,7 @@ impl Accumulator for GeometricMean {

// DataFusion expects this function to return the final value of this aggregator.
// in this case, this is the formula of the geometric mean
fn evaluate(&self) -> Result<ScalarValue> {
fn evaluate(&mut self) -> Result<ScalarValue> {
let value = self.prod.powf(1.0 / self.n as f64);
Ok(ScalarValue::from(value))
}
Expand Down
4 changes: 2 additions & 2 deletions datafusion-examples/examples/simple_udaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ impl Accumulator for GeometricMean {
// This function serializes our state to `ScalarValue`, which DataFusion uses
// to pass this state between execution stages.
// Note that this can be arbitrary data.
fn state(&self) -> Result<Vec<ScalarValue>> {
fn state(&mut self) -> Result<Vec<ScalarValue>> {
Ok(vec![
ScalarValue::from(self.prod),
ScalarValue::from(self.n),
Expand All @@ -81,7 +81,7 @@ impl Accumulator for GeometricMean {

// DataFusion expects this function to return the final value of this aggregator.
// in this case, this is the formula of the geometric mean
fn evaluate(&self) -> Result<ScalarValue> {
fn evaluate(&mut self) -> Result<ScalarValue> {
let value = self.prod.powf(1.0 / self.n as f64);
Ok(ScalarValue::from(value))
}
Expand Down
18 changes: 4 additions & 14 deletions datafusion-examples/examples/simple_udf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion_common::cast::as_float64_array;
use datafusion_expr::ColumnarValue;
use datafusion_physical_expr::functions::columnar_values_to_array;
use std::sync::Arc;

/// create local execution context with an in-memory table:
Expand Down Expand Up @@ -70,22 +71,11 @@ async fn main() -> Result<()> {
// this is guaranteed by DataFusion based on the function's signature.
assert_eq!(args.len(), 2);

// Try to obtain row number
let len = args
.iter()
.fold(Option::<usize>::None, |acc, arg| match arg {
ColumnarValue::Scalar(_) => acc,
ColumnarValue::Array(a) => Some(a.len()),
});

let inferred_length = len.unwrap_or(1);

let arg0 = args[0].clone().into_array(inferred_length)?;
let arg1 = args[1].clone().into_array(inferred_length)?;
let args = columnar_values_to_array(args)?;

// 1. cast both arguments to f64. These casts MUST be aligned with the signature or this function panics!
let base = as_float64_array(&arg0).expect("cast failed");
let exponent = as_float64_array(&arg1).expect("cast failed");
let base = as_float64_array(&args[0]).expect("cast failed");
let exponent = as_float64_array(&args[1]).expect("cast failed");

// this is guaranteed by DataFusion. We place it just to make it obvious.
assert_eq!(exponent.len(), base.len());
Expand Down
Loading

0 comments on commit 1de01f2

Please sign in to comment.