Fix S3 in CLI: Do not normalize options values #13576

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 35 additions & 12 deletions datafusion/common/src/config.rs
@@ -110,7 +110,7 @@ macro_rules! config_namespace {
$vis:vis struct $struct_name:ident {
$(
$(#[doc = $d:tt])*
$field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
$field_vis:vis $field_name:ident : $field_type:ty, $(transform = $tf:expr,)? default = $default:expr
)*$(,)*
}
) => {
@@ -127,9 +127,13 @@
impl ConfigField for $struct_name {
fn set(&mut self, key: &str, value: &str) -> Result<()> {
let (key, rem) = key.split_once('.').unwrap_or((key, ""));

match key {
$(
stringify!($field_name) => self.$field_name.set(rem, value),
stringify!($field_name) => {
$(let value = $tf(value);)?
self.$field_name.set(rem, value.as_ref())
},
)*
_ => return _config_err!(
"Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -212,11 +216,11 @@ config_namespace! {
pub enable_ident_normalization: bool, default = true

/// When set to true, SQL parser will normalize options value (convert value to lowercase)
pub enable_options_value_normalization: bool, default = true
pub enable_options_value_normalization: bool, default = false

/// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
/// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
pub dialect: String, default = "generic".to_string()
pub dialect: String, default = "generic".to_string() // no need to lowercase because this is handled by [`sqlparser::dialect_from_str`]

/// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
/// ignore the length. If false, error if a `VARCHAR` with a length is
@@ -257,7 +261,7 @@ config_namespace! {
///
/// Some functions, e.g. `EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime
/// according to this time zone, and then extract the hour
pub time_zone: Option<String>, default = Some("+00:00".into())
pub time_zone: Option<String>, default = Some("+00:00".into()) // todo: to_upper?

/// Parquet options
pub parquet: ParquetOptions, default = Default::default()
@@ -431,7 +435,7 @@ config_namespace! {
///
/// Note that this default setting is not the same as
/// the default parquet writer setting.
pub compression: Option<String>, default = Some("zstd(3)".into())
pub compression: Option<String>, transform = str::to_lowercase, default = Some("zstd(3)".into())

/// (writing) Sets if dictionary encoding is enabled. If NULL, uses
/// default parquet writer setting
@@ -444,7 +448,7 @@
/// Valid values are: "none", "chunk", and "page"
/// These values are not case sensitive. If NULL, uses
/// default parquet writer setting
pub statistics_enabled: Option<String>, default = Some("page".into())
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
@@ -470,7 +474,7 @@
/// delta_byte_array, rle_dictionary, and byte_stream_split.
/// These values are not case sensitive. If NULL, uses
/// default parquet writer setting
pub encoding: Option<String>, default = None
pub encoding: Option<String>, transform = str::to_lowercase, default = None

/// (writing) Use any available bloom filters when reading parquet files
pub bloom_filter_on_read: bool, default = true
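With `transform = str::to_lowercase` on these writer options, mixed-case values should be accepted and stored lowercased. A hedged sketch of the expected round-trip via `ConfigOptions` (behavior inferred from this diff, not verified against a build):

```rust
use datafusion_common::config::ConfigOptions;

fn main() -> datafusion_common::Result<()> {
    let mut config = ConfigOptions::new();
    // "ZSTD(3)" should be lowercased by the new transform before parsing.
    config.set("datafusion.execution.parquet.compression", "ZSTD(3)")?;
    assert_eq!(
        config.execution.parquet.compression.as_deref(),
        Some("zstd(3)")
    );
    Ok(())
}
```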
@@ -973,27 +977,46 @@ impl<F: ConfigField + Default> ConfigField for Option<F> {

#[macro_export]
macro_rules! config_field {
($t:ty) => {
($t:ty, $($transform:expr)? $(,)?) => {
impl ConfigField for $t {
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
v.some(key, self, description)
v.some(key, self, description);
}

fn set(&mut self, _: &str, value: &str) -> Result<()> {
// Apply the transform, if one was provided
let value = {
let current_value = value.to_string();
$(
let current_value = $transform(&current_value);
)?
current_value
};

// Parse the final value into the target type
*self = value.parse().map_err(|e| {
DataFusionError::Context(
format!(concat!("Error parsing {} as ", stringify!($t),), value),
format!(
"Error parsing '{}' as {}",
value,
stringify!($t),
),
Box::new(DataFusionError::External(Box::new(e))),
)
})?;

Ok(())
}
}
};

($t:ty) => {
config_field!($t,);
};
}

config_field!(String);
config_field!(bool);
config_field!(bool, str::to_lowercase);
config_field!(usize);
config_field!(f64);
config_field!(u64);
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/stream.rs
@@ -62,7 +62,7 @@ impl TableProviderFactory for StreamTableFactory {
let header = if let Ok(opt) = cmd
.options
.get("format.has_header")
.map(|has_header| bool::from_str(has_header))
.map(|has_header| bool::from_str(has_header.to_lowercase().as_str()))
.transpose()
{
opt.unwrap_or(false)
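The extra lowercasing is needed because `bool::from_str` in the standard library is strict about case:

```rust
use std::str::FromStr;

fn main() {
    // bool's FromStr impl accepts only the exact strings "true" and "false",
    // which is why values like "TRUE" must be lowercased before parsing.
    assert!(bool::from_str("TRUE").is_err());
    assert_eq!(bool::from_str("TRUE".to_lowercase().as_str()), Ok(true));
}
```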
18 changes: 14 additions & 4 deletions datafusion/core/tests/config_from_env.rs
@@ -21,11 +21,21 @@ use std::env;
#[test]
fn from_env() {
// Note: these must be a single test to avoid interference from concurrent execution

let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS";
env::set_var(env_key, "true");
let config = ConfigOptions::from_env().unwrap();
// valid testing
for bool_option in ["true", "TRUE", "True"] {
env::set_var(env_key, bool_option);
let config = ConfigOptions::from_env().unwrap();
env::remove_var(env_key);
assert!(config.optimizer.filter_null_join_keys);
}

// invalid testing
env::set_var(env_key, "ttruee");
let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
assert_eq!(err, "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`");
env::remove_var(env_key);
assert!(config.optimizer.filter_null_join_keys);

let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE";

@@ -37,7 +47,7 @@ fn from_env() {
// for invalid testing
env::set_var(env_key, "abc");
let err = ConfigOptions::from_env().unwrap_err().strip_backtrace();
assert_eq!(err, "Error parsing abc as usize\ncaused by\nExternal error: invalid digit found in string");
assert_eq!(err, "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string");

env::remove_var(env_key);
let config = ConfigOptions::from_env().unwrap();
4 changes: 4 additions & 0 deletions datafusion/core/tests/data/dates.csv
@@ -0,0 +1,4 @@
date
03/12/2024
31/12/2024

2 changes: 1 addition & 1 deletion datafusion/sql/src/planner.rs
@@ -56,7 +56,7 @@ impl Default for ParserOptions {
parse_float_as_decimal: false,
enable_ident_normalization: true,
support_varchar_with_length: true,
enable_options_value_normalization: true,
enable_options_value_normalization: false,
}
}
}
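Callers who still want the old lowercasing behavior can presumably opt back in per parser instance; a hedged sketch, assuming `ParserOptions` keeps public fields and the `Default` impl shown above:

```rust
use datafusion_sql::planner::ParserOptions;

fn main() {
    // Opt back into lowercasing of option values (the pre-PR behavior).
    let opts = ParserOptions {
        enable_options_value_normalization: true,
        ..Default::default()
    };
    assert!(opts.enable_options_value_normalization);
}
```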
14 changes: 14 additions & 0 deletions datafusion/sqllogictest/test_files/create_external_table.slt
@@ -226,6 +226,20 @@ OPTIONS (
has_header false,
compression gzip);

# Verify that some options are case insensitive
statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS region (
r_regionkey BIGINT,
r_name VARCHAR,
r_comment VARCHAR,
r_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl'
OPTIONS (
format.delimiter '|',
has_header FALSE,
compression GZIP);


# Create an external parquet table and infer schema to order by

# query should succeed
12 changes: 12 additions & 0 deletions datafusion/sqllogictest/test_files/csv_files.slt
@@ -362,3 +362,15 @@ OPTIONS ('format.terminator' E'\r', 'format.has_header' 'true');

statement ok
drop table stored_table_with_cr_terminator;


# check that datetime_format is case sensitive
(Inline review comment from the contributor author: that doesn't work, reported a separate issue #13633)

statement ok
CREATE EXTERNAL TABLE dates (
date date
) STORED AS CSV
LOCATION '../core/tests/data/dates.csv'
OPTIONS ('format.date_format' 'dd/MM/yyyy', 'format.has_header' 'TRUE');

statement ok
drop table dates;
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/information_schema.slt
@@ -258,7 +258,7 @@ datafusion.optimizer.skip_failed_rules false
datafusion.optimizer.top_down_join_key_reordering true
datafusion.sql_parser.dialect generic
datafusion.sql_parser.enable_ident_normalization true
datafusion.sql_parser.enable_options_value_normalization true
datafusion.sql_parser.enable_options_value_normalization false
datafusion.sql_parser.parse_float_as_decimal false
datafusion.sql_parser.support_varchar_with_length true

@@ -351,7 +351,7 @@ datafusion.optimizer.skip_failed_rules false When set to true, the logical plan
datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys
datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
datafusion.sql_parser.enable_options_value_normalization true When set to true, SQL parser will normalize options value (convert value to lowercase)
datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase)
datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type
datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits.

8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/set_variable.slt
@@ -93,10 +93,10 @@ datafusion.execution.coalesce_batches false
statement ok
set datafusion.catalog.information_schema = true

statement error DataFusion error: Error parsing 1 as bool
statement error DataFusion error: Error parsing '1' as bool
SET datafusion.execution.coalesce_batches to 1

statement error DataFusion error: Error parsing abc as bool
statement error DataFusion error: Error parsing 'abc' as bool
SET datafusion.execution.coalesce_batches to abc

# set u64 variable
@@ -132,10 +132,10 @@ datafusion.execution.batch_size 2
statement ok
set datafusion.catalog.information_schema = true

statement error DataFusion error: Error parsing -1 as usize
statement error DataFusion error: Error parsing '-1' as usize
SET datafusion.execution.batch_size to -1

statement error DataFusion error: Error parsing abc as usize
statement error DataFusion error: Error parsing 'abc' as usize
SET datafusion.execution.batch_size to abc

statement error External error: invalid digit found in string
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
@@ -122,6 +122,6 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information |
| datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type |
| datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) |
| datafusion.sql_parser.enable_options_value_normalization | true | When set to true, SQL parser will normalize options value (convert value to lowercase) |
| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase) |
| datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. |
| datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. |