From 7422f2f7061d9cf875e99dc61a7567332b99a148 Mon Sep 17 00:00:00 2001 From: Colin Ho Date: Wed, 26 Jun 2024 17:02:08 -0700 Subject: [PATCH] [BUG] Fix time type inference (#2441) Fixes JSON time type inference to use Time64 instead of Time32. --------- Co-authored-by: Jay Chia <17691182+jaychia@users.noreply.github.com> --- src/daft-decoding/src/inference.rs | 8 +++++++- src/daft-json/src/inference.rs | 4 ++-- src/daft-json/src/read.rs | 5 ++--- src/daft-json/src/schema.rs | 5 ++--- src/daft-json/test/dtypes.jsonl | 8 ++++---- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/daft-decoding/src/inference.rs b/src/daft-decoding/src/inference.rs index 0f1b7df89e..1d8be9c32a 100644 --- a/src/daft-decoding/src/inference.rs +++ b/src/daft-decoding/src/inference.rs @@ -36,7 +36,7 @@ pub fn infer_string(string: &str) -> DataType { if is_date(string) { DataType::Date32 } else if let Some(time_unit) = is_time(string) { - DataType::Time32(time_unit) + DataType::Time64(time_unit) } else if let Some((time_unit, offset)) = is_datetime(string) { // NOTE: We try to parse as a non-naive datatime (with timezone information) first, // since is_datetime() will return false if timezone information is not present in the string, @@ -77,6 +77,12 @@ fn is_date(string: &str) -> bool { fn is_time(string: &str) -> Option { if let Ok(t) = string.parse::() { let time_unit = nanoseconds_to_time_unit(t.nanosecond()); + // NOTE: We only support Time64 with nanosecond or microsecond granularity, + // so if the parsed timeunit is millisecond or second, + // map it to Time64 with microsecond granularity. + if time_unit != TimeUnit::Nanosecond { + return Some(TimeUnit::Microsecond); + } return Some(time_unit); } None diff --git a/src/daft-json/src/inference.rs b/src/daft-json/src/inference.rs index 5f87a5e56b..79caea435e 100644 --- a/src/daft-json/src/inference.rs +++ b/src/daft-json/src/inference.rs @@ -185,7 +185,7 @@ pub(crate) fn coerce_data_type(mut datatypes: HashSet) -> DataType { (DataType::Int64, DataType::Boolean) | (DataType::Boolean, DataType::Int64) => { DataType::Int64 } - (DataType::Time32(left_tu), DataType::Time32(right_tu)) => { + (DataType::Time64(left_tu), DataType::Time64(right_tu)) => { // Set unified time unit to the lowest granularity time unit. let unified_tu = if left_tu == right_tu || time_unit_to_ordinal(&left_tu) < time_unit_to_ordinal(&right_tu) @@ -194,7 +194,7 @@ pub(crate) fn coerce_data_type(mut datatypes: HashSet) -> DataType { } else { right_tu }; - DataType::Time32(unified_tu) + DataType::Time64(unified_tu) } ( DataType::Timestamp(left_tu, left_tz), diff --git a/src/daft-json/src/read.rs b/src/daft-json/src/read.rs index 798baf8dc1..fb504d89de 100644 --- a/src/daft-json/src/read.rs +++ b/src/daft-json/src/read.rs @@ -627,9 +627,8 @@ mod tests { Field::new("str", DataType::Utf8), Field::new("null", DataType::Null), Field::new("date", DataType::Date), - // TODO(Clark): Add coverage for time parsing once we add support for representing time series in Daft. - // // Time unit should be coarest granularity found in file, i.e. seconds. - // Field::new("time", DataType::Time(TimeUnit::Nanoseconds)), + // Time unit should be coarest granularity found in file, i.e. microseconds. + Field::new("time", DataType::Time(TimeUnit::Microseconds)), // Time unit should be coarsest granularity found in file, i.e. seconds due to naive date inclusion. Field::new( "naive_timestamp", diff --git a/src/daft-json/src/schema.rs b/src/daft-json/src/schema.rs index cb10875ebe..a88b233f50 100644 --- a/src/daft-json/src/schema.rs +++ b/src/daft-json/src/schema.rs @@ -278,9 +278,8 @@ mod tests { Field::new("str", DataType::Utf8), Field::new("null", DataType::Null), Field::new("date", DataType::Date), - // TODO(Clark): Add coverage for time parsing once we add support for representing time series in Daft. - // // Time unit should be coarest granularity found in file, i.e. seconds. - // Field::new("time", DataType::Time(TimeUnit::Nanoseconds)), + // // Time unit should be coarsest granularity found in file, i.e. microseconds. + Field::new("time", DataType::Time(TimeUnit::Microseconds)), // Time unit should be coarsest granularity found in file, i.e. seconds due to naive date inclusion. Field::new( "naive_timestamp", diff --git a/src/daft-json/test/dtypes.jsonl b/src/daft-json/test/dtypes.jsonl index 3986d97356..799d72d2eb 100644 --- a/src/daft-json/test/dtypes.jsonl +++ b/src/daft-json/test/dtypes.jsonl @@ -1,4 +1,4 @@ -{"int": 1, "float": 2.3, "bool": false, "str": "foo", "null": null, "date": "2023-11-29", "naive_timestamp": "2023-11-29T06:31:52.342", "timestamp": "2023-11-29T06:31:52.342567Z", "list": [1, 2, 3], "obj": {"a": 1, "b": false}, "nested_list": [[{"a": "foo"}]], "nested_obj": {"obj": {"a": 4}, "list": [1, 2]}} -{"int": 2, "float": 3.3, "bool": true, "str": "bar", "null": null, "date": "2023/11/28", "naive_timestamp": "2023-11-29", "timestamp": "2023-11-29T06:31:52.342+07:00", "list": [4, 5], "obj": {"a": 2, "b": false}, "nested_list": [[{"a": "bar"}]], "nested_obj": {"obj": {"a": 6}, "list": [3, 4]}} -{"int": null, "float": null, "bool": null, "str": null, "null": null, "date": null, "naive_timestamp": null, "timestamp": null, "list": null, "obj": null, "nested_list": null, "nested_obj": null} -{"int": 3, "float": 4.3, "bool": false, "str": "baz", "null": null, "date": "2023-11-27", "naive_timestamp": "2023-11-29 06:31:52.342567", "timestamp": "2023-11-29 06:31:52.342-07:00", "list": [6, 7, null, 9], "obj": {"a": null, "b": false}, "nested_list": [[{"a": null}]], "nested_obj": {"obj": {"a": null}, "list": [5, null]}} +{"int": 1, "float": 2.3, "bool": false, "str": "foo", "null": null, "date": "2023-11-29", "time": "12:01:23.342", "naive_timestamp": "2023-11-29T06:31:52.342", "timestamp": "2023-11-29T06:31:52.342567Z", "list": [1, 2, 3], "obj": {"a": 1, "b": false}, "nested_list": [[{"a": "foo"}]], "nested_obj": {"obj": {"a": 4}, "list": [1, 2]}} +{"int": 2, "float": 3.3, "bool": true, "str": "bar", "null": null, "date": "2023/11/28", "time": "12:01:23", "naive_timestamp": "2023-11-29", "timestamp": "2023-11-29T06:31:52.342+07:00", "list": [4, 5], "obj": {"a": 2, "b": false}, "nested_list": [[{"a": "bar"}]], "nested_obj": {"obj": {"a": 6}, "list": [3, 4]}} +{"int": null, "float": null, "bool": null, "str": null, "null": null, "date": null, "time": null, "naive_timestamp": null, "timestamp": null, "list": null, "obj": null, "nested_list": null, "nested_obj": null} +{"int": 3, "float": 4.3, "bool": false, "str": "baz", "null": null, "date": "2023-11-27", "time": "12:01:23.342129312", "naive_timestamp": "2023-11-29 06:31:52.342567", "timestamp": "2023-11-29 06:31:52.342-07:00", "list": [6, 7, null, 9], "obj": {"a": null, "b": false}, "nested_list": [[{"a": null}]], "nested_obj": {"obj": {"a": null}, "list": [5, null]}}