diff --git a/daft/datatype.py b/daft/datatype.py index c71717eb90..06b5443a89 100644 --- a/daft/datatype.py +++ b/daft/datatype.py @@ -195,7 +195,7 @@ def date(cls) -> DataType: @classmethod def time(cls, timeunit: TimeUnit | str) -> DataType: - """Time DataType.""" + """Time DataType. Supported timeunits are "us", "ns".""" if isinstance(timeunit, str): timeunit = TimeUnit.from_str(timeunit) return cls._from_pydatatype(PyDataType.time(timeunit._timeunit)) diff --git a/src/daft-core/src/array/ops/cast.rs b/src/daft-core/src/array/ops/cast.rs index 9f86bec7d2..417cf77874 100644 --- a/src/daft-core/src/array/ops/cast.rs +++ b/src/daft-core/src/array/ops/cast.rs @@ -363,6 +363,7 @@ impl TimestampArray { match dtype { DataType::Timestamp(..) => arrow_logical_cast(self, dtype), DataType::Date => Ok(self.date()?.into_series()), + DataType::Time(tu) => Ok(self.time(tu)?.into_series()), DataType::Utf8 => { let DataType::Timestamp(unit, timezone) = self.data_type() else { panic!("Wrong dtype for TimestampArray: {}", self.data_type()) diff --git a/src/daft-core/src/array/ops/date.rs b/src/daft-core/src/array/ops/date.rs index af1e896049..eefb053393 100644 --- a/src/daft-core/src/array/ops/date.rs +++ b/src/daft-core/src/array/ops/date.rs @@ -1,7 +1,7 @@ use crate::{ datatypes::{ - logical::{DateArray, TimestampArray}, - Field, Int32Array, UInt32Array, + logical::{DateArray, TimeArray, TimestampArray}, + Field, Int32Array, Int64Array, TimeUnit, UInt32Array, }, DataType, }; @@ -108,6 +108,78 @@ impl TimestampArray { )) } + pub fn time(&self, timeunit_for_cast: &TimeUnit) -> DaftResult { + let physical = self.physical.as_arrow(); + let DataType::Timestamp(timeunit, tz) = self.data_type() else { + unreachable!("Timestamp array must have Timestamp datatype") + }; + let tu = timeunit.to_arrow(); + let timeunit_for_cast = if timeunit_for_cast == &TimeUnit::Nanoseconds { + TimeUnit::Nanoseconds + } else { + TimeUnit::Microseconds // default to microseconds + }; + let time_arrow = match tz { + Some(tz) => match arrow2::temporal_conversions::parse_offset(tz) { + Ok(tz) => Ok(arrow2::array::PrimitiveArray::::from_iter( + physical.iter().map(|ts| { + ts.map(|ts| { + let dt = + arrow2::temporal_conversions::timestamp_to_datetime(*ts, tu, &tz); + match timeunit_for_cast { + TimeUnit::Nanoseconds => { + let hour = dt.hour() as i64 * 3_600_000_000_000; + let minute = dt.minute() as i64 * 60_000_000_000; + let second = dt.second() as i64 * 1_000_000_000; + let nanosecond = dt.nanosecond() as i64; + hour + minute + second + nanosecond + } + _ => { + let hour = dt.hour() as i64 * 3_600_000_000; + let minute = dt.minute() as i64 * 60_000_000; + let second = dt.second() as i64 * 1_000_000; + let microsecond = dt.nanosecond() as i64 / 1_000; + hour + minute + second + microsecond + } + } + }) + }), + )), + Err(e) => Err(DaftError::TypeError(format!( + "Cannot parse timezone in Timestamp datatype: {}, error: {}", + tz, e + ))), + }, + None => Ok(arrow2::array::PrimitiveArray::::from_iter( + physical.iter().map(|ts| { + ts.map(|ts| { + let dt = arrow2::temporal_conversions::timestamp_to_naive_datetime(*ts, tu); + match timeunit_for_cast { + TimeUnit::Nanoseconds => { + let hour = dt.hour() as i64 * 3_600_000_000_000; + let minute = dt.minute() as i64 * 60_000_000_000; + let second = dt.second() as i64 * 1_000_000_000; + let nanosecond = dt.nanosecond() as i64; + hour + minute + second + nanosecond + } + _ => { + let hour = dt.hour() as i64 * 3_600_000_000; + let minute = dt.minute() as i64 * 60_000_000; + let second = dt.second() as i64 * 1_000_000; + let microsecond = dt.nanosecond() as i64 / 1_000; + hour + minute + second + microsecond + } + } + }) + }), + )), + }?; + Ok(TimeArray::new( + Field::new(self.name(), DataType::Time(timeunit_for_cast)), + Int64Array::from((self.name(), Box::new(time_arrow))), + )) + } + pub fn hour(&self) -> DaftResult { let physical = self.physical.as_arrow(); let DataType::Timestamp(timeunit, tz) = self.data_type() else { diff --git a/src/daft-core/src/array/ops/hash.rs b/src/daft-core/src/array/ops/hash.rs index c2c7ab0729..f1a84529f3 100644 --- a/src/daft-core/src/array/ops/hash.rs +++ b/src/daft-core/src/array/ops/hash.rs @@ -156,7 +156,10 @@ impl DateArray { impl TimeArray { pub fn murmur3_32(&self) -> DaftResult { - self.physical.murmur3_32() + let us = self.cast(&crate::DataType::Time( + crate::datatypes::TimeUnit::Microseconds, + ))?; + us.time()?.physical.murmur3_32() } } diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py index 1fb36cafc0..92c80df665 100644 --- a/tests/expressions/test_expressions.py +++ b/tests/expressions/test_expressions.py @@ -26,7 +26,7 @@ (None, DataType.null()), (Series.from_pylist([1, 2, 3]), DataType.int64()), (date(2023, 1, 1), DataType.date()), - (time(1, 2, 3), DataType.time(timeunit=TimeUnit.from_str("us"))), + (time(1, 2, 3, 4), DataType.time(timeunit=TimeUnit.from_str("us"))), (datetime(2023, 1, 1), DataType.timestamp(timeunit=TimeUnit.from_str("us"))), (datetime(2022, 1, 1, tzinfo=pytz.utc), DataType.timestamp(timeunit=TimeUnit.from_str("us"), timezone="UTC")), ], diff --git a/tests/io/test_csv_roundtrip.py b/tests/io/test_csv_roundtrip.py index 7cbea8e22c..2043e30a1b 100644 --- a/tests/io/test_csv_roundtrip.py +++ b/tests/io/test_csv_roundtrip.py @@ -37,6 +37,12 @@ DataType.time(TimeUnit.us()), DataType.time(TimeUnit.us()), ), + ( + [datetime.time(1, 2, 3, 4), datetime.time(5, 6, 7, 8), None], + pa.time64("ns"), + DataType.time(TimeUnit.ns()), + DataType.time(TimeUnit.us()), + ), ( [datetime.datetime(1994, 1, 1), datetime.datetime(1995, 1, 1), None], pa.timestamp("ms"), diff --git a/tests/io/test_parquet_roundtrip.py b/tests/io/test_parquet_roundtrip.py index 79f72f643f..79050f3fcc 100644 --- a/tests/io/test_parquet_roundtrip.py +++ b/tests/io/test_parquet_roundtrip.py @@ -32,6 +32,11 @@ pa.time64("us"), DataType.time(TimeUnit.us()), ), + ( + [datetime.time(12, 1, 22, 4), datetime.time(13, 8, 45, 34), None], + pa.time64("ns"), + DataType.time(TimeUnit.ns()), + ), ( [datetime.datetime(1994, 1, 1), datetime.datetime(1995, 1, 1), None], pa.timestamp("ms"), diff --git a/tests/series/test_cast.py b/tests/series/test_cast.py index 9367ba131c..996c53317c 100644 --- a/tests/series/test_cast.py +++ b/tests/series/test_cast.py @@ -773,3 +773,22 @@ def test_cast_date_to_timestamp(): back = casted.dt.date() assert (input == back).to_pylist() == [True] + + +def test_cast_timestamp_to_time(): + from datetime import datetime, time + + """Microseconds""" + input = Series.from_pylist([datetime(2022, 1, 6, 12, 34, 56, 78)]) + casted = input.cast(DataType.time("us")) + assert casted.to_pylist() == [time(12, 34, 56, 78)] + + """Nanoseconds""" + input = Series.from_pylist([datetime(2022, 1, 6, 12, 34, 56, 78)]) + casted = input.cast(DataType.time("ns")) + assert casted.to_pylist() == [time(12, 34, 56, 78)] + + """Seconds""" + input = Series.from_pylist([datetime(2022, 1, 6, 12, 34, 56, 78)]) + casted = input.cast(DataType.time("s")) + assert casted.to_pylist() == [time(12, 34, 56, 78)] diff --git a/tests/series/test_hash.py b/tests/series/test_hash.py index 51d373f6b4..444c69fb22 100644 --- a/tests/series/test_hash.py +++ b/tests/series/test_hash.py @@ -177,12 +177,20 @@ def test_murmur3_32_hash_date(): def test_murmur3_32_hash_time(): - arr = Series.from_pylist([time(22, 31, 8), None]) + arr = Series.from_pylist([time(22, 31, 8, 0), None]) assert arr.datatype() == DataType.time("us") hashes = arr.murmur3_32() assert hashes.to_pylist() == [-662762989, None] +def test_murmur3_32_hash_time_nanoseconds(): + arr = Series.from_pylist([time(22, 31, 8, 0), None]) + arr = arr.cast(DataType.time("ns")) + assert arr.datatype() == DataType.time("ns") + hashes = arr.murmur3_32() + assert hashes.to_pylist() == [-662762989, None] + + def test_murmur3_32_hash_timestamp(): arr = Series.from_pylist([datetime(2017, 11, 16, 22, 31, 8), None]) hashes = arr.murmur3_32() diff --git a/tests/series/test_size_bytes.py b/tests/series/test_size_bytes.py index 92e56678bd..3fe42b8662 100644 --- a/tests/series/test_size_bytes.py +++ b/tests/series/test_size_bytes.py @@ -90,19 +90,20 @@ def test_series_date_size_bytes(size, with_nulls) -> None: @pytest.mark.parametrize("size", [0, 1, 2, 8, 9, 16]) @pytest.mark.parametrize("with_nulls", [True, False]) -def test_series_time_size_bytes(size, with_nulls) -> None: +@pytest.mark.parametrize("precision", ["us", "ns"]) +def test_series_time_size_bytes(size, with_nulls, precision) -> None: from datetime import time - pydata = [time(i, 0, 0) for i in range(size)] + pydata = [time(i, i, i, i) for i in range(size)] if with_nulls and size > 0: - data = pa.array(pydata[:-1] + [None], pa.time64("us")) + data = pa.array(pydata[:-1] + [None], pa.time64(precision)) else: - data = pa.array(pydata, pa.time64("us")) + data = pa.array(pydata, pa.time64(precision)) s = Series.from_arrow(data) - assert s.datatype() == DataType.time("us") + assert s.datatype() == DataType.time(precision) assert s.size_bytes() == get_total_buffer_size(data) diff --git a/tests/series/test_sort.py b/tests/series/test_sort.py index e2f2de037c..790aa4a1d1 100644 --- a/tests/series/test_sort.py +++ b/tests/series/test_sort.py @@ -112,7 +112,8 @@ def date_maker(d): assert taken.to_pylist() == sorted_order[::-1] -def test_series_time_sorting() -> None: +@pytest.mark.parametrize("timeunit", ["us", "ns"]) +def test_series_time_sorting(timeunit) -> None: from datetime import time def time_maker(h, m, s, us): @@ -125,7 +126,7 @@ def time_maker(h, m, s, us): sorted_order = list( map(time_maker, [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 2, 4, 5, None, None]) ) - s = s.cast(DataType.time("us")) + s = s.cast(DataType.time(timeunit)) s_sorted = s.sort() assert len(s_sorted) == len(s) assert s_sorted.datatype() == s.datatype() diff --git a/tests/series/test_take.py b/tests/series/test_take.py index 27f92f5a54..ea68ff515e 100644 --- a/tests/series/test_take.py +++ b/tests/series/test_take.py @@ -45,7 +45,8 @@ def date_maker(d): assert taken.to_pylist() == days[::-1] -def test_series_time_take() -> None: +@pytest.mark.parametrize("time_unit", ["us", "ns"]) +def test_series_time_take(time_unit) -> None: from datetime import time def time_maker(h, m, s, us): @@ -55,8 +56,9 @@ def time_maker(h, m, s, us): times = list(map(time_maker, [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [5, 4, 1, None, 2, None])) s = Series.from_pylist(times) + s = s.cast(DataType.time(time_unit)) taken = s.take(Series.from_pylist([5, 4, 3, 2, 1, 0])) - assert taken.datatype() == DataType.time("us") + assert taken.datatype() == DataType.time(time_unit) assert taken.to_pylist() == times[::-1] diff --git a/tests/table/test_from_py.py b/tests/table/test_from_py.py index 26d7fba527..ec89cd87be 100644 --- a/tests/table/test_from_py.py +++ b/tests/table/test_from_py.py @@ -24,7 +24,7 @@ "str": ["foo", "bar"], "binary": [b"foo", b"bar"], "date": [datetime.date.today(), datetime.date.today()], - "time": [datetime.time(1, 2, 3), datetime.time(4, 5, 6)], + "time": [datetime.time(1, 2, 3, 4), datetime.time(5, 6, 7, 8)], "list": [[1, 2], [3]], "struct": [{"a": 1, "b": 2.0}, {"b": 3.0}], "empty_struct": [{}, {}], @@ -94,7 +94,8 @@ "binary": pa.array(PYTHON_TYPE_ARRAYS["binary"], pa.binary()), "boolean": pa.array(PYTHON_TYPE_ARRAYS["bool"], pa.bool_()), "date32": pa.array(PYTHON_TYPE_ARRAYS["date"], pa.date32()), - "time64": pa.array(PYTHON_TYPE_ARRAYS["time"], pa.time64("us")), + "time64_microseconds": pa.array(PYTHON_TYPE_ARRAYS["time"], pa.time64("us")), + "time64_nanoseconds": pa.array(PYTHON_TYPE_ARRAYS["time"], pa.time64("ns")), "list": pa.array(PYTHON_TYPE_ARRAYS["list"], pa.list_(pa.int64())), "fixed_size_list": pa.array([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)), "struct": pa.array(PYTHON_TYPE_ARRAYS["struct"], pa.struct([("a", pa.int64()), ("b", pa.float64())])), @@ -144,7 +145,8 @@ "binary": pa.large_binary(), "boolean": pa.bool_(), "date32": pa.date32(), - "time64": pa.time64("us"), + "time64_microseconds": pa.time64("us"), + "time64_nanoseconds": pa.time64("ns"), "list": pa.large_list(pa.int64()), "fixed_size_list": pa.list_(pa.int64(), 2), "struct": pa.struct([("a", pa.int64()), ("b", pa.float64())]),