diff --git a/Cargo.lock b/Cargo.lock
index 69175ebc7f..36a4c84bb3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1751,7 +1751,6 @@ dependencies = [
  "regex",
  "serde",
  "sketches-ddsketch",
- "tracing",
  "unicode-normalization",
  "xxhash-rust",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 9c52e1584b..1d1065f026 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -192,7 +192,7 @@ tokio = {version = "1.37.0", features = [
 ]}
 tokio-stream = {version = "0.1.14", features = ["fs", "io-util", "time"]}
 tokio-util = "0.7.11"
-tracing = "0.1.40"
+tracing = "0.1"
 url = "2.4.0"
 
 [workspace.dependencies.arrow2]
diff --git a/py-polars/debug/launch.py b/py-polars/debug/launch.py
deleted file mode 100644
index 91a6ba15e6..0000000000
--- a/py-polars/debug/launch.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import os
-import re
-import sys
-import time
-from pathlib import Path
-
-"""
-The following parameter determines the sleep time of the Python process after a signal
-is sent that attaches the Rust LLDB debugger. If the Rust LLDB debugger attaches to the
-current session too late, it might miss any set breakpoints. If this happens
-consistently, it is recommended to increase this value.
-"""
-LLDB_DEBUG_WAIT_TIME_SECONDS = 1
-
-
-def launch_debugging() -> None:
-    """
-    Debug Rust files via Python.
-
-    Determine the pID for the current debugging session, attach the Rust LLDB launcher,
-    and execute the originally-requested script.
-    """
-    if len(sys.argv) == 1:
-        msg = (
-            "launch.py is not meant to be executed directly; please use the `Python: "
-            "Debug Rust` debugging configuration to run a python script that uses the "
-            "polars library."
-        )
-        raise RuntimeError(msg)
-
-    # Get the current process ID.
-    pID = os.getpid()
-
-    # Print to the debug console to allow VSCode to pick up on the signal and start the
-    # Rust LLDB configuration automatically.
-    launch_file = Path(__file__).parents[2] / ".vscode/launch.json"
-    if not launch_file.exists():
-        msg = f"Cannot locate {launch_file}"
-        raise RuntimeError(msg)
-    with launch_file.open("r") as f:
-        launch_info = f.read()
-
-    # Overwrite the pid found in launch.json with the pid for the current process.
-    # Match the initial "Rust LLDB" definition with the pid defined immediately after.
-    pattern = re.compile('("Rust LLDB",\\s*"pid":\\s*")\\d+(")')
-    found = pattern.search(launch_info)
-    if not found:
-        msg = (
-            "Cannot locate pid definition in launch.json for Rust LLDB configuration. "
-            "Please follow the instructions in the debugging section of the "
-            "contributing guide (https://docs.pola.rs/development/contributing/ide/#debugging) "
-            "for creating the launch configuration."
-        )
-        raise RuntimeError(msg)
-
-    launch_info_with_new_pid = pattern.sub(rf"\g<1>{pID}\g<2>", launch_info)
-    with launch_file.open("w") as f:
-        f.write(launch_info_with_new_pid)
-
-    # Print pID to the debug console. This auto-triggers the Rust LLDB configurations.
-    print(f"pID = {pID}")
-
-    # Give the LLDB time to connect. Depending on how long it takes for your LLDB
-    # debugging session to initialize, you may have to adjust this setting.
-    time.sleep(LLDB_DEBUG_WAIT_TIME_SECONDS)
-
-    # Update sys.argv so that when exec() is called, the first argument is the script
-    # name itself, and the remaining are the input arguments.
-    sys.argv.pop(0)
-    with Path(sys.argv[0]).open() as fh:
-        script_contents = fh.read()
-
-    # Run the originally requested file by reading in the script, compiling, and
-    # executing the code.
-    file_to_execute = Path(sys.argv[0])
-    exec(
-        compile(script_contents, file_to_execute, mode="exec"), {"__name__": "__main__"}
-    )
-
-
-if __name__ == "__main__":
-    launch_debugging()
\ No newline at end of file
diff --git a/something.py b/something.py
deleted file mode 100644
index 8660f9a13a..0000000000
--- a/something.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# # from __future__ import annotations
-
-import datetime
-
-import time
-
-# # Sleep for 5 seconds
-import pyarrow as pa
-
-# # import pytest
-from daft.expressions import col
-from daft.table import MicroPartition
-import daft
-
-# # def test_map_get():
-# #     data = pa.array([[(1, 2)], [], [(2, 1)]], type=pa.map_(pa.int64(), pa.int64()))
-# #     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-# #     result = table.eval_expression_list([col("map_col").map.get(1)])
-
-# #     assert result.to_pydict() == {"value": [2, None, None]}
-
-
-# # def test_map_get_broadcasted():
-# #     data = pa.array([[(1, 2)], [], [(2, 1)]], type=pa.map_(pa.int64(), pa.int64()))
-# #     keys = pa.array([1, 3, 2], type=pa.int64())
-# #     table = MicroPartition.from_arrow(pa.table({"map_col": data, "key": keys}))
-
-# #     result = table.eval_expression_list([col("map_col").map.get(col("key"))])
-
-# #     assert result.to_pydict() == {"value": [2, None, 1]}
-
-
-# # def test_map_get_duplicate_keys():
-# #     # Only the first value is returned
-# #     data = pa.array([[(1, 2), (1, 3)]], type=pa.map_(pa.int64(), pa.int64()))
-# #     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-# #     result = table.eval_expression_list([col("map_col").map.get(1)])
-
-# #     assert result.to_pydict() == {"value": [2]}
-
-
-# # def test_list_array():
-# #     print("HIIIIIII")
-# #     data = pa.array(
-# #         [
-# #             [datetime.date(2022, 1, 1)],
-# #             [datetime.date(2022, 1, 2)],
-# #             [],
-# #         ],
-# #         type=pa.list_(pa.date32()),  # logical types
-# #     )
-
-# #     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-# #     print("TABLE", table)
-# #     print("oi")
-
-# #     # result = table.eval_expression_list([col("map_col").map.get("foo")])
-
-# #     # assert result.to_pydict() == {"value": [datetime.date(2022, 1, 1), datetime.date(2022, 1, 2), None]}
-
-
-def test_map_get_logical_type():
-    print("OIIIIIII")
-    data = pa.array(
-        [
-            [("foo", datetime.date(2022, 1, 1))],
-            [("foo", datetime.date(2022, 1, 2))],
-            [],
-        ],
-        type=pa.map_(pa.string(), pa.date32()),  # logical types
-    )
-
-#     assert isinstance(data, pa.Array)
-#     assert data.type == pa.map_(pa.string(), pa.date32())
-#     assert len(data) == 3
-#     assert data[0].as_py() == [("foo", datetime.date(2022, 1, 1))]
-#     assert data[1].as_py() == [("foo", datetime.date(2022, 1, 2))]
-#     assert data[2].as_py() == []
-
-#     # Assert physical types
-#     assert str(data.type) == "map"
-
-#     # Convert types
-
-    table = daft.table.MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-
-# # result = table.eval_expression_list([col("map_col").map.get("foo")])
-
-# # assert result.to_pydict() == {"value": [datetime.date(2022, 1, 1), datetime.date(2022, 1, 2), None]}
-
-
-# # def test_map_get_bad_field():
-# #     data = pa.array([[(1, 2)], [(2, 3)]], type=pa.map_(pa.int64(), pa.int64()))
-# #     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-# #     with pytest.raises(ValueError):
-# #         table.eval_expression_list([col("map_col").map.get("foo")])
-
-
-print("starting")
-test_map_get_logical_type()
-print("done")
diff --git a/src/daft-core/Cargo.toml b/src/daft-core/Cargo.toml
index 0746deec79..ec15924316 100644
--- a/src/daft-core/Cargo.toml
+++ b/src/daft-core/Cargo.toml
@@ -43,7 +43,6 @@ pyo3 = {workspace = true, optional = true}
 regex = {workspace = true}
 serde = {workspace = true}
 sketches-ddsketch = {workspace = true}
-tracing = {workspace = true}
 unicode-normalization = "0.1.23"
 
 [dependencies.numpy]
diff --git a/src/daft-core/src/array/ops/from_arrow.rs b/src/daft-core/src/array/ops/from_arrow.rs
index 99988709fc..41235eda4d 100644
--- a/src/daft-core/src/array/ops/from_arrow.rs
+++ b/src/daft-core/src/array/ops/from_arrow.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 
 use common_error::{DaftError, DaftResult};
-use log::info;
 
 use crate::{
     array::{DataArray, FixedSizeListArray, ListArray, StructArray},
@@ -30,15 +29,12 @@ impl FromArrow for LogicalArray
 where
     ::ArrayType: FromArrow,
 {
-    #[tracing::instrument(level = "trace", name = "LogicalArray::from_arrow", skip_all)]
     fn from_arrow(field: FieldRef, arrow_arr: Box) -> DaftResult {
         let target_convert = field.to_physical();
         let target_convert_arrow = target_convert.dtype.to_arrow()?;
 
         let physical_arrow_array = arrow_arr.convert_logical_type(target_convert_arrow.clone());
 
-        let dbg = format!("Target Convert: {target_convert:#?}\nTarget Convert Arrow: {target_convert_arrow:#?}\nPhysical Arrow: {:#?}", physical_arrow_array.data_type());
-
         let physical = ::ArrayType::from_arrow(
             Arc::new(target_convert),
             physical_arrow_array,
@@ -132,8 +128,6 @@ impl FromArrow for StructArray {
             return Err(DaftError::ValueError(format!("Attempting to create Daft StructArray with {} fields from Arrow array with {} fields: {} vs {:?}", fields.len(), arrow_fields.len(), &field.dtype, arrow_arr.data_type())));
         }
 
-        let debug = format!("{field:#?}");
-
         let arrow_arr = arrow_arr.as_ref().as_any().downcast_ref::().unwrap();
 
         let arrow_child_arrays = arrow_arr.values();
diff --git a/src/daft-core/src/series/from.rs b/src/daft-core/src/series/from.rs
index 2270e4c6f9..92682c658b 100644
--- a/src/daft-core/src/series/from.rs
+++ b/src/daft-core/src/series/from.rs
@@ -50,7 +50,6 @@ impl Series {
 impl TryFrom<(&str, Box)> for Series {
     type Error = DaftError;
 
-    #[instrument(level = "trace", name = "Series::try_from", skip_all)]
     fn try_from((name, array): (&str, Box)) -> DaftResult {
         let source_arrow_type: &ArrowDataType = array.data_type();
         let dtype = DaftDataType::from(source_arrow_type);
diff --git a/src/daft-schema/src/dtype.rs b/src/daft-schema/src/dtype.rs
index 47efd28ed1..c19ef9aa06 100644
--- a/src/daft-schema/src/dtype.rs
+++ b/src/daft-schema/src/dtype.rs
@@ -618,9 +618,7 @@ impl DataType {
 }
 
 impl From<&ArrowType> for DataType {
-    #[tracing::instrument(level = "trace", name = "DataType::from(&ArrowType)", skip(item))]
     fn from(item: &ArrowType) -> Self {
-        tracing::trace!("processing {item:#?}");
         let result = match item {
             ArrowType::Null => Self::Null,
             ArrowType::Boolean => Self::Boolean,
@@ -697,7 +695,6 @@ impl From<&ArrowType> for DataType {
             _ => panic!("DataType :{item:?} is not supported"),
         };
 
-        tracing::info!("Result: {result:?}");
         result
     }
 }
diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py
index ef0d80cbe2..fceb121e3e 100644
--- a/tests/expressions/test_expressions.py
+++ b/tests/expressions/test_expressions.py
@@ -516,16 +516,6 @@ def test_list_value_counts():
         {"list_col": [["a", "b", "a", "c"], ["b", "b", "c"], ["a", "a", "a"], [], ["d", None, "d"]]}
     )
 
-    # mp = MicroPartition.from_pydict({
-    #     "list_col": [
-    #         ["a", "b", "a", "c"],
-    #         ["b", "b", "c"],
-    #         ["a", "a", "a"],
-    #         [],
-    #         ["d", "d"]
-    #     ]
-    # })
-
     # Apply list_value_counts operation
     result = mp.eval_expression_list([col("list_col").list.value_counts().alias("value_counts")])
     value_counts = result.to_pydict()["value_counts"]
@@ -536,9 +526,6 @@ def test_list_value_counts():
     # Check the result
     assert value_counts == expected
 
-    # Test with empty input
-    empty_mp = MicroPartition.from_pydict({"list_col": []})
-
     # Test with empty input (no proper type -> should raise error)
     empty_mp = MicroPartition.from_pydict({"list_col": []})
     with pytest.raises(ValueError):
diff --git a/tests/table/map/test_map_get.py b/tests/table/map/test_map_get.py
index 1dc5e9e0a4..6ab7a31ab8 100644
--- a/tests/table/map/test_map_get.py
+++ b/tests/table/map/test_map_get.py
@@ -1,101 +1,62 @@
-# from __future__ import annotations
+from __future__ import annotations
 
 import datetime
 
 import pyarrow as pa
+import pytest
 
-# import pytest
-# from daft.expressions import col
-# from daft.table import MicroPartition
-import daft
+from daft.expressions import col
+from daft.table import MicroPartition
 
-# def test_map_get():
-#     data = pa.array([[(1, 2)], [], [(2, 1)]], type=pa.map_(pa.int64(), pa.int64()))
-#     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-#     result = table.eval_expression_list([col("map_col").map.get(1)])
+
+def test_map_get():
+    data = pa.array([[(1, 2)], [], [(2, 1)]], type=pa.map_(pa.int64(), pa.int64()))
+    table = MicroPartition.from_arrow(pa.table({"map_col": data}))
 
-#     assert result.to_pydict() == {"value": [2, None, None]}
+    result = table.eval_expression_list([col("map_col").map.get(1)])
+
+    assert result.to_pydict() == {"value": [2, None, None]}
 
 
-# def test_map_get_broadcasted():
-#     data = pa.array([[(1, 2)], [], [(2, 1)]], type=pa.map_(pa.int64(), pa.int64()))
-#     keys = pa.array([1, 3, 2], type=pa.int64())
-#     table = MicroPartition.from_arrow(pa.table({"map_col": data, "key": keys}))
-
-#     result = table.eval_expression_list([col("map_col").map.get(col("key"))])
+def test_map_get_broadcasted():
+    data = pa.array([[(1, 2)], [], [(2, 1)]], type=pa.map_(pa.int64(), pa.int64()))
+    keys = pa.array([1, 3, 2], type=pa.int64())
+    table = MicroPartition.from_arrow(pa.table({"map_col": data, "key": keys}))
 
-#     assert result.to_pydict() == {"value": [2, None, 1]}
+    result = table.eval_expression_list([col("map_col").map.get(col("key"))])
+
+    assert result.to_pydict() == {"value": [2, None, 1]}
 
 
-# def test_map_get_duplicate_keys():
-#     # Only the first value is returned
-#     data = pa.array([[(1, 2), (1, 3)]], type=pa.map_(pa.int64(), pa.int64()))
-#     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-#     result = table.eval_expression_list([col("map_col").map.get(1)])
+def test_map_get_duplicate_keys():
+    # Only the first value is returned
+    data = pa.array([[(1, 2), (1, 3)]], type=pa.map_(pa.int64(), pa.int64()))
+    table = MicroPartition.from_arrow(pa.table({"map_col": data}))
 
-#     assert result.to_pydict() == {"value": [2]}
+    result = table.eval_expression_list([col("map_col").map.get(1)])
 
-
-# def test_list_array():
-#     print("HIIIIIII")
-#     data = pa.array(
-#         [
-#             [datetime.date(2022, 1, 1)],
-#             [datetime.date(2022, 1, 2)],
-#             [],
-#         ],
-#         type=pa.list_(pa.date32()),  # logical types
-#     )
-
-#     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-#     print("TABLE", table)
-#     print("oi")
-
-#     # result = table.eval_expression_list([col("map_col").map.get("foo")])
-
-#     # assert result.to_pydict() == {"value": [datetime.date(2022, 1, 1), datetime.date(2022, 1, 2), None]}
+    assert result.to_pydict() == {"value": [2]}
 
 
 def test_map_get_logical_type():
-    print("OIIIIIII")
     data = pa.array(
         [
             [("foo", datetime.date(2022, 1, 1))],
             [("foo", datetime.date(2022, 1, 2))],
             [],
         ],
-        type=pa.map_(pa.string(), pa.date32()),  # logical types
+        type=pa.map_(pa.string(), pa.date32()),
     )
+    table = MicroPartition.from_arrow(pa.table({"map_col": data}))
 
-    assert isinstance(data, pa.Array)
-    assert data.type == pa.map_(pa.string(), pa.date32())
-    assert len(data) == 3
-    assert data[0].as_py() == [("foo", datetime.date(2022, 1, 1))]
-    assert data[1].as_py() == [("foo", datetime.date(2022, 1, 2))]
-    assert data[2].as_py() == []
-
-    # Assert physical types
-    assert str(data.type) == "map"
-
-    # Convert types
-
-    table = daft.table.MicroPartition.from_arrow(pa.table({"map_col": data}))
-
-
-# result = table.eval_expression_list([col("map_col").map.get("foo")])
-
-# assert result.to_pydict() == {"value": [datetime.date(2022, 1, 1), datetime.date(2022, 1, 2), None]}
-
+    result = table.eval_expression_list([col("map_col").map.get("foo")])
 
-# def test_map_get_bad_field():
-#     data = pa.array([[(1, 2)], [(2, 3)]], type=pa.map_(pa.int64(), pa.int64()))
-#     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
+    assert result.to_pydict() == {"value": [datetime.date(2022, 1, 1), datetime.date(2022, 1, 2), None]}
 
-#     with pytest.raises(ValueError):
-#         table.eval_expression_list([col("map_col").map.get("foo")])
+
+def test_map_get_bad_field():
+    data = pa.array([[(1, 2)], [(2, 3)]], type=pa.map_(pa.int64(), pa.int64()))
+    table = MicroPartition.from_arrow(pa.table({"map_col": data}))
 
-
-test_map_get_logical_type()
+    with pytest.raises(ValueError):
+        table.eval_expression_list([col("map_col").map.get("foo")])
diff --git a/trace-1727315975393512.json b/trace-1727315975393512.json
deleted file mode 100644
index e69de29bb2..0000000000