From eda6ad38dd3d9c7bb7d29e4b0be7ae0b3fcaa4f5 Mon Sep 17 00:00:00 2001 From: dave Date: Mon, 9 Dec 2024 16:26:24 +0100 Subject: [PATCH 01/35] remove standalone dataset from exports --- dlt/__init__.py | 1 - tests/load/test_read_interfaces.py | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dlt/__init__.py b/dlt/__init__.py index e8a1b7bf92..6dcbfba143 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -42,7 +42,6 @@ ) from dlt.pipeline import progress from dlt import destinations -from dlt.destinations.dataset import dataset as _dataset pipeline = _pipeline current = _current diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index 1a9c8a383b..1949f2dfd7 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -23,6 +23,7 @@ from dlt.common.destination.reference import TDestinationReferenceArg from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException from tests.load.utils import drop_pipeline_data +from dlt.destinations.dataset import dataset as _dataset EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] @@ -491,7 +492,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: total_records = _total_records(populated_pipeline) # check dataset factory - dataset = dlt._dataset( + dataset = _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name ) # verfiy that sql client and schema are lazy loaded @@ -504,7 +505,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is loaded by name dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, schema=populated_pipeline.default_schema_name, @@ -515,7 +516,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is not loaded when wrong name given dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, schema="wrong_schema_name", @@ -527,7 +528,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is loaded if no schema name given dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, ), @@ -538,7 +539,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that there is no error when creating dataset without schema table dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name="unknown_dataset", ), @@ -560,7 +561,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, ), From fae7a2b1c0e687ad97e5c997235fb58cae0b155d Mon Sep 17 00:00:00 2001 From: dave Date: Mon, 9 Dec 2024 16:31:56 +0100 Subject: [PATCH 02/35] make pipeline dataset factory public --- dlt/__init__.py | 1 - dlt/pipeline/pipeline.py | 2 +- .../general-usage/dataset-access/dataset.md | 4 +-- .../dataset-access/ibis-backend.md | 2 +- .../test_readable_dbapi_dataset.py | 8 +++--- tests/extract/test_incremental.py | 8 +++--- tests/load/duckdb/test_duckdb_client.py | 4 +-- 
tests/load/filesystem/test_sql_client.py | 6 ++-- tests/load/pipeline/test_bigquery.py | 8 +++--- tests/load/pipeline/test_duckdb.py | 4 +-- tests/load/test_read_interfaces.py | 28 +++++++++---------- tests/pipeline/test_dlt_versions.py | 2 +- tests/pipeline/test_pipeline.py | 8 +++--- tests/pipeline/test_pipeline_extra.py | 4 +-- 14 files changed, 44 insertions(+), 45 deletions(-) diff --git a/dlt/__init__.py b/dlt/__init__.py index 6dcbfba143..328817efd2 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -79,7 +79,6 @@ "TCredentials", "sources", "destinations", - "_dataset", ] # verify that no injection context was created diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 70d160ea67..674dab6c37 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1750,7 +1750,7 @@ def __getstate__(self) -> Any: # pickle only the SupportsPipeline protocol fields return {"pipeline_name": self.pipeline_name} - def _dataset( + def dataset( self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "dbapi" ) -> SupportsReadableDataset: """Access helper to dataset""" diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 68635383c5..07fe37ecb2 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -19,7 +19,7 @@ Here's a full example of how to retrieve data from a pipeline and load it into a # and you have loaded data to a table named 'items' in the destination # Step 1: Get the readable dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() # Step 2: Access a table as a ReadableRelation items_relation = dataset.items # Or dataset["items"] @@ -39,7 +39,7 @@ Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain ```py # Get the readable dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() ``` ### Access tables as `ReadableRelation` diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md index 8f4b0fb6b6..39bc2988c9 100644 --- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md +++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md @@ -28,7 +28,7 @@ pip install ibis-framework[duckdb] ```py # get the dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() dataset_name = pipeline.dataset_name # get the native ibis connection from the dataset diff --git a/tests/destinations/test_readable_dbapi_dataset.py b/tests/destinations/test_readable_dbapi_dataset.py index 4745735371..1c03ffd664 100644 --- a/tests/destinations/test_readable_dbapi_dataset.py +++ b/tests/destinations/test_readable_dbapi_dataset.py @@ -9,7 +9,7 @@ def test_query_builder() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() # default query for a table assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # type: ignore[attr-defined] @@ -55,7 +55,7 @@ def test_query_builder() -> None: def test_copy_and_chaining() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() # create releation and set some stuff on it relation = 
dataset.items @@ -80,7 +80,7 @@ def test_copy_and_chaining() -> None: def test_computed_schema_columns() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() relation = dataset.items # no schema present @@ -107,7 +107,7 @@ def test_computed_schema_columns() -> None: def test_prevent_changing_relation_with_query() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() relation = dataset("SELECT * FROM something") with pytest.raises(ReadableRelationHasQueryException): diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 725872b621..2d5c84c1a9 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -228,7 +228,7 @@ def test_pandas_index_as_dedup_key() -> None: no_index_r = some_data.with_name(new_name="no_index") p.run(no_index_r) p.run(no_index_r) - data_ = p._dataset().no_index.arrow() + data_ = p.dataset().no_index.arrow() assert data_.schema.names == ["created_at", "id"] assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] @@ -240,7 +240,7 @@ def test_pandas_index_as_dedup_key() -> None: unnamed_index_r.incremental.primary_key = "__index_level_0__" p.run(unnamed_index_r) p.run(unnamed_index_r) - data_ = p._dataset().unnamed_index.arrow() + data_ = p.dataset().unnamed_index.arrow() assert data_.schema.names == ["created_at", "id", "index_level_0"] # indexes 2 and 3 are removed from second batch because they were in the previous batch # and the created_at overlapped so they got deduplicated @@ -258,7 +258,7 @@ def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: named_index_r.incremental.primary_key = "order_id" p.run(named_index_r) p.run(named_index_r) - data_ = p._dataset().named_index.arrow() + data_ = p.dataset().named_index.arrow() assert data_.schema.names == ["created_at", "id", "order_id"] assert data_["order_id"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] @@ -268,7 +268,7 @@ def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: ) p.run(named_index_impl_r) p.run(named_index_impl_r) - data_ = p._dataset().named_index_impl.arrow() + data_ = p.dataset().named_index_impl.arrow() assert data_.schema.names == ["created_at", "id"] assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index 49475ce43f..652f75772a 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -282,14 +282,14 @@ def test_drops_pipeline_changes_bound() -> None: p = dlt.pipeline(pipeline_name="quack_pipeline", destination="duckdb") p.run([1, 2, 3], table_name="p_table") p = p.drop() - assert len(p._dataset().p_table.fetchall()) == 3 + assert len(p.dataset().p_table.fetchall()) == 3 # drops internal duckdb p = dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(":pipeline:")) p.run([1, 2, 3], table_name="p_table") p = p.drop() with pytest.raises(DatabaseUndefinedRelation): - p._dataset().p_table.fetchall() + p.dataset().p_table.fetchall() def test_duckdb_database_delete() -> None: diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index ac2ada2551..f9232f6a9d 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -349,7 +349,7 @@ 
def items(): pipeline.run([items()], loader_file_format=destination_config.file_format) - df = pipeline._dataset().items.df() + df = pipeline.dataset().items.df() assert len(df.index) == 20 @dlt.resource(table_name="items") @@ -359,5 +359,5 @@ def items2(): pipeline.run([items2()], loader_file_format=destination_config.file_format) # check df and arrow access - assert len(pipeline._dataset().items.df().index) == 50 - assert pipeline._dataset().items.arrow().num_rows == 50 + assert len(pipeline.dataset().items.df().index) == 50 + assert pipeline.dataset().items.arrow().num_rows == 50 diff --git a/tests/load/pipeline/test_bigquery.py b/tests/load/pipeline/test_bigquery.py index cb65c6bcf1..83982bb998 100644 --- a/tests/load/pipeline/test_bigquery.py +++ b/tests/load/pipeline/test_bigquery.py @@ -384,8 +384,8 @@ def resource(): bigquery_adapter(resource, autodetect_schema=True) pipeline.run(resource) - assert len(pipeline._dataset().items.df()) == 5 - assert len(pipeline._dataset().items__nested.df()) == 5 + assert len(pipeline.dataset().items.df()) == 5 + assert len(pipeline.dataset().items__nested.df()) == 5 @dlt.resource(primary_key="id", table_name="items", write_disposition="merge") def resource2(): @@ -395,5 +395,5 @@ def resource2(): bigquery_adapter(resource2, autodetect_schema=True) pipeline.run(resource2) - assert len(pipeline._dataset().items.df()) == 7 - assert len(pipeline._dataset().items__nested.df()) == 7 + assert len(pipeline.dataset().items.df()) == 7 + assert len(pipeline.dataset().items__nested.df()) == 7 diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index 98642bb263..76756556d5 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -273,10 +273,10 @@ def test_duckdb_credentials_separation( p2 = dlt.pipeline("p2", destination=duckdb(credentials=":pipeline:")) p1.run([1, 2, 3], table_name="p1_data") - p1_dataset = p1._dataset() + p1_dataset = p1.dataset() p2.run([1, 2, 3], table_name="p2_data") - p2_dataset = p2._dataset() + p2_dataset = p2.dataset() # both dataset should have independent duckdb databases # destinations should be bounded to pipelines still diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index 1949f2dfd7..f07532d818 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -161,7 +161,7 @@ def double_items(): ids=lambda x: x.name, ) def test_arrow_access(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -194,7 +194,7 @@ def test_arrow_access(populated_pipeline: Pipeline) -> None: ) def test_dataframe_access(populated_pipeline: Pipeline) -> None: # access via key - table_relationship = populated_pipeline._dataset()["items"] + table_relationship = populated_pipeline.dataset()["items"] total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -234,7 +234,7 @@ def test_dataframe_access(populated_pipeline: Pipeline) -> None: ) def test_db_cursor_access(populated_pipeline: Pipeline) -> None: # check fetch accessors - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items total_records = 
_total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -280,11 +280,11 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: # check correct error if not supported if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: with pytest.raises(NotImplementedError): - populated_pipeline._dataset().ibis() + populated_pipeline.dataset().ibis() return total_records = _total_records(populated_pipeline) - ibis_connection = populated_pipeline._dataset().ibis() + ibis_connection = populated_pipeline.dataset().ibis() map_i = lambda x: x if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": @@ -333,7 +333,7 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_hint_preservation(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items # check that hints are carried over to arrow table expected_decimal_precision = 10 expected_decimal_precision_2 = 12 @@ -361,7 +361,7 @@ def test_hint_preservation(populated_pipeline: Pipeline) -> None: ) def test_loads_table_access(populated_pipeline: Pipeline) -> None: # check loads table access, we should have one entry - loads_table = populated_pipeline._dataset()[populated_pipeline.default_schema.loads_table_name] + loads_table = populated_pipeline.dataset()[populated_pipeline.default_schema.loads_table_name] assert len(loads_table.fetchall()) == 1 @@ -376,7 +376,7 @@ def test_loads_table_access(populated_pipeline: Pipeline) -> None: def test_sql_queries(populated_pipeline: Pipeline) -> None: # simple check that query also works tname = populated_pipeline.sql_client().make_qualified_table_name("items") - query_relationship = populated_pipeline._dataset()(f"select * from {tname} where id < 20") + query_relationship = populated_pipeline.dataset()(f"select * from {tname} where id < 20") # we selected the first 20 table = query_relationship.arrow() @@ -388,7 +388,7 @@ def test_sql_queries(populated_pipeline: Pipeline) -> None: f"SELECT i.id, di.double_id FROM {tname} as i JOIN {tdname} as di ON (i.id = di.id) WHERE" " i.id < 20 ORDER BY i.id ASC" ) - join_relationship = populated_pipeline._dataset()(query) + join_relationship = populated_pipeline.dataset()(query) table = join_relationship.fetchall() assert len(table) == 20 assert list(table[0]) == [0, 0] @@ -405,7 +405,7 @@ def test_sql_queries(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_limit_and_head(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items assert len(table_relationship.head().fetchall()) == 5 assert len(table_relationship.limit(24).fetchall()) == 24 @@ -426,7 +426,7 @@ def test_limit_and_head(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_column_selection(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items columns = ["_dlt_load_id", "other_decimal"] data_frame = table_relationship.select(*columns).head().df() @@ -464,18 +464,18 @@ def test_schema_arg(populated_pipeline: Pipeline) -> None: """Simple test to ensure schemas may be selected via schema arg""" # if there is no arg, the defautl schema is used - dataset = populated_pipeline._dataset() 
+ dataset = populated_pipeline.dataset() assert dataset.schema.name == populated_pipeline.default_schema_name assert "items" in dataset.schema.tables # setting a different schema name will try to load that schema, # not find one and create an empty schema with that name - dataset = populated_pipeline._dataset(schema="unknown_schema") + dataset = populated_pipeline.dataset(schema="unknown_schema") assert dataset.schema.name == "unknown_schema" assert "items" not in dataset.schema.tables # providing the schema name of the right schema will load it - dataset = populated_pipeline._dataset(schema=populated_pipeline.default_schema_name) + dataset = populated_pipeline.dataset(schema=populated_pipeline.default_schema_name) assert dataset.schema.name == populated_pipeline.default_schema_name assert "items" in dataset.schema.tables diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index fbd4d412b3..51de3e0f76 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -538,5 +538,5 @@ def test_normalize_path_separator_legacy_behavior(test_storage: FileStorage) -> "_dlt_load_id", } # datasets must be the same - data_ = pipeline._dataset().issues_2.select("issue_id", "id").fetchall() + data_ = pipeline.dataset().issues_2.select("issue_id", "id").fetchall() print(data_) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index e58db64e5e..cefb9408c5 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1730,7 +1730,7 @@ def test_column_name_with_break_path() -> None: # get data assert_data_table_counts(pipeline, {"custom__path": 1}) # get data via dataset with dbapi - data_ = pipeline._dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() + data_ = pipeline.dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() assert data_ == [("custom", "c")] @@ -1754,7 +1754,7 @@ def test_column_name_with_break_path_legacy() -> None: # get data assert_data_table_counts(pipeline, {"custom_path": 1}) # get data via dataset with dbapi - data_ = pipeline._dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() + data_ = pipeline.dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() assert data_ == [("custom", "c")] @@ -1782,7 +1782,7 @@ def flattened_dict(): assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" # make sure data is there - data_ = pipeline._dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() + data_ = pipeline.dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() assert data_ == [(0, now)] @@ -1812,7 +1812,7 @@ def flattened_dict(): assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" # make sure data is there - data_ = pipeline._dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() + data_ = pipeline.dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() assert data_ == [(0, now)] diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py index a51052d247..32b16c234f 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -521,7 +521,7 @@ def test_parquet_with_flattened_columns() -> None: assert "issue__reactions__url" in pipeline.default_schema.tables["events"]["columns"] assert "issue_reactions_url" not in 
pipeline.default_schema.tables["events"]["columns"] - events_table = pipeline._dataset().events.arrow() + events_table = pipeline.dataset().events.arrow() assert "issue__reactions__url" in events_table.schema.names assert "issue_reactions_url" not in events_table.schema.names @@ -536,7 +536,7 @@ def test_parquet_with_flattened_columns() -> None: info = pipeline.run(events_table, table_name="events", loader_file_format="parquet") assert_load_info(info) - events_table_new = pipeline._dataset().events.arrow() + events_table_new = pipeline.dataset().events.arrow() assert events_table.schema == events_table_new.schema # double row count assert events_table.num_rows * 2 == events_table_new.num_rows From 86964eab2b9f49acd31e67c907d909297f8366bc Mon Sep 17 00:00:00 2001 From: dave Date: Mon, 9 Dec 2024 18:12:33 +0100 Subject: [PATCH 03/35] rework transformation section --- .../website/docs/build-a-pipeline-tutorial.md | 27 +++-- .../docs/dlt-ecosystem/destinations/duckdb.md | 2 +- .../dlt-ecosystem/transformations/dbt/dbt.md | 8 +- .../dlt-ecosystem/transformations/index.md | 27 +++++ .../dlt-ecosystem/transformations/pandas.md | 42 ------- .../dlt-ecosystem/transformations/python.md | 109 ++++++++++++++++++ .../docs/dlt-ecosystem/transformations/sql.md | 55 ++++++--- .../verified-sources/rest_api/basic.md | 2 +- .../general-usage/dataset-access/dataset.md | 2 + .../website/docs/general-usage/destination.md | 2 +- docs/website/docs/general-usage/state.md | 15 ++- .../docs/tutorial/load-data-from-an-api.md | 20 +++- docs/website/sidebars.js | 11 +- 13 files changed, 229 insertions(+), 93 deletions(-) create mode 100644 docs/website/docs/dlt-ecosystem/transformations/index.md delete mode 100644 docs/website/docs/dlt-ecosystem/transformations/pandas.md create mode 100644 docs/website/docs/dlt-ecosystem/transformations/python.md diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index f85d2e19ea..860657b16c 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -262,20 +262,30 @@ In this example, the first pipeline loads the data using `pipedrive_source()`. T #### [Using the `dlt` SQL client](dlt-ecosystem/transformations/sql.md) -Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: +Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of creating a new table with aggregated sales data in duckdb: ```py -pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") +pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm") with pipeline.sql_client() as client: client.execute_sql( - "INSERT INTO customers VALUES (%s, %s, %s)", 10, "Fred", "fred@fred.com" - ) + """ CREATE TABLE aggregated_sales AS + SELECT + category, + region, + SUM(amount) AS total_sales, + AVG(amount) AS average_sales + FROM + sales + GROUP BY + category, + region; + )""" ``` In this example, the `execute_sql` method of the SQL client allows you to execute SQL statements. The statement inserts a row with values into the `customers` table. 
-#### [Using Pandas](dlt-ecosystem/transformations/pandas.md) +#### [Using Pandas](dlt-ecosystem/transformations/python.md) You can fetch query results as Pandas data frames and perform transformations using Pandas functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: @@ -287,11 +297,8 @@ pipeline = dlt.pipeline( dev_mode=True ) -with pipeline.sql_client() as client: - with client.execute_query( - 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' - ) as cursor: - reactions = cursor.df() +# get a dataframe of all reactions from the dataset +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df() counts = reactions.sum(0).sort_values(0, ascending=False) ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 2b284e991a..7dcdad2e24 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -118,7 +118,7 @@ to disable tz adjustments. ## Destination configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to read data, use [datasets](../general-usage/dataset-access/dataset) instead of the sql client. The `duckdb` credentials do not require any secret values. [You are free to pass the credentials and configuration explicitly](../../general-usage/destination.md#pass-explicit-credentials). For example: ```py diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 449f8b8bde..59eb340ef2 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -1,10 +1,10 @@ --- -title: Transform the data with dbt +title: Transforming data with dbt description: Transforming the data loaded by a dlt pipeline with dbt keywords: [transform, dbt, runner] --- -# Transform the data with dbt +# Transforming data with dbt [dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of using dbt include: @@ -105,8 +105,8 @@ You can run the example with dbt debug log: `RUNTIME__LOG_LEVEL=DEBUG python dbt ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use dbt or one of the following: +If you want to transform your data before loading, you can use Python. If you want to transform your data after loading, you can use dbt or one of the following: 1. [`dlt` SQL client.](../sql.md) -2. 
[Pandas.](../pandas.md) +2. [Python with dataframes or arrow tables.](../python.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md new file mode 100644 index 0000000000..eb99753027 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -0,0 +1,27 @@ +--- +title: Transforming your data +description: How to transform your data +keywords: [datasets, data, access, transformations] +--- +import DocCardList from '@theme/DocCardList'; + +# Transforming data + +If you'd like to transform your data after a pipeline load, you have 3 options available to you: + +* [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier +* [Using the `dlt` SQL client](./sql.md) - dlt exposes an sql client to transform data on your destination directly using sql +* [Using python with dataframes or arrow tables](./python.md) - you can also transform your data using arrow tables and dataframes in python + +If you need to preprocess some of your data before it is loaded, you can learn about strategies to: + +* [Rename columns](../general-usage/customising-pipelines/renaming_columns) +* [Pseudonymize columns](../general-usage/customising-pipelines/pseudonymizing_columns) +* [Remove columns](../general-usage/customising-pipelines/removing_columns) + +This is particularly useful if you are trying to remove data related to PII or other sensitive data, you want to remove columns that are not needed for your use case or you are using a destination that does not support certain data types in your source data. + + +# Learn more + + diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md deleted file mode 100644 index e431313d1c..0000000000 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Transform the data with Pandas -description: Transform the data loaded by a dlt pipeline with Pandas -keywords: [transform, pandas] ---- - -# Transform the data with Pandas - -You can fetch the results of any SQL query as a dataframe. If the destination supports that -natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading -dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and -counts the reaction types. - -```py -pipeline = dlt.pipeline( - pipeline_name="github_pipeline", - destination="duckdb", - dataset_name="github_reactions", - dev_mode=True -) -with pipeline.sql_client() as client: - with client.execute_query( - 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' - ) as cursor: - # calling `df` on a cursor, returns the data as a pandas data frame - reactions = cursor.df() -counts = reactions.sum(0).sort_values(0, ascending=False) -``` - -The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in -chunks by passing the `chunk_size` argument to the `df` method. - -Once your data is in a Pandas dataframe, you can transform it as needed. - -## Other transforming tools - -If you want to transform the data before loading, you can use Python. If you want to transform the -data after loading, you can use Pandas or one of the following: - -1. [dbt.](dbt/dbt.md) (recommended) -2. 
[`dlt` SQL client.](sql.md)
-
diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md
new file mode 100644
index 0000000000..e88d9943cb
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/transformations/python.md
@@ -0,0 +1,109 @@
+---
+title: Transforming data in Python with arrow tables or dataframes
+description: Transforming data loaded by a dlt pipeline with pandas dataframes or arrow tables
+keywords: [transform, pandas]
+---
+
+# Transforming data in python with dataframes or arrow tables
+
+You can transform your data in python using pandas dataframes or arrow tables. To get started, please read the [dataset docs](../general-usage/dataset-access/dataset).
+
+
+## Interactively transforming your data in python
+
+Using the methods explained in the [dataset docs](../general-usage/dataset-access/dataset), you can fetch data from your destination into a dataframe or arrow table in your local python process and work with it interactively. This even works for filesystem destinations:
+
+
+The example below reads GitHub reactions data from the `issues` table and
+counts the reaction types.
+
+```py
+pipeline = dlt.pipeline(
+    pipeline_name="github_pipeline",
+    destination="duckdb",
+    dataset_name="github_reactions",
+    dev_mode=True
+)
+
+# get a dataframe of all reactions from the dataset
+reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df()
+
+# calculate and print out the sum of all reactions
+counts = reactions.sum(0).sort_values(0, ascending=False)
+print(counts)
+
+# alternatively, you can fetch the data as an arrow table
+reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").arrow()
+# ... do transformations on the arrow table
+```
+
+## Persisting your transformed data
+
+Since dlt supports dataframes and arrow tables from resources directly, you can use the same pipeline to load the transformed data back into the destination.
+
+
+### A simple example
+
+The simple example below creates a new table from an existing user table, keeping only the columns that do not contain private information. Note that we use the `iter_arrow()` method on the relation to iterate over the arrow table in chunks instead of fetching it all at once.
+
+```py
+pipeline = dlt.pipeline(
+    pipeline_name="users_pipeline",
+    destination="duckdb",
+    dataset_name="users_raw",
+    dev_mode=True
+)
+
+# get user relation with only a few columns selected, but omitting email and name
+users = pipeline.dataset().users.select("age", "amount_spent", "country")
+
+# load the data into a new table called users_clean in the same dataset
+pipeline.run(users.iter_arrow(chunk_size=1000), table_name="users_clean")
+```
+
+### A more complex example
+
+The example above could easily be done in SQL. Let's assume you'd like to do some actual arrow transformations in Python. For this, we will create a resource from which we can yield the modified arrow tables. The same is possible with dataframes.
+
+```py
+import pyarrow.compute as pc
+
+pipeline = dlt.pipeline(
+    pipeline_name="users_pipeline",
+    destination="duckdb",
+    dataset_name="users_raw",
+    dev_mode=True
+)
+
+# NOTE: this resource will work like a regular resource and support write_disposition, primary_key, etc.
+# NOTE: For selecting only users above 18, we could also use the filter method on the relation with ibis expressions
+@dlt.resource(table_name="users_clean")
+def users_clean():
+    users = pipeline.dataset().users
+    for arrow_table in users.iter_arrow(chunk_size=1000):
+
+        # we want to filter out users under 18
+        age_filter = pc.greater_equal(arrow_table["age"], 18)
+        arrow_table = arrow_table.filter(age_filter)
+
+        # we want to hash the email column
+        arrow_table = arrow_table.append_column("email_hash", pc.sha256(arrow_table["email"]))
+
+        # we want to remove the email column and name column
+        arrow_table = arrow_table.drop(["email", "name"])
+
+        # yield the transformed arrow table
+        yield arrow_table
+
+
+pipeline.run(users_clean())
+```
+
+## Other transforming tools
+
+If you want to transform your data before loading, you can use Python. If you want to transform the
+data after loading, you can use dataframes and arrow tables as shown above, or one of the following:
+
+1. [dbt.](dbt/dbt.md) (recommended)
+2. [`dlt` SQL client.](sql.md)
+
diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index ffd348d1a0..e6011a71ae 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -1,33 +1,52 @@
 ---
-title: Transform the data with SQL
+title: Transforming data with SQL
 description: Transforming the data loaded by a dlt pipeline with the dlt SQL client
 keywords: [transform, sql]
 ---
 
-# Transform the data using the `dlt` SQL client
+# Transforming data using the `dlt` SQL client
 
 A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the
-transformations using Python. The `execute_sql` method allows you to execute any SQL statement,
+transformations using sql statements in python. The `execute_sql` method allows you to execute any SQL statement,
 including statements that change the database schema or data in the tables. In the example below, we
 insert a row into the `customers` table. Note that the syntax is the same as for any standard
 `dbapi` connection.
 
+:::info
+* This method will work for all sql destinations supported by `dlt`, but not for the filesystem destination.
+* Read the [sql client docs](../general-usage/dataset-access/dataset) for more information on how to access data with the sql client.
+* If you are simply trying to read data, you should use the powerful [dataset interface](../general-usage/dataset-access/dataset) instead.
+:::
+
+
+Typically you will use this type of transformation if you can create or update tables directly from existing tables
+without any need to insert data from your python environment.
+
+The example below creates a new table `aggregated_sales` that contains the total and average sales for each category and region.
+
+
 ```py
-pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm")
-try:
-    with pipeline.sql_client() as client:
-        client.execute_sql(
-            "INSERT INTO customers VALUES (%s, %s, %s)",
-            10,
-            "Fred",
-            "fred@fred.com"
-        )
-except Exception:
-    ...
+pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm") + +# NOTE: this is the duckdb sql dialect, other destinations may use different expressions +with pipeline.sql_client() as client: + client.execute_sql( + """ CREATE OR REPLACE TABLE aggregated_sales AS + SELECT + category, + region, + SUM(amount) AS total_sales, + AVG(amount) AS average_sales + FROM + sales + GROUP BY + category, + region; + )""" ``` -In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row -corresponding to selected columns. +You can also use the `execute_sql` method to run select queries. The data is returned as a list of rows, with the elements of a row +corresponding to selected columns. A more convenient way to extract data is to use dlt datasets. ```py try: @@ -44,9 +63,9 @@ except Exception: ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the +If you want to transform your data before loading, you can use Python. If you want to transform the data after loading, you can use SQL or one of the following: 1. [dbt](dbt/dbt.md) (recommended). -2. [Pandas](pandas.md). +2. [Python with dataframes or arrow tables](python.md). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md index 14d9ecb04b..ea3c9c768b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md @@ -306,7 +306,7 @@ A resource configuration is used to define a [dlt resource](../../../general-usa - `write_disposition`: The write disposition for the resource. - `primary_key`: The primary key for the resource. - `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details. -- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform the data. +- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform your data. - `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. - `auth`: An optional `AuthConfig` instance. If passed, is used over the one defined in the [client](#client) definition. Example: ```py diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 07fe37ecb2..16ee7f870a 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -226,6 +226,8 @@ other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckd other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_name="limited_items") ``` +Learn more about [transforming data in python with dataframes or arrow tables](../dlt-ecosystem/transformations/python). + ### Using `ibis` to query the data Visit the [Native Ibis integration](./ibis-backend.md) guide to learn more. 
diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index fa133b6257..ba42869957 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -128,7 +128,7 @@ When loading data, `dlt` will access the destination in two cases: 1. At the beginning of the `run` method to sync the pipeline state with the destination (or if you call `pipeline.sync_destination` explicitly). 2. In the `pipeline.load` method - to migrate the schema and load the load package. -Obviously, `dlt` will access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). +`dlt` will also access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). :::note `dlt` will not import the destination dependencies or access destination configuration if access is not needed. You can build multi-stage pipelines where steps are executed in separate processes or containers - the `extract` and `normalize` step do not need destination dependencies, configuration, and actual connection. diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 46aa1d63ce..d849a093dc 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -123,14 +123,13 @@ def comments(user_id: str): # on the first pipeline run, the user_comments table does not yet exist so do not check at all # alternatively, catch DatabaseUndefinedRelation which is raised when an unknown table is selected if not current_pipeline.first_run: - with current_pipeline.sql_client() as client: - # we may get the last user comment or None which we replace with 0 - max_id = ( - client.execute_sql( - "SELECT MAX(_id) FROM user_comments WHERE user_id=?", user_id - )[0][0] - or 0 - ) + # get user comments table from pipeline dataset + user_comments = current_pipeline.dataset().user_comments + # get last user comment id with ibis expression, ibis-extras need to be installed + max_id = user_comments.filter(user_comments.user_id == user_id).select(user_comments["_id"].max()).df() + # if there are no comments for the user, max_id will be None, so we replace it with 0 + max_id = max_id[0] or 0 + # use max_id to filter our results (we simulate an API query) yield from [ {"_id": i, "value": letter, "user_id": user_id} diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index ddfef2cbe8..6a682457f4 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -72,7 +72,25 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs `dlt` just created a database schema called **mydata** (the `dataset_name`) with a table **users** in it. -### Explore the data +### Explore the data in python + +You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily query the data in pure python. 
+ +```py +# get the dataset +dataset = dlt.dataset("mydata") + +# get the user relation +table = dataset.users + +# query the full table as dataframe +print(table.df()) + +# query the first 10 rows as arrow table +print(table.limit(10).arrow()) +``` + +### Explore the data in streamlit To allow a sneak peek and basic discovery, you can take advantage of [built-in integration with Streamlit](../reference/command-line-interface#show-tables-and-data-in-the-destination): diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 274f3e82b3..53d1c2507d 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -210,13 +210,10 @@ const sidebars = { }, { type: 'category', - label: 'Transform the data', + label: 'Transforming data', link: { - type: 'generated-index', - title: 'Transform the data', - description: 'If you want to transform the data after loading, you can use one of the following methods: dbt, SQL, Pandas.', - slug: 'dlt-ecosystem/transformations', - keywords: ['transformations'], + type: 'doc', + id: 'dlt-ecosystem/transformations/index', }, items: [ { @@ -227,8 +224,8 @@ const sidebars = { 'dlt-ecosystem/transformations/dbt/dbt_cloud', ] }, + 'dlt-ecosystem/transformations/python', 'dlt-ecosystem/transformations/sql', - 'dlt-ecosystem/transformations/pandas', 'general-usage/customising-pipelines/renaming_columns', 'general-usage/customising-pipelines/pseudonymizing_columns', 'general-usage/customising-pipelines/removing_columns' From 581de8ebdc27eaf883ac4ffdf38f71b88af1bf03 Mon Sep 17 00:00:00 2001 From: dave Date: Mon, 9 Dec 2024 18:18:39 +0100 Subject: [PATCH 04/35] fix some linting errors --- docs/website/docs/build-a-pipeline-tutorial.md | 2 +- docs/website/docs/dlt-ecosystem/transformations/sql.md | 2 +- docs/website/docs/tutorial/load-data-from-an-api.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index 860657b16c..36d30a184f 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -280,7 +280,7 @@ with pipeline.sql_client() as client: GROUP BY category, region; - )""" + """) ``` In this example, the `execute_sql` method of the SQL client allows you to execute SQL statements. The statement inserts a row with values into the `customers` table. diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index e6011a71ae..02e5aa4396 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -42,7 +42,7 @@ with pipeline.sql_client() as client: GROUP BY category, region; - )""" + """) ``` You can also use the `execute_sql` method to run select queries. 
The data is returned as a list of rows, with the elements of a row diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 6a682457f4..4825c50fe3 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -78,7 +78,7 @@ You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily qu ```py # get the dataset -dataset = dlt.dataset("mydata") +dataset = pipeline.dataset("mydata") # get the user relation table = dataset.users From c4f19a46029f16d5fcf9b4061bab5fa86a2efabb Mon Sep 17 00:00:00 2001 From: dave Date: Tue, 19 Nov 2024 09:59:00 +0100 Subject: [PATCH 05/35] add row counts feature for readabledataset --- dlt/common/destination/reference.py | 4 ++ dlt/destinations/dataset.py | 32 ++++++++++- tests/load/test_read_interfaces.py | 85 ++++++++++++++++++++++++++++- 3 files changed, 119 insertions(+), 2 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index e27f99cde7..f4c8fdb6ed 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -588,6 +588,10 @@ def __getattr__(self, table: str) -> SupportsReadableRelation: ... def ibis(self) -> IbisBackend: ... + def row_counts( + self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None + ) -> SupportsReadableRelation: ... + class JobClientBase(ABC): def __init__( diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py index 27a7f5a7af..594e863f30 100644 --- a/dlt/destinations/dataset.py +++ b/dlt/destinations/dataset.py @@ -1,4 +1,8 @@ -from typing import Any, Generator, Sequence, Union, TYPE_CHECKING, Tuple +from typing import Any, Generator, Sequence, Union, List, TYPE_CHECKING, Tuple +from dlt.common.json import json +from copy import deepcopy + +from dlt.common.normalizers.naming.naming import NamingConvention from contextlib import contextmanager @@ -322,6 +326,32 @@ def __getattr__(self, table_name: str) -> SupportsReadableRelation: """access of table via property notation""" return self.table(table_name) + def row_counts( + self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None + ) -> SupportsReadableRelation: + """Returns a dictionary of table names and their row counts, returns counts of all data tables by default""" + """If table_names is provided, only the tables in the list are returned regardless of the data_tables and dlt_tables flags""" + + selected_tables = table_names or [] + if not selected_tables: + if data_tables: + selected_tables += self.schema.data_table_names(seen_data_only=True) + if dlt_tables: + selected_tables += self.schema.dlt_table_names() + + # Build UNION ALL query to get row counts for all selected tables + queries = [] + for table in selected_tables: + queries.append( + f"SELECT '{table}' as table_name, COUNT(*) as row_count FROM" + f" {self.sql_client.make_qualified_table_name(table)}" + ) + + query = " UNION ALL ".join(queries) + + # Execute query and build result dict + return self(query) + def dataset( destination: TDestinationReferenceArg, diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index f07532d818..85b7ae7a41 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -211,7 +211,6 @@ def test_dataframe_access(populated_pipeline: Pipeline) -> None: if not skip_df_chunk_size_check: assert len(df.index) == chunk_size - 
# lowercase results for the snowflake case assert set(df.columns.values) == set(EXPECTED_COLUMNS) # iterate all dataframes @@ -365,6 +364,90 @@ def test_loads_table_access(populated_pipeline: Pipeline) -> None: assert len(loads_table.fetchall()) == 1 +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_row_counts(populated_pipeline: Pipeline) -> None: + total_records = _total_records(populated_pipeline) + + dataset = populated_pipeline.dataset() + # default is all data tables + assert set(dataset.row_counts().df().itertuples(index=False, name=None)) == { + ( + "items", + total_records, + ), + ( + "double_items", + total_records, + ), + ( + "items__children", + total_records * 2, + ), + } + # get only one data table + assert set( + dataset.row_counts(table_names=["items"]).df().itertuples(index=False, name=None) + ) == { + ( + "items", + total_records, + ), + } + # get all dlt tables + assert set( + dataset.row_counts(dlt_tables=True, data_tables=False) + .df() + .itertuples(index=False, name=None) + ) == { + ( + "_dlt_version", + 1, + ), + ( + "_dlt_loads", + 1, + ), + ( + "_dlt_pipeline_state", + 1, + ), + } + # get them all + assert set(dataset.row_counts(dlt_tables=True).df().itertuples(index=False, name=None)) == { + ( + "_dlt_version", + 1, + ), + ( + "_dlt_loads", + 1, + ), + ( + "_dlt_pipeline_state", + 1, + ), + ( + "items", + total_records, + ), + ( + "double_items", + total_records, + ), + ( + "items__children", + total_records * 2, + ), + } + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( From c8fc3c05d9540e6615373dfb23fc23d32bd5ff8b Mon Sep 17 00:00:00 2001 From: dave Date: Wed, 11 Dec 2024 11:14:18 +0100 Subject: [PATCH 06/35] add dataset access example to getting started scripts --- docs/website/docs/intro.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index b20d41c494..dbc72023fd 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -70,6 +70,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(source) + +# print load info and posts table as dataframe +print(load_info) +print(pipeline.dataset().posts.df()) ``` Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about the source configuration and pagination methods. @@ -92,6 +96,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(source) + +# print load info and the "family" table as dataframe +print(load_info) +print(pipeline.dataset().family.df()) ``` Follow the [SQL source tutorial](./tutorial/sql-database) to learn more about the source configuration and supported databases. @@ -116,6 +124,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(resource) + +# print load info and the "example" table as dataframe +print(load_info) +print(pipeline.dataset().example.df()) ``` Follow the [filesystem source tutorial](./tutorial/filesystem) to learn more about the source configuration and supported storage services. 
@@ -128,7 +140,7 @@ dlt is able to load data from Python generators or directly from Python data str ```py import dlt -@dlt.resource +@dlt.resource(table_name="foo_data") def foo(): for i in range(10): yield {"id": i, "name": f"This is item {i}"} @@ -139,6 +151,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(foo) + +# print load info and the "foo_data" table as dataframe +print(load_info) +print(pipeline.dataset().foo_table.df()) ``` Check out the [Python data structures tutorial](./tutorial/load-data-from-an-api) to learn about dlt fundamentals and advanced usage scenarios. From 996266941d210b1c05ea6e277a3005a88656a783 Mon Sep 17 00:00:00 2001 From: dave Date: Wed, 11 Dec 2024 11:22:03 +0100 Subject: [PATCH 07/35] add notes about row_counts special query to datasets docs --- .../docs/general-usage/dataset-access/dataset.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 30b03154cd..36d3fd3a26 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -40,6 +40,9 @@ Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain ```py # Get the readable dataset from the pipeline dataset = pipeline.dataset() + +# print the row counts of all tables in the destination as dataframe +print(dataset.row_counts().df()) ``` ### Access tables as `ReadableRelation` @@ -116,6 +119,18 @@ for items_chunk in items_relation.iter_fetch(chunk_size=500): The methods available on the ReadableRelation correspond to the methods available on the cursor returned by the SQL client. Please refer to the [SQL client](./sql-client.md#supported-methods-on-the-cursor) guide for more information. +## Special queries + +You can use the `row_counts` method to get the row counts of all tables in the destination as a dataframe. + +```py +# print the row counts of all tables in the destination as dataframe +print(dataset.row_counts().df()) + +# or as tuples +print(dataset.row_counts().fetchall()) +``` + ## Modifying queries You can refine your data retrieval by limiting the number of records, selecting specific columns, or chaining these operations. From 1fc98912f2801e3ef5713325a497ef2d05cd516e Mon Sep 17 00:00:00 2001 From: dave Date: Wed, 11 Dec 2024 11:45:07 +0100 Subject: [PATCH 08/35] fix internal docusaurus links --- docs/website/docs/dlt-ecosystem/destinations/duckdb.md | 2 +- docs/website/docs/dlt-ecosystem/transformations/index.md | 6 +++--- docs/website/docs/dlt-ecosystem/transformations/python.md | 4 ++-- docs/website/docs/dlt-ecosystem/transformations/sql.md | 4 ++-- docs/website/docs/general-usage/dataset-access/dataset.md | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 7dcdad2e24..8c49401d13 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -118,7 +118,7 @@ to disable tz adjustments. ## Destination configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. 
See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to read data, use [datasets](../general-usage/dataset-access/dataset) instead of the sql client. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to read data, use [datasets](../../general-usage/dataset-access/dataset) instead of the sql client. The `duckdb` credentials do not require any secret values. [You are free to pass the credentials and configuration explicitly](../../general-usage/destination.md#pass-explicit-credentials). For example: ```py diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index eb99753027..7addd2848d 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -15,9 +15,9 @@ If you'd like to transform your data after a pipeline load, you have 3 options a If you need to preprocess some of your data before it is loaded, you can learn about strategies to: -* [Rename columns](../general-usage/customising-pipelines/renaming_columns) -* [Pseudonymize columns](../general-usage/customising-pipelines/pseudonymizing_columns) -* [Remove columns](../general-usage/customising-pipelines/removing_columns) +* [Rename columns](../../general-usage/customising-pipelines/renaming_columns) +* [Pseudonymize columns](../../general-usage/customising-pipelines/pseudonymizing_columns) +* [Remove columns](../../general-usage/customising-pipelines/removing_columns) This is particularly useful if you are trying to remove data related to PII or other sensitive data, you want to remove columns that are not needed for your use case or you are using a destination that does not support certain data types in your source data. diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index e88d9943cb..91ece11688 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -6,12 +6,12 @@ keywords: [transform, pandas] # Transforming data in python with dataframes or arrow tables -You can transform your data in python using pandas dataframes or arrow tables. To get started, please read the [dataset docs](../general-usage/dataset-access/dataset). +You can transform your data in python using pandas dataframes or arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). ## Interactively transforming your data in python -Using the methods explained in the [dataset docs](../general-usage/dataset-access/dataset), you can fetch data from your destination into a dataframe or arrow table in your local python process and work with it interactively. This even works for filesystem destinations: +Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a dataframe or arrow table in your local python process and work with it interactively. 
This even works for filesystem destinations: The example below reads GitHub reactions data from the `issues` table and diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index 02e5aa4396..c155289253 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -14,8 +14,8 @@ connection. :::info * This method will work for all sql destinations supported by `dlt`, but not for the filesystem destination. -* Read the [sql client docs](../general-usage/dataset-access/dataset) for more information on how to access data with the sql client. -* If you are simply trying to read data, you should use the powerful [dataset interface](../general-usage/dataset-access/dataset) instead. +* Read the [sql client docs](../../ general-usage/dataset-access/dataset) for more information on how to access data with the sql client. +* If you are simply trying to read data, you should use the powerful [dataset interface](../../general-usage/dataset-access/dataset) instead. ::: diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 36d3fd3a26..5afa9682fe 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -299,7 +299,7 @@ other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckd other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_name="limited_items") ``` -Learn more about [transforming data in python with dataframes or arrow tables](../dlt-ecosystem/transformations/python). +Learn more about [transforming data in python with dataframes or arrow tables](../../dlt-ecosystem/transformations/python). ### Using `ibis` to query the data From d6ceab0b325e25cbdf4c9023ff493f49a9877fdb Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:35:12 +0100 Subject: [PATCH 09/35] Update docs/website/docs/intro.md --- docs/website/docs/intro.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index dbc72023fd..bc227b85ad 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -154,7 +154,7 @@ load_info = pipeline.run(foo) # print load info and the "foo_data" table as dataframe print(load_info) -print(pipeline.dataset().foo_table.df()) +print(pipeline.dataset().foo_data.df()) ``` Check out the [Python data structures tutorial](./tutorial/load-data-from-an-api) to learn about dlt fundamentals and advanced usage scenarios. 
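The intro.md snippets added in PATCH 06 and corrected in PATCH 09 all finish by reading the loaded table back through `pipeline.dataset()`. A minimal, self-contained sketch of that pattern, assuming a local duckdb destination; the pipeline name is illustrative, while `dataset()`, `.df()` and `row_counts()` are the interfaces documented in this series:

```py
import dlt

@dlt.resource(table_name="foo_data")
def foo():
    for i in range(10):
        yield {"id": i, "name": f"This is item {i}"}

# illustrative pipeline name; duckdb keeps the example self-contained
pipeline = dlt.pipeline(pipeline_name="dataset_access_demo", destination="duckdb")
load_info = pipeline.run(foo)
print(load_info)

# read the loaded table back as a dataframe
print(pipeline.dataset().foo_data.df())

# and check row counts of all data tables
print(pipeline.dataset().row_counts().df())
```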
From 6a133915de096584f4112be46f89c3dab9ac3a35 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:35:23 +0100 Subject: [PATCH 10/35] Update docs/website/docs/tutorial/load-data-from-an-api.md --- docs/website/docs/tutorial/load-data-from-an-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 4825c50fe3..93c6d819b8 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -90,7 +90,7 @@ print(table.df()) print(table.limit(10).arrow()) ``` -### Explore the data in streamlit +### Explore data in Streamlit To allow a sneak peek and basic discovery, you can take advantage of [built-in integration with Streamlit](../reference/command-line-interface#show-tables-and-data-in-the-destination): From 8068b7be468d14c843a6e07e8ebeea6fac103917 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:35:32 +0100 Subject: [PATCH 11/35] Update docs/website/docs/tutorial/load-data-from-an-api.md --- docs/website/docs/tutorial/load-data-from-an-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 93c6d819b8..739d3c3164 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -74,7 +74,7 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs ### Explore the data in python -You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily query the data in pure python. +You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily query the data in pure Python. ```py # get the dataset From d8f5cf7012753eba8e4f205d90c854c7a88913b7 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:35:41 +0100 Subject: [PATCH 12/35] Update docs/website/docs/tutorial/load-data-from-an-api.md --- docs/website/docs/tutorial/load-data-from-an-api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 739d3c3164..73f780ba7a 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -72,7 +72,7 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs `dlt` just created a database schema called **mydata** (the `dataset_name`) with a table **users** in it. -### Explore the data in python +### Explore data in Python You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily query the data in pure Python. 
From c2647a54e50e90358ffffe11b1ac42b8182bd1f2 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:35:49 +0100 Subject: [PATCH 13/35] Update docs/website/docs/general-usage/dataset-access/dataset.md --- docs/website/docs/general-usage/dataset-access/dataset.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 5afa9682fe..70df7e3c82 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -301,7 +301,7 @@ other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_n Learn more about [transforming data in python with dataframes or arrow tables](../../dlt-ecosystem/transformations/python). -### Using `ibis` to query the data +### Using `ibis` to query data Visit the [Native Ibis integration](./ibis-backend.md) guide to learn more. From afa16331e3e394545865533fc149a8aff8ca2ab0 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:35:58 +0100 Subject: [PATCH 14/35] Update docs/website/docs/general-usage/dataset-access/dataset.md --- docs/website/docs/general-usage/dataset-access/dataset.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 70df7e3c82..efc03cc076 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -299,7 +299,7 @@ other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckd other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_name="limited_items") ``` -Learn more about [transforming data in python with dataframes or arrow tables](../../dlt-ecosystem/transformations/python). +Learn more about [transforming data in Python with Arrow tables or DataFrames](../../dlt-ecosystem/transformations/python). 
### Using `ibis` to query data From 65d3c748e04d074395c1cf430a08f06fe3fb7387 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:36:06 +0100 Subject: [PATCH 15/35] Update docs/website/docs/dlt-ecosystem/transformations/index.md --- docs/website/docs/dlt-ecosystem/transformations/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index 7addd2848d..457db2bd12 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -15,7 +15,7 @@ If you'd like to transform your data after a pipeline load, you have 3 options a If you need to preprocess some of your data before it is loaded, you can learn about strategies to: -* [Rename columns](../../general-usage/customising-pipelines/renaming_columns) +* [Rename columns.](../../general-usage/customising-pipelines/renaming_columns) * [Pseudonymize columns](../../general-usage/customising-pipelines/pseudonymizing_columns) * [Remove columns](../../general-usage/customising-pipelines/removing_columns) From 949fdca320205367a44dc774c7f0fbaff50e7153 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:36:16 +0100 Subject: [PATCH 16/35] Update docs/website/docs/dlt-ecosystem/transformations/index.md --- docs/website/docs/dlt-ecosystem/transformations/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index 457db2bd12..1963c80f5b 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -11,7 +11,7 @@ If you'd like to transform your data after a pipeline load, you have 3 options a * [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier * [Using the `dlt` SQL client](./sql.md) - dlt exposes an sql client to transform data on your destination directly using sql -* [Using python with dataframes or arrow tables](./python.md) - you can also transform your data using arrow tables and dataframes in python +* [Using Python with DataFrames or Arrow tables](./python.md) - you can also transform your data using Arrow tables and DataFrames in Python. 
If you need to preprocess some of your data before it is loaded, you can learn about strategies to: From 5918440789d120ae5d7384c90c9e56d0c777b0f0 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:36:23 +0100 Subject: [PATCH 17/35] Update docs/website/docs/dlt-ecosystem/transformations/index.md --- docs/website/docs/dlt-ecosystem/transformations/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index 1963c80f5b..b304aaf694 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -10,7 +10,7 @@ import DocCardList from '@theme/DocCardList'; If you'd like to transform your data after a pipeline load, you have 3 options available to you: * [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier -* [Using the `dlt` SQL client](./sql.md) - dlt exposes an sql client to transform data on your destination directly using sql +* [Using the `dlt` SQL client](./sql.md) - dlt exposes an SQL client to transform data on your destination directly using SQL. * [Using Python with DataFrames or Arrow tables](./python.md) - you can also transform your data using Arrow tables and DataFrames in Python. If you need to preprocess some of your data before it is loaded, you can learn about strategies to: From ee36d4d7db6119bfeee12558f2b0c56fc7f7e513 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:36:30 +0100 Subject: [PATCH 18/35] Update docs/website/docs/dlt-ecosystem/transformations/index.md --- docs/website/docs/dlt-ecosystem/transformations/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index b304aaf694..ea012e0a43 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -9,7 +9,7 @@ import DocCardList from '@theme/DocCardList'; If you'd like to transform your data after a pipeline load, you have 3 options available to you: -* [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier +* [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier. * [Using the `dlt` SQL client](./sql.md) - dlt exposes an SQL client to transform data on your destination directly using SQL. * [Using Python with DataFrames or Arrow tables](./python.md) - you can also transform your data using Arrow tables and DataFrames in Python. From b475cca8586150b53e0a2ecc8dbb125c64aa4b58 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:36:36 +0100 Subject: [PATCH 19/35] Update docs/website/docs/dlt-ecosystem/destinations/duckdb.md --- docs/website/docs/dlt-ecosystem/destinations/duckdb.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 8c49401d13..a4537195ff 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -118,7 +118,7 @@ to disable tz adjustments. 
## Destination configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to read data, use [datasets](../../general-usage/dataset-access/dataset) instead of the sql client. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in **read/write** mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to **read** data, use [pipeline.dataset()](../../general-usage/dataset-access/dataset) instead of `sql_client`. The `duckdb` credentials do not require any secret values. [You are free to pass the credentials and configuration explicitly](../../general-usage/destination.md#pass-explicit-credentials). For example: ```py From 68c1db5cb2d3cf5f46e73e6ab804247c7c305391 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:36:56 +0100 Subject: [PATCH 20/35] Update docs/website/docs/dlt-ecosystem/transformations/index.md --- docs/website/docs/dlt-ecosystem/transformations/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index ea012e0a43..9221047471 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -16,7 +16,7 @@ If you'd like to transform your data after a pipeline load, you have 3 options a If you need to preprocess some of your data before it is loaded, you can learn about strategies to: * [Rename columns.](../../general-usage/customising-pipelines/renaming_columns) -* [Pseudonymize columns](../../general-usage/customising-pipelines/pseudonymizing_columns) +* [Pseudonymize columns.](../../general-usage/customising-pipelines/pseudonymizing_columns) * [Remove columns](../../general-usage/customising-pipelines/removing_columns) This is particularly useful if you are trying to remove data related to PII or other sensitive data, you want to remove columns that are not needed for your use case or you are using a destination that does not support certain data types in your source data. 
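The duckdb.md change in PATCH 19 points readers at `pipeline.dataset()` for reads and keeps `sql_client` for read/write work. A minimal sketch of that split, assuming a local duckdb destination; the pipeline and table names are illustrative:

```py
import dlt

pipeline = dlt.pipeline(pipeline_name="duckdb_read_demo", destination="duckdb")
pipeline.run([{"id": 1}, {"id": 2}], table_name="events")

# read/write: a thin wrapper over DuckDBPyConnection
with pipeline.sql_client() as client:
    client.execute_sql("DELETE FROM events WHERE id = 2")

# read-only access: the dataset interface
print(pipeline.dataset().events.df())
```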
From 986785f916dc35fce389c13faa8125e11f08bbbc Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:37:09 +0100 Subject: [PATCH 21/35] Update docs/website/docs/dlt-ecosystem/transformations/index.md --- docs/website/docs/dlt-ecosystem/transformations/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md index 9221047471..6c51e8cd8d 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/index.md +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -17,7 +17,7 @@ If you need to preprocess some of your data before it is loaded, you can learn a * [Rename columns.](../../general-usage/customising-pipelines/renaming_columns) * [Pseudonymize columns.](../../general-usage/customising-pipelines/pseudonymizing_columns) -* [Remove columns](../../general-usage/customising-pipelines/removing_columns) +* [Remove columns.](../../general-usage/customising-pipelines/removing_columns) This is particularly useful if you are trying to remove data related to PII or other sensitive data, you want to remove columns that are not needed for your use case or you are using a destination that does not support certain data types in your source data. From 0c21712911fd204be154f7a60211434a3cab79c5 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:37:20 +0100 Subject: [PATCH 22/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index 91ece11688..ae2131a923 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -1,5 +1,5 @@ --- -title: Transforming data in Python with arrow tables or dataframes +title: Transforming data in Python with Arrow tables or DataFrames description: Transforming data loaded by a dlt pipeline with pandas dataframes or arrow tables keywords: [transform, pandas] --- From ce4c2ff3fb89f8c55bdb70966d8abc644b407add Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:37:32 +0100 Subject: [PATCH 23/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index ae2131a923..29b32d53e7 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -4,7 +4,7 @@ description: Transforming data loaded by a dlt pipeline with pandas dataframes o keywords: [transform, pandas] --- -# Transforming data in python with dataframes or arrow tables +# Transforming data in Python with Arrow tables or DataFrames You can transform your data in python using pandas dataframes or arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). 
From 3306e9a1d381f1b25e1bbc2c20e380477f6cadd6 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:37:46 +0100 Subject: [PATCH 24/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index 29b32d53e7..5359aa5525 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -6,7 +6,7 @@ keywords: [transform, pandas] # Transforming data in Python with Arrow tables or DataFrames -You can transform your data in python using pandas dataframes or arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). +You can transform your data in Python using Pandas DataFrames or Arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). ## Interactively transforming your data in python From 39f68d8f7516834e24e589a13af7310b76389ce4 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:37:59 +0100 Subject: [PATCH 25/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index 5359aa5525..88d8ecb132 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -9,7 +9,7 @@ keywords: [transform, pandas] You can transform your data in Python using Pandas DataFrames or Arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). -## Interactively transforming your data in python +## Interactively transforming your data in Python Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a dataframe or arrow table in your local python process and work with it interactively. This even works for filesystem destinations: From 4914b1c186bd17972350f5b6bae6b8b1885f912b Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:38:15 +0100 Subject: [PATCH 26/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index 88d8ecb132..0081ead2d9 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -11,7 +11,7 @@ You can transform your data in Python using Pandas DataFrames or Arrow tables. T ## Interactively transforming your data in Python -Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a dataframe or arrow table in your local python process and work with it interactively. 
This even works for filesystem destinations: +Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a DataFrame or Arrow table in your local Python process and work with it interactively. This even works for filesystem destinations: The example below reads GitHub reactions data from the `issues` table and From 1464341de31cb0057f370c12b9bd7f87eeccdbd9 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:38:31 +0100 Subject: [PATCH 27/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index 0081ead2d9..844217ffc1 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -39,7 +39,7 @@ reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", " ## Persisting your transformed data -Since dlt supports dataframes and arrow tables from resources directly, you can use the same pipeline to load the transformed data back into the destination. +Since dlt supports DataFrames and Arrow tables from resources directly, you can use the same pipeline to load the transformed data back into the destination. ### A simple example From 316c5496f7e2aa0948585e538996d38e898282fc Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:38:46 +0100 Subject: [PATCH 28/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md --- docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md index 844217ffc1..b27fa4e6d1 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/python.md +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -44,7 +44,7 @@ Since dlt supports DataFrames and Arrow tables from resources directly, you can ### A simple example -A simple example that creates a new table from an existing user table but only with columns that do not contain private information. Note that we use the iter_arrow() method on the relation to iterate over the arrow table instead of fetching it all at once. +A simple example that creates a new table from an existing user table but only with columns that do not contain private information. Note that we use the `iter_arrow()` method on the relation to iterate over the arrow table instead of fetching it all at once. 
 ```py
 pipeline = dlt.pipeline(

From 9471cdb0917477f0e5e8d4e672e1e3f6b6fc4eea Mon Sep 17 00:00:00 2001
From: Alena Astrakhantseva
Date: Fri, 13 Dec 2024 15:39:04 +0100
Subject: [PATCH 29/35] Update docs/website/docs/dlt-ecosystem/transformations/python.md

---
 docs/website/docs/dlt-ecosystem/transformations/python.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md
index b27fa4e6d1..d43f8caaca 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/python.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/python.md
@@ -63,7 +63,7 @@ pipeline.run(users.iter_arrow(chunk_size=1000), table_name="users_clean")

 ### A more complex example

-The example above could easily be done in SQL. Let's assume you'd like to actually do some in python arrow transformations. For this will create a resources from which we can yield the modified arrow tables. The same is possibly with dataframes.
+The example above could easily be done in SQL. Let's assume you'd like to do some Arrow transformations in Python. For this, we will create a resource from which we can yield the modified Arrow tables. The same is possible with DataFrames.

 ```py
 import pyarrow.compute as pc

From 615fdf4409f71c2b58126129d448236f521ef936 Mon Sep 17 00:00:00 2001
From: Alena Astrakhantseva
Date: Fri, 13 Dec 2024 15:39:22 +0100
Subject: [PATCH 30/35] Update docs/website/docs/dlt-ecosystem/transformations/sql.md

---
 docs/website/docs/dlt-ecosystem/transformations/sql.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index c155289253..3722234c9c 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -7,7 +7,7 @@ keywords: [transform, sql]
 # Transforming data using the `dlt` SQL client

 A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the
-transformations using sql statements in python. The `execute_sql` method allows you to execute any SQL statement,
+transformations using SQL statements in Python. The `execute_sql` method allows you to execute any SQL statement,
 including statements that change the database schema or data in the tables.
 In the example below, we insert a row into the `customers` table. Note that the syntax is the same as for any standard
 `dbapi` connection.

From 4690bcfae08fb48510ac34abb7e46c4ea9be3687 Mon Sep 17 00:00:00 2001
From: Alena Astrakhantseva
Date: Fri, 13 Dec 2024 15:39:35 +0100
Subject: [PATCH 31/35] Update docs/website/docs/dlt-ecosystem/transformations/sql.md

---
 docs/website/docs/dlt-ecosystem/transformations/sql.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index 3722234c9c..5358e65801 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -13,7 +13,7 @@ insert a row into the `customers` table. Note that the syntax is the same as for
 connection.

 :::info
-* This method will work for all sql destinations supported by `dlt`, but not for the filesystem destination.
+* This method will work for all SQL destinations supported by `dlt`, but not for the filesystem destination.
 * Read the [sql client docs](../../ general-usage/dataset-access/dataset) for more information on how to access data with the sql client.
 * If you are simply trying to read data, you should use the powerful [dataset interface](../../general-usage/dataset-access/dataset) instead.
 :::

From e0f65cdc391bc6c6a2919f35c8f78aa5661e6bff Mon Sep 17 00:00:00 2001
From: Alena Astrakhantseva
Date: Fri, 13 Dec 2024 15:39:47 +0100
Subject: [PATCH 32/35] Update docs/website/docs/dlt-ecosystem/transformations/sql.md

---
 docs/website/docs/dlt-ecosystem/transformations/sql.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index 5358e65801..766a99f817 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -14,7 +14,7 @@ connection.

 :::info
 * This method will work for all SQL destinations supported by `dlt`, but not for the filesystem destination.
-* Read the [sql client docs](../../ general-usage/dataset-access/dataset) for more information on how to access data with the sql client.
+* Read the [SQL client docs](../../general-usage/dataset-access/dataset) for more information on how to access data with the SQL client.
 * If you are simply trying to read data, you should use the powerful [dataset interface](../../general-usage/dataset-access/dataset) instead.
 :::

From d537b1f34b58ca57ce0fbf5737eabd901ca7bc47 Mon Sep 17 00:00:00 2001
From: Alena Astrakhantseva
Date: Fri, 13 Dec 2024 15:39:57 +0100
Subject: [PATCH 33/35] Update docs/website/docs/dlt-ecosystem/transformations/sql.md

---
 docs/website/docs/dlt-ecosystem/transformations/sql.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index 766a99f817..9d2e90fb18 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -20,7 +20,7 @@ connection.

 Typically you will use this type of transformation if you can create or update tables directly from existing tables
-without any need to insert data from your python environment.
+without any need to insert data from your Python environment.

 The example below creates a new table `aggregated_sales` that contains the total and average sales for each category and region

From 3d3b6382c9bb098cd1784197804ee8453612c5d9 Mon Sep 17 00:00:00 2001
From: Alena Astrakhantseva
Date: Fri, 13 Dec 2024 15:40:13 +0100
Subject: [PATCH 34/35] Update docs/website/docs/dlt-ecosystem/transformations/sql.md

---
 docs/website/docs/dlt-ecosystem/transformations/sql.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md
index 9d2e90fb18..60f3e7f7a5 100644
--- a/docs/website/docs/dlt-ecosystem/transformations/sql.md
+++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md
@@ -67,5 +67,5 @@ If you want to transform your data before loading, you can use Python. If you wa
 data after loading, you can use SQL or one of the following:

 1. [dbt](dbt/dbt.md) (recommended).
-2. [Python with dataframes or arrow tables](python.md).
+2. [Python with DataFrames or Arrow tables](python.md).
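The python.md passage reworded in PATCH 29 describes yielding modified Arrow tables from a resource. A minimal sketch of that pattern, assuming a `users` table with `id`, `name` and `email` columns was already loaded by `pipeline`; the table names and the upper-casing transform are illustrative:

```py
import dlt
import pyarrow as pa
import pyarrow.compute as pc

# illustrative setup: a tiny users table in a local duckdb destination
pipeline = dlt.pipeline(pipeline_name="users_demo", destination="duckdb")
pipeline.run(
    [{"id": 1, "name": "alice", "email": "alice@example.com"}],
    table_name="users",
)

@dlt.resource(table_name="users_clean")
def users_clean():
    # stream the existing table in arrow chunks and transform each chunk
    for chunk in pipeline.dataset().users.iter_arrow(chunk_size=1000):
        yield pa.table(
            {
                # keep only non-private columns and upper-case the name
                "id": chunk["id"],
                "name": pc.utf8_upper(chunk["name"]),
            }
        )

pipeline.run(users_clean())
print(pipeline.dataset().users_clean.df())
```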
From 16954fba8f9eab5a622b3ae978d0cef4c950878e Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 13 Dec 2024 15:40:31 +0100 Subject: [PATCH 35/35] Update docs/website/docs/general-usage/dataset-access/dataset.md --- docs/website/docs/general-usage/dataset-access/dataset.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index efc03cc076..f9c01603f6 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -121,7 +121,7 @@ The methods available on the ReadableRelation correspond to the methods availabl ## Special queries -You can use the `row_counts` method to get the row counts of all tables in the destination as a dataframe. +You can use the `row_counts` method to get the row counts of all tables in the destination as a DataFrame. ```py # print the row counts of all tables in the destination as dataframe