Merge remote-tracking branch 'origin/main'
syou6162 committed Dec 19, 2024
2 parents 5734af4 + 7b12318 commit 468c808
Showing 16 changed files with 662 additions and 48 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -9,14 +9,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]

### Added
- Support for exporting a Data Contract to an Iceberg schema definition.
- When importing from dbt format, map the dbt `not_null` information to the datacontract `required` field (#547)

### Changed
- Type conversion when importing contracts into dbt and exporting contracts from dbt (#534)
- Ensure 'name' is the first column when exporting in dbt format, considering column attributes (#541)
- Rename dbt's `tests` to `data_tests` (#548)

### Fixed
- Modify the arguments to narrow down the import target with `--dbt-model` (#532)
- SodaCL: Prevent `KeyError: 'fail'` when testing with SodaCL
- Populate database and schema values for BigQuery in exported dbt sources (#543)
- Fix the options for importing and exporting to standard output (#544)

## [0.10.15] - 2024-10-26

60 changes: 59 additions & 1 deletion README.md
@@ -761,7 +761,7 @@ models:
│ t-staging-sql|odcs|odcs_v2|odcs_v3|rdf|avro|protobuf │
│ |great-expectations|terraform|avro-idl|sql|sql-query │
│ |html|go|bigquery|dbml|spark|sqlalchemy|data-caterer │
│ |dcs]
│ |dcs|iceberg]
│ --output PATH Specify the file path where the exported data will be │
│ saved. If no path is provided, the output will be │
│ printed to stdout. │
@@ -822,6 +822,7 @@ Available export options:
| `sqlalchemy` | Export to SQLAlchemy Models ||
| `data-caterer` | Export to Data Caterer in YAML format ||
| `dcs` | Export to Data Contract Specification in YAML format ||
| `iceberg` | Export to an Iceberg JSON Schema Definition | partial |
| Missing something? | Please create an issue on GitHub | TBD |


@@ -945,6 +946,63 @@ models:
- **avroLogicalType**: Specifies the logical type of the field in Avro. In this example, it is `local-timestamp-micros`.
- **avroDefault**: Specifies the default value for the field in Avro. In this example, it is 1672534861000000, which corresponds to `2023-01-01 01:01:01 UTC`.

#### Iceberg

Exports to an [Iceberg Table JSON Schema Definition](https://iceberg.apache.org/spec/#appendix-c-json-serialization).

This export supports only a single model at a time, because an Iceberg schema definition describes a single table and the exporter maps one model to one table. Use the `--model` flag
to limit your contract export to a single model.

```bash
$ datacontract export --format iceberg --model orders https://datacontract.com/examples/orders-latest/datacontract.yaml --output /tmp/orders_iceberg.json
$ cat /tmp/orders_iceberg.json | jq '.'
{
"type": "struct",
"fields": [
{
"id": 1,
"name": "order_id",
"type": "string",
"required": true
},
{
"id": 2,
"name": "order_timestamp",
"type": "timestamptz",
"required": true
},
{
"id": 3,
"name": "order_total",
"type": "long",
"required": true
},
{
"id": 4,
"name": "customer_id",
"type": "string",
"required": false
},
{
"id": 5,
"name": "customer_email_address",
"type": "string",
"required": true
},
{
"id": 6,
"name": "processed_timestamp",
"type": "timestamptz",
"required": true
}
],
"schema-id": 0,
"identifier-field-ids": [
1
]
}
```
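
To sanity-check the exported schema programmatically, a minimal sketch using only the Python standard library (reading the file produced above) could look like this:

```python
import json

# Load the schema exported above and list the required columns.
with open("/tmp/orders_iceberg.json") as f:
    schema = json.load(f)

required = [field["name"] for field in schema["fields"] if field["required"]]
print(f"schema-id={schema['schema-id']}, required columns: {required}")
# schema-id=0, required columns: ['order_id', 'order_timestamp', 'order_total',
# 'customer_email_address', 'processed_timestamp']
```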

### import

4 changes: 2 additions & 2 deletions datacontract/cli.py
@@ -221,7 +221,7 @@ def export(
)
# Don't interpret console markup in output.
if output is None:
console.print(result, markup=False)
console.print(result, markup=False, soft_wrap=True)
else:
with output.open("w") as f:
f.write(result)
@@ -298,7 +298,7 @@ def import_(
iceberg_table=iceberg_table,
)
if output is None:
console.print(result.to_yaml())
console.print(result.to_yaml(), markup=False, soft_wrap=True)
else:
with output.open("w") as f:
f.write(result.to_yaml())
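
For context on the `soft_wrap=True` addition: Rich's `Console.print` wraps long lines to the terminal width by default, which can mangle exported YAML/JSON when it is redirected or piped — presumably the problem behind #544. A minimal sketch of the difference (the sample string is illustrative only):

```python
from rich.console import Console

console = Console()
long_line = "order_id,order_timestamp,order_total," * 40  # wider than most terminals

# Default behaviour: Rich hard-wraps the line at the terminal width,
# inserting newlines into what should be a single output line.
console.print(long_line, markup=False)

# With soft_wrap=True the line is emitted as-is, so piped or redirected
# output keeps each exported line intact.
console.print(long_line, markup=False, soft_wrap=True)
```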
40 changes: 25 additions & 15 deletions datacontract/export/dbt_converter.py
@@ -59,7 +59,7 @@ def to_dbt_staging_sql(data_contract_spec: DataContractSpecification, model_name


def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: str = None):
source = {"name": data_contract_spec.id, "tables": []}
source = {"name": data_contract_spec.id}
dbt = {
"version": 2,
"sources": [source],
@@ -72,9 +72,14 @@ def to_dbt_sources_yaml(data_contract_spec: DataContractSpecification, server: s
adapter_type = None
if found_server is not None:
adapter_type = found_server.type
source["database"] = found_server.database
source["schema"] = found_server.schema_
if adapter_type == "bigquery":
source["database"] = found_server.project
source["schema"] = found_server.dataset
else:
source["database"] = found_server.database
source["schema"] = found_server.schema_

source["tables"] = []
for model_key, model_value in data_contract_spec.models.items():
dbt_model = _to_dbt_source_table(model_key, model_value, adapter_type)
source["tables"].append(dbt_model)
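
Background on the branch above: dbt's BigQuery adapter treats the GCP project as the source `database` and the dataset as the `schema`, so those server fields are mapped accordingly. Illustratively (the values below are made up, PyYAML assumed), the generated source header would then serialize as:

```python
import yaml  # PyYAML

# Made-up values, only to show the shape of the generated sources header
# when the matched server is BigQuery (project -> database, dataset -> schema).
source = {
    "name": "orders_contract",
    "database": "my-gcp-project",
    "schema": "analytics",
    "tables": [],
}
print(yaml.safe_dump({"version": 2, "sources": [source]}, sort_keys=False))
```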
@@ -144,10 +149,12 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
column = {"name": field_name}
adapter_type = adapter_type or "snowflake"
dbt_type = convert_to_sql_type(field, adapter_type)

column["data_tests"] = []
if dbt_type is not None:
column["data_type"] = dbt_type
else:
column.setdefault("tests", []).append(
column["data_tests"].append(
{"dbt_expectations.dbt_expectations.expect_column_values_to_be_of_type": {"column_type": dbt_type}}
)
if field.description is not None:
@@ -156,21 +163,21 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
if supports_constraints:
column.setdefault("constraints", []).append({"type": "not_null"})
else:
column.setdefault("tests", []).append("not_null")
column["data_tests"].append("not_null")
if field.unique:
if supports_constraints:
column.setdefault("constraints", []).append({"type": "unique"})
else:
column.setdefault("tests", []).append("unique")
column["data_tests"].append("unique")
if field.enum is not None and len(field.enum) > 0:
column.setdefault("tests", []).append({"accepted_values": {"values": field.enum}})
column["data_tests"].append({"accepted_values": {"values": field.enum}})
if field.minLength is not None or field.maxLength is not None:
length_test = {}
if field.minLength is not None:
length_test["min_value"] = field.minLength
if field.maxLength is not None:
length_test["max_value"] = field.maxLength
column.setdefault("tests", []).append(
column["data_tests"].append(
{"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
)
if field.pii is not None:
@@ -181,7 +188,7 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
column.setdefault("tags", []).extend(field.tags)
if field.pattern is not None:
# Beware, the data contract pattern is a regex, not a like pattern
column.setdefault("tests", []).append(
column["data_tests"].append(
{"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
)
if (
@@ -195,7 +202,7 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
range_test["min_value"] = field.minimum
if field.maximum is not None:
range_test["max_value"] = field.maximum
column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": range_test})
column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
elif (
field.exclusiveMinimum is not None
or field.exclusiveMaximum is not None
@@ -208,18 +215,18 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
if field.exclusiveMaximum is not None:
range_test["max_value"] = field.exclusiveMaximum
range_test["strictly"] = True
column.setdefault("tests", []).append({"dbt_expectations.expect_column_values_to_be_between": range_test})
column["data_tests"].append({"dbt_expectations.expect_column_values_to_be_between": range_test})
else:
if field.minimum is not None:
column.setdefault("tests", []).append(
column["data_tests"].append(
{"dbt_expectations.expect_column_values_to_be_between": {"min_value": field.minimum}}
)
if field.maximum is not None:
column.setdefault("tests", []).append(
column["data_tests"].append(
{"dbt_expectations.expect_column_values_to_be_between": {"max_value": field.maximum}}
)
if field.exclusiveMinimum is not None:
column.setdefault("tests", []).append(
column["data_tests"].append(
{
"dbt_expectations.expect_column_values_to_be_between": {
"min_value": field.exclusiveMinimum,
Expand All @@ -228,7 +235,7 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
}
)
if field.exclusiveMaximum is not None:
column.setdefault("tests", []).append(
column["data_tests"].append(
{
"dbt_expectations.expect_column_values_to_be_between": {
"max_value": field.exclusiveMaximum,
Expand All @@ -237,5 +244,8 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
}
)

if not column["data_tests"]:
column.pop("data_tests")

# TODO: all constraints
return column
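
To make the `tests` → `data_tests` rename concrete, here is an illustrative sketch (made-up field values, PyYAML assumed) of the column entry `_to_column` would emit for a required, unique string field on an adapter without column constraints:

```python
import yaml  # PyYAML

# Made-up values; note the `data_tests` key that replaces the former `tests` key.
column = {
    "name": "order_id",
    "data_tests": ["not_null", "unique"],
    "data_type": "TEXT",  # illustrative; the actual type comes from convert_to_sql_type
}
print(yaml.safe_dump({"columns": [column]}, sort_keys=False))
# columns:
# - name: order_id
#   data_tests:
#   - not_null
#   - unique
#   data_type: TEXT
```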
2 changes: 2 additions & 0 deletions datacontract/export/exporter.py
@@ -40,6 +40,8 @@ class ExportFormat(str, Enum):
sqlalchemy = "sqlalchemy"
data_caterer = "data-caterer"
dcs = "dcs"
iceberg = "iceberg"


@classmethod
def get_supported_formats(cls):
4 changes: 4 additions & 0 deletions datacontract/export/exporter_factory.py
@@ -168,3 +168,7 @@ def load_module_class(module_path, class_name):
exporter_factory.register_lazy_exporter(
name=ExportFormat.dcs, module_path="datacontract.export.dcs_exporter", class_name="DcsExporter"
)

exporter_factory.register_lazy_exporter(
name=ExportFormat.iceberg, module_path="datacontract.export.iceberg_converter", class_name="IcebergExporter"
)
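
The iceberg exporter is registered lazily, like the other exporters, so its module is only imported when the format is actually requested. A rough sketch of that lazy-loading pattern (not the factory's real code; `_lazy_registry` and `resolve_exporter` are hypothetical names):

```python
import importlib

# Hypothetical sketch of lazy registration: store the module path and class
# name, and import the module only when the format is first resolved.
_lazy_registry: dict[str, tuple[str, str]] = {}

def register_lazy_exporter(name: str, module_path: str, class_name: str) -> None:
    _lazy_registry[name] = (module_path, class_name)

def resolve_exporter(name: str) -> type:
    module_path, class_name = _lazy_registry[name]
    return getattr(importlib.import_module(module_path), class_name)

register_lazy_exporter(
    name="iceberg",
    module_path="datacontract.export.iceberg_converter",
    class_name="IcebergExporter",
)
# resolve_exporter("iceberg") imports iceberg_converter and returns IcebergExporter.
```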