From b9a976e0ef7edf14a53a8cb16f2409faaa779f0d Mon Sep 17 00:00:00 2001 From: jochen Date: Sat, 20 Jul 2024 16:58:52 +0200 Subject: [PATCH] Update quality --- CHANGELOG.md | 9 +- README.md | 516 +++++------------------ datacontract.init.yaml | 10 +- examples/orders-latest/datacontract.yaml | 28 +- versions/0.9.3/README.md | 292 ++++++++----- versions/0.9.3/datacontract.init.yaml | 207 +++++---- versions/0.9.3/datacontract.schema.json | 322 +++++++++++++- versions/0.9.3/definition.schema.json | 19 +- 8 files changed, 756 insertions(+), 647 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f025bc9..7f22319 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -Please note, while the major version is zero (0.y.z), Anything MAY change at any time. -The public API SHOULD NOT be considered stable. +## [1.0.1] - 2024-07-20 ### Added - Data quality attributes on model and field level @@ -25,6 +24,12 @@ The public API SHOULD NOT be considered stable. 
- Field `type: map` support with properties `keys` and `values` - Definitions: `fields`, for type `object`, `record`, and `struct` +### Removed + +- `quality` on top-level removed (is now considered as specification extension) +- `schema` removed (is now considered as specification extension) + + ## [0.9.3] - 2024-03-06 ### Added diff --git a/README.md b/README.md index cf22f20..2d34560 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ The specification comes along with the [Data Contract CLI](https://github.com/da Version --- -0.9.4([Changelog](CHANGELOG.md)) +1.0.1([Changelog](CHANGELOG.md)) Example --- @@ -42,7 +42,7 @@ Example [![Data Contract Catalog](https://img.shields.io/badge/Data%20Contract-Catalog-blue)](https://datacontract.com/examples/index.html) ```yaml -dataContractSpecification: 0.9.3 +dataContractSpecification: 1.0.1 id: urn:datacontract:checkout:orders-latest info: title: Orders Latest @@ -114,7 +114,7 @@ models: classification: sensitive quality: - type: text - name: The email address was verified by the system + name: The email address was verified by a user processed_timestamp: description: The timestamp when the record was processed by the data platform. type: timestamp @@ -123,14 +123,15 @@ models: jsonType: string jsonFormat: date-time quality: - - type: row_count - must_be_greater_than: 5 - type: sql description: The maximum duration between two orders should be less that 3600 seconds query: | SELECT MAX(EXTRACT(EPOCH FROM (order_timestamp - LAG(order_timestamp) OVER (ORDER BY order_timestamp)))) AS max_duration FROM orders must_be_less_than: 3600 + - type: row_count + engine: soda + must_be_greater_than: 5 line_items: description: A single article that is part of an order. type: table @@ -296,21 +297,19 @@ This is the root document. It is _RECOMMENDED_ that the root document be named: `datacontract.yaml`. 
-| Field | Type | Description | -|---------------------------|------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------| -| dataContractSpecification | `string` | REQUIRED. Specifies the Data Contract Specification being used. | -| id | `string` | REQUIRED. An organization-wide unique technical identifier, such as a UUID, URN, slug, string, or number | -| info | [Info Object](#info-object) | REQUIRED. Specifies the metadata of the data contract. May be used by tooling. | +| Field | Type | Description | +|---------------------------|--------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------| +| dataContractSpecification | `string` | REQUIRED. Specifies the Data Contract Specification being used. | +| id | `string` | REQUIRED. An organization-wide unique technical identifier, such as a UUID, URN, slug, string, or number | +| info | [Info Object](#info-object) | REQUIRED. Specifies the metadata of the data contract. May be used by tooling. | | servers | Map[`string`, [Server Object](#server-object)] | Specifies the servers of the data contract. | -| terms | [Terms Object](#terms-object) | Specifies the terms and conditions of the data contract. | +| terms | [Terms Object](#terms-object) | Specifies the terms and conditions of the data contract. | | models | Map[`string`, [Model Object](#model-object)] | Specifies the logical data model. | | definitions | Map[`string`, [Definition Object](#definition-object)] | Specifies definitions. | -| schema | [Schema Object](#schema-object) | Specifies the physical schema. The specification supports different schema format. | -| examples | Array of [Example Objects](#example-object) | Specifies example data sets for the data model. The specification supports different example types. 
| -| servicelevels | [Service Levels Object](#service-levels-object) | Specifies the service level of the provided data | -| quality | [Quality Object](#quality-object) | Deprecated on top-level. Use model-level and field-field level quality. Specifies the quality attributes and checks. | -| links | Map[`string`, `string`] | Additional external documentation links. | -| tags | Array of `string` | Custom metadata to provide additional context. | +| examples | Array of [Example Objects](#example-object) | Specifies example data sets for the data model. The specification supports different example types. | +| servicelevels | [Service Levels Object](#service-levels-object) | Specifies the service level of the provided data | +| links | Map[`string`, `string`] | Additional external documentation links. | +| tags | Array of `string` | Custom metadata to provide additional context. | This object _MAY_ be extended with [Specification Extensions](#specification-extensions). @@ -326,7 +325,7 @@ Metadata and life cycle information about the data contract. |-------------|-----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | title | `string` | REQUIRED. The title of the data contract. | | version | `string` | REQUIRED. The version of the data contract document (which is distinct from the Data Contract Specification version or the Data Product implementation version). | -| status | `string` | The status of the data contract. Can be `proposed`, `in development`, `active`, `deprecated`, `retired`. | +| status | `string` | The status of the data contract. Can be `proposed`, `in development`, `active`, `deprecated`, `retired`. | | description | `string` | A description of the data contract. | | owner | `string` | The owner or team responsible for managing the data contract and providing the data. 
| | contact | [Contact Object](#contact-object) | Contact information for the data contract. | @@ -429,13 +428,13 @@ servers: #### SQL-Server Server Object -| Field | Type | Description | -|----------|-----------|------------------------------------------------------| -| type | `string` | `sqlserver` | -| host | `string` | The host to the database server | -| port | `integer` | The port to the database server, default: `1433` | -| database | `string` | The name of the database, e.g., `database`. | -| schema | `string` | The name of the schema in the database, e.g., `dbo`. | +| Field | Type | Description | +|----------|-----------|--------------------------------------------------------------------------| +| type | `string` | `sqlserver` | +| host | `string` | The host to the database server | +| port | `integer` | The port to the database server, default: `1433` | +| database | `string` | The name of the database, e.g., `database`. | +| schema | `string` | The name of the schema in the database, e.g., `dbo`. | | driver | `string` | The name of the supported driver, e.g., `ODBC Driver 18 for SQL Server`. | @@ -635,208 +634,6 @@ Models fields can refer to definitions using the `$ref` field to link to existin -### Schema Object - -The schema of the data contract describes the physical schema. -The type of the schema depends on the data platform. - -| Field | Type | Description | -|---------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| type | `string` | REQUIRED. The type of the schema.
Typical values are: `dbt`, `bigquery`, `json-schema`, `sql-ddl`, `avro`, `protobuf`, `custom` | -| specification | [dbt Schema Object](#dbt-schema-object) \|
[BigQuery Schema Object](#bigquery-schema-object) \|
[JSON Schema Schema Object](#bigquery-schema-object) \|
[SQL DDL Schema Object](#sql-ddl-schema-object) \|
`string` | REQUIRED. The specification of the schema. The schema specification can be encoded as a string or as inline YAML. | - - -#### dbt Schema Object - -https://docs.getdbt.com/reference/model-properties - -Example (inline YAML): - -```yaml -schema: - type: dbt - specification: - version: 2 - models: - - name: "My Table" - description: "My description" - columns: - - name: "My column" - data_type: text - description: "My description" -``` - -Example (string): - -```yaml -schema: - type: dbt - specification: |- - version: 2 - models: - - name: "My Table" - description: "My description" - columns: - - name: "My column" - data_type: text - description: "My description" -``` - -#### BigQuery Schema Object - -The schema structure is defined by the [Google BigQuery Table](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#resource:-table) object. You can extract such a Table object via the [tables.get](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/get) endpoint. - -Instead of providing a single Table object, you can also provide an array of such objects. Be aware that [tables.list](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list) only returns a subset of the full Table object. You need to call every Table object via [tables.get](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/get) to get the full Table object, including the actual schema. 
- -Learn more: [Google BigQuery REST Reference v2](https://cloud.google.com/bigquery/docs/reference/rest) - - - -Example: - -```yaml -schema: - type: bigquery - specification: |- - { - "tableReference": { - "projectId": "my-project", - "datasetId": "my_dataset", - "tableId": "my_table" - }, - "description": "This is a description", - "type": "TABLE", - "schema": { - "fields": [ - { - "name": "name", - "type": "STRING", - "mode": "NULLABLE", - "description": "This is a description" - } - ] - } - } -``` - -#### JSON Schema Schema Object - -JSON Schema can be defined as JSON or rendered as YAML, following the [OpenAPI Schema Object dialect](https://spec.openapis.org/oas/v3.1.0#properties) - -Example (inline YAML): - -```yaml -schema: - type: json-schema - specification: - orders: - description: One record per order. Includes cancelled and deleted orders. - type: object - properties: - order_id: - type: string - description: Primary key of the orders table - order_timestamp: - type: string - format: date-time - description: The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful. - order_total: - type: integer - description: Total amount of the order in the smallest monetary unit (e.g., cents). - line_items: - type: object - properties: - lines_item_id: - type: string - description: Primary key of the lines_item_id table - order_id: - type: string - description: Foreign key to the orders table - sku: - type: string - description: The purchased article number -``` - -Example (string): - -```yaml -schema: - type: json-schema - specification: |- - { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "orders": { - "type": "object", - "description": "One record per order. 
Includes cancelled and deleted orders.", - "properties": { - "order_id": { - "type": "string", - "description": "Primary key of the orders table" - }, - "order_timestamp": { - "type": "string", - "format": "date-time", - "description": "The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful." - }, - "order_total": { - "type": "integer", - "description": "Total amount of the order in the smallest monetary unit (e.g., cents)." - } - }, - "required": ["order_id", "order_timestamp", "order_total"] - }, - "line_items": { - "type": "object", - "properties": { - "lines_item_id": { - "type": "string", - "description": "Primary key of the lines_item_id table" - }, - "order_id": { - "type": "string", - "description": "Foreign key to the orders table" - }, - "sku": { - "type": "string", - "description": "The purchased article number" - } - }, - "required": ["lines_item_id", "order_id", "sku"] - } - }, - "required": ["orders", "line_items"] - } -``` - -#### SQL DDL Schema Object - -Classical SQL DDLs can be used to describe the structure. - - -Example (string): - -```yaml -schema: - type: sql-ddl - specification: |- - -- One record per order. Includes cancelled and deleted orders. - CREATE TABLE orders ( - order_id TEXT PRIMARY KEY, -- Primary key of the orders table - order_timestamp TIMESTAMPTZ NOT NULL, -- The business timestamp in UTC when the order was successfully registered in the source system and the payment was successful. 
- order_total INTEGER NOT NULL -- Total amount of the order in the smallest monetary unit (e.g., cents) - ); - - -- The items that are part of an order - CREATE TABLE line_items ( - lines_item_id TEXT PRIMARY KEY, -- Primary key of the lines_item_id table - order_id TEXT REFERENCES orders(order_id), -- Foreign key to the orders table - sku TEXT NOT NULL -- The purchased article number - ); - -``` - ### Example Object | Field | Type | Description | @@ -988,18 +785,17 @@ Quality attributes are checks that can be applied to the data to ensure its qual Quality attributes can be: - Text: A human-readable text that describes the quality of the data. - SQL: An individual SQL query that returns a single value that can be compared. -- Predefined Types: Some commonly-used predefined quality attributes such as `row_count`, `unique`, `freshness` -- Vendor-specific: Quality attributes that are specific to a vendor, such as Great Expectations, SodaCL or Montecarlo. +- Engine-specific Types: Currently engines `soda` and `great-expectations` are supported. -A quality object can be specified on field level, or on model level. The top-level quality object are deprecated. - -The fields of the quality object depends on the quality `type`. +A quality object can be specified on field level, or on model level. +The top-level quality object is deprecated. #### Text - Applicable on: [ ] top-level, [x] model, [x] field +Applicable on: [x] model, [x] field -A human-readable text that describe the quality of the data. Later in the development process, these might be translated into an executable check (such as `sql`), or checked through an AI engine. +A human-readable text that describes the quality of the data. +Later in the development process, these might be translated into an executable check (such as `sql`), or checked through an AI engine. 
| Field | Type | Description | |-------------|----------|--------------------------------------------------------------------| @@ -1013,29 +809,34 @@ Example: models: my_table: fields: - email: + iban: quality: - type: text - description: The email address was verified by the system + description: Must be a valid IBAN. ``` + #### SQL -Applicable on: [ ] top-level, [x] model, [x] field +Applicable on: [x] model, [x] field An individual SQL query that returns a single number or boolean value that can be compared. The SQL query must be in the SQL dialect of the provided server. -| Field | Type | Description | -|----------------------------------|------------------------|------------------------------------------------------------| -| type | `string` | `sql` | -| query | `string` | A SQL query that returns a single number or boolean value. | -| must_be_equal_to | `integer` or `boolean` | The threshold to check the return value of the query | -| must_be_greater_than | `integer` | The threshold to check the return value of the query | -| must_be_greater_than_or_equal_to | `integer` | The threshold to check the return value of the query | -| must_be_less_than | `integer` | The threshold to check the return value of the query | -| must_be_less_than_or_equal_to | `integer` | The threshold to check the return value of the query | -| name | `string` | Optional. A human-readable name for this check | -| description | `string` | A plain text describing the quality of the data. | +| Field | Type | Description | +|----------------------------------|-----------------------|---------------------------------------------------------------------------------| +| type | `string` | `sql` | +| name | `string` | Optional. A human-readable name for this check | +| description | `string` | A plain text describing the quality of the data. | +| query | `string` | A SQL query that returns a single number or a boolean value. 
| +| must_be | `integer` | The threshold to check the return value of the query | +| must_not_be | `integer` | The threshold to check the return value of the query | +| must_be_greater_than | `integer` | The threshold to check the return value of the query | +| must_be_greater_than_or_equal_to | `integer` | The threshold to check the return value of the query | +| must_be_less_than | `integer` | The threshold to check the return value of the query | +| must_be_less_than_or_equal_to | `integer` | The threshold to check the return value of the query | +| must_be_between | array of two integers | The threshold to check the return value of the query. Boundaries are inclusive. | +| must_not_be_between | array of two integers | The threshold to check the return value of the query. Boundaries are inclusive. | + ```yaml models: @@ -1050,47 +851,18 @@ models: ``` -#### Row Count - -Applicable on: [ ] top-level, [x] model, [ ] field +#### Soda Data Contract Checks +Applicable on: [x] model, [x] field -Counts the number of rows in a model. -| Field | Type | Description | -|----------------------------------|-----------|------------------------------------------------------| -| type | `string` | `row_count` | -| must_be_equal_to | `number` | The threshold to check the return value of the query | -| must_not_be_equal_to | `number` | The threshold to check the return value of the query | -| must_be_greater_than | `number` | The threshold to check the return value of the query | -| must_be_greater_than_or_equal_to | `number` | The threshold to check the return value of the query | -| must_be_less_than | `number` | The threshold to check the return value of the query | -| must_be_less_than_or_equal_to | `number` | The threshold to check the return value of the query | -| name | `string` | Optional. A human-readable name for this check | -| description | `string` | A plain text describing the quality of the data. 
| +Quality attributes can be defined with the engine `soda` as [Data contract check reference](https://docs.soda.io/soda/data-contracts-checks.html). +##### Duplicate -```yaml -models: - my_table: - quality: - - type: row_count - must_be_greater_than: 500000 -``` - - -#### Unique - -Applicable on: [ ] top-level, [x] model, [ ] field - -A uniqueness check for multiple fields. (For a single field uniqueness check, use the `unique` field attribute.) - -| Field | Type | Description | -|----------------------------------|-------------------|------------------------------------------------------------------------| -| type | `string` | `unique` | -| fields | Array of `string` | An ordered list of fields that values need to be unique in combination | -| name | `string` | Optional. A human-readable name for this check | -| description | `string` | A plain text describing the quality of the data. | +- `no_duplicate_values` (equal to the property `unique: true`, but also supports multiple fields) +- `duplicate_count` +- `duplicate_percent` Example: @@ -1100,52 +872,77 @@ models: fields: order_id: type: string + quality: + - engine: soda + type: no_duplicate_values country: + type: carrier + shipment_number: type: string quality: - - type: unique - fields: - - country - - order_id + - engine: soda + type: duplicate_percent + must_be_less_than: 1.0 + name: A shipment number is unique for one carrier + columns: + - carrier + - shipment_number ``` +Freshness +- `freshness_in_days` +- `freshness_in_hours` +- `freshness_in_minutes` -#### Freshness +Missing +- `no_missing_values` (equal to the property `required: true`) +- `missing_count` +- `missing_percent` +Row count +- `rows_exist` (default) +- `row_count` -Applicable on: [ ] top-level, [ ] model, [x] field +Example: +```yaml +models: + my_table: + quality: + - type: row_count + must_be_greater_than: 500000 +``` -At least one element in the model must have a timestamp value that is less than a certain threshold. 
+SQL aggregation +- `avg` +- `sum` -| Field | Type | Description | -|---------------------------|----------|--------------------------------------------------| -| type | `string` | `freshness` | -| must_be_less_than_seconds | `number` | The threshold in seconds to compare | -| name | `string` | Optional. A human-readable name for this check | -| description | `string` | A plain text describing the quality of the data. | +SQL metric query +- `metric_expression` +Validity +- `no_invalid_values` +- `invalid_count` +- `invalid_percent` Example: - ```yaml models: my_table: fields: - some_timestamp: - type: timestamp + warehouse_id: + type: string quality: - - type: freshness - must_be_less_than_seconds: 3600 - description: At least one element in the model must have a timestamp value that is less than 1 hour + - engine: soda + type: no_invalid_values + valid_sql_regex: '^[A-Z]{2}[0-9]{3}$' ``` - #### Great Expectations -Applicable on: [ ] top-level, [x] model, [x] field +Applicable on: [x] model, [ ] field -Quality attributes defined as an Great Expectations [Expectation](https://greatexpectations.io/expectations/). +Quality attributes defined as Great Expectations [Expectation](https://greatexpectations.io/expectations/). Example: @@ -1154,7 +951,7 @@ Example: models: my_table: quality: - - type: great-expectations + - engine: great-expectations expectation_type: expect_table_row_count_to_be_between kwargs: min_value: 10000 @@ -1162,111 +959,6 @@ models: ``` - -#### Great Expectations (Expectation Suite) - -Applicable on: [ ] top-level, [x] model, [ ] field - -Quality attributes defined as Great Expectations [Expectations](https://greatexpectations.io/expectations/). - -The `specification` represents an expectation suite as JSON string. - -New with v0.9.4: This quality type is only applicable on model level. 
- -Example: - -```yaml -models: - my_table: - quality: - - type: great-expectations - specification: | - [ - { - "expectation_type": "expect_table_row_count_to_be_between", - "kwargs": { - "min_value": 10000, - "max_value": 50000, - }, - "meta": { - - } - } - ] -``` - - -#### SodaCL - -Applicable on: [x] top-level, [x] model, [ ] field - -Quality attributes in [Soda Checks Language](https://docs.soda.io/soda-cl/soda-cl-overview.html). - -The `specification` represents the content of a `checks.yml` file. - -Example: - -```yaml -quality: - - type: SodaCL - specification: | - checks for orders: - - row_count > 0 - - duplicate_count(order_id) = 0 - checks for line_items: - - row_count > 0 -``` - -#### Monte Carlo - -Applicable on: [x] top-level, [x] model, [ ] field - -Quality attributes defined as Monte Carlos [Monitors as Code](https://docs.getmontecarlo.com/docs/monitors-as-code). - -The `specification` represents the content of a `montecarlo.yml` file. - -Example: - -```yaml -quality: - - type: montecarlo - specification: | - montecarlo: - field_health: - - table: project:dataset.table_name - timestamp_field: created - dimension_tracking: - - table: project:dataset.table_name - timestamp_field: created - field: order_status -``` - -#### Great Expectations Quality Object - -Quality attributes defined as Great Expectations [Expectations](https://greatexpectations.io/expectations/). - -The `specification` represents a list of expectations on a specific model. - -Example (string): - -```yaml -quality: - type: great-expectations - specification: - orders: |- - [ - { - "expectation_type": "expect_table_row_count_to_be_between", - "kwargs": { - "min_value": 10 - }, - "meta": { - - } - } - ] -``` - ### Config Object The config field can be used to set additional metadata that may be used by tools, e.g. to define a namespace for code generation, specify physical data types, toggle tests, etc. 
diff --git a/datacontract.init.yaml b/datacontract.init.yaml index 382ad8b..4c6ed27 100644 --- a/datacontract.init.yaml +++ b/datacontract.init.yaml @@ -1,4 +1,4 @@ -dataContractSpecification: 0.9.3 +dataContractSpecification: 1.0.1 id: my-data-contract-id info: title: My Data Contract @@ -99,11 +99,3 @@ info: # cron: 0 0 * * 0 # recoveryTime: 24 hours # recoveryPoint: 1 week - -### quality - -#quality: -# type: SodaCL -# specification: -# checks for my_model: |- -# - duplicate_count(id) = 0 diff --git a/examples/orders-latest/datacontract.yaml b/examples/orders-latest/datacontract.yaml index 11e16a5..1e27fce 100644 --- a/examples/orders-latest/datacontract.yaml +++ b/examples/orders-latest/datacontract.yaml @@ -1,4 +1,4 @@ -dataContractSpecification: 0.9.3 +dataContractSpecification: 1.0.1 id: urn:datacontract:checkout:orders-latest info: title: Orders Latest @@ -62,12 +62,15 @@ models: minLength: 10 maxLength: 20 customer_email_address: - description: The email address, as entered by the customer. The email address was not verified. + description: The email address, as entered by the customer. type: text format: email required: true pii: true classification: sensitive + quality: + - type: text + name: The email address was verified by a user processed_timestamp: description: The timestamp when the record was processed by the data platform. type: timestamp @@ -75,6 +78,16 @@ models: config: jsonType: string jsonFormat: date-time + quality: + - type: sql + description: The maximum duration between two orders should be less than 3600 seconds + query: | + SELECT MAX(EXTRACT(EPOCH FROM (order_timestamp - LAG(order_timestamp) OVER (ORDER BY order_timestamp)))) AS max_duration + FROM orders + must_be_less_than: 3600 + - type: row_count + engine: soda + must_be_greater_than: 5 line_items: description: A single article that is part of an order. 
type: table @@ -180,13 +193,4 @@ servicelevels: interval: weekly cron: 0 0 * * 0 recoveryTime: 24 hours - recoveryPoint: 1 week -quality: - type: SodaCL # data quality check format: SodaCL, montecarlo, custom - specification: # expressed as string or inline yaml or via "$ref: checks.yaml" - checks for orders: - - row_count >= 5 - - duplicate_count(order_id) = 0 - checks for line_items: - - values in (order_id) must exist in orders (order_id) - - row_count >= 5 + recoveryPoint: 1 week \ No newline at end of file diff --git a/versions/0.9.3/README.md b/versions/0.9.3/README.md index e0721f3..6463be2 100644 --- a/versions/0.9.3/README.md +++ b/versions/0.9.3/README.md @@ -1,4 +1,4 @@ -# Data Contract Specification +# Data Contract Specification Stars @@ -8,29 +8,28 @@ Data contracts bring data providers and data consumers together. -A _data contract_ is a document that defines the structure, format, semantics, quality, and terms of use for exchanging data between a data provider and their consumers. -A data contract is implemented by a data product's output port or other data technologies. +A _data contract_ is a document that defines the structure, format, semantics, quality, and terms of use for exchanging data between a data provider and their consumers. +Think of an API, but for data. +A data contract is implemented by a data product or other data technologies, even legacy data warehouses. Data contracts can also be used for the input port to specify the expectations of data dependencies and verify given guarantees. -The _data contract specification_ defines a YAML format to describe attributes of provided data sets. -It is data platform neutral and can be used with any data platform, such as AWS S3, Google BigQuery, Azure, Databricks, and Snowflake. -The data contract specification is an open initiative to define a common data contract format. +The _data contract specification_ defines a YAML format to describe attributes of provided data sets. 
+It is data platform neutral and can be used with any data platform, such as AWS S3, Google BigQuery, Azure, Databricks, and Snowflake. +The data contract specification is an open initiative to define a common data contract format. It follows [OpenAPI](https://www.openapis.org/) and [AsyncAPI](https://www.asyncapi.com/) conventions. -Data contracts come into play when data is exchanged between different teams or organizational units, such as in a [data mesh architecture](https://www.datamesh-architecture.com/). -First, and foremost, data contracts are a communication tool to express a common understanding of how data should be structured and interpreted. -They make semantic and quality expectations explicit. -They are often created collaboratively in [workshops](./workshop.md) together with data providers and data consumers. +Data contracts come into play when data is exchanged between different teams or organizational units, such as in a [data mesh architecture](https://www.datamesh-architecture.com/). +First, and foremost, data contracts are a communication tool to express a common understanding of how data should be structured and interpreted. +They make semantic and quality expectations explicit. +They are often created collaboratively in [workshops](./workshop.md) together with data providers and data consumers. Later in development and production, they also serve as the basis for code generation, testing, schema validations, quality checks, monitoring, access control, and computational governance policies. The specification comes along with the [Data Contract CLI](https://github.com/datacontract/datacontract-cli), an open-source tool to develop, validate, and enforce data contracts. -IntelliJ, VS Code and other common IDEs allow you to use autocompletions without additional configuration. 
- -_Note: The term "data contract" refers to a specification that is usually owned by the data provider and thus does not align with a "contract" in a legal sense as a mutual agreement between two parties. -The term "contract" may be somewhat misleading, but it is how it is used in practice. -The mutual agreement between one data provider and one data consumer is the "data usage agreement" that refers to a data contract. -Data usage agreements have a defined lifecycle, start/end date, and help the data provider to track who accesses their data and for which purposes._ +> _Note: The term "data contract" refers to a specification that is usually owned by the data provider and thus does not align with a "contract" in a legal sense as a mutual agreement between two parties. +> The term "contract" may be somewhat misleading, but it is how it is used by the industry. +> The mutual agreement between one data provider and one data consumer is the "data usage agreement" that refers to a data contract. +> Data usage agreements have a defined lifecycle, start/end date, and help the data provider to track who accesses their data and for which purposes._ Version --- @@ -53,15 +52,24 @@ info: All orders since 2020-01-01. Orders with their line items are in their current state (no history included). owner: Checkout Team + slackChannel: "#checkout" contact: name: John Doe (Data Product Owner) url: https://teams.microsoft.com/l/channel/example/checkout +tags: + - checkout + - orders + - s3 +links: + datacontractCli: https://cli.datacontract.com servers: production: type: s3 + environment: prod location: s3://datacontract-example-orders-latest/data/{model}/*.json format: json delimiter: new_line + description: "One folder per model. One file per day." terms: usage: | Data can be used for reports, analytics and machine learning use cases. @@ -108,6 +116,9 @@ models: description: The timestamp when the record was processed by the data platform. 
type: timestamp required: true + config: + jsonType: string + jsonFormat: date-time line_items: description: A single article that is part of an order. type: table @@ -135,6 +146,8 @@ definitions: example: 243c25e5-a081-43a9-aeab-6d5d5b6cb5e2 pii: true classification: restricted + tags: + - orders sku: domain: inventory name: sku @@ -145,6 +158,10 @@ definitions: description: | A Stock Keeping Unit (SKU) is an internal unique identifier for an article. It is typically associated with an article's barcode, such as the EAN/GTIN. + links: + wikipedia: https://en.wikipedia.org/wiki/Stock_keeping_unit + tags: + - inventory examples: - type: csv # csv, json, yaml, custom model: orders @@ -224,7 +241,7 @@ Data Contract CLI The [Data Contract CLI](https://cli.datacontract.com) is a command line tool and Python library to lint, test, import and export data contracts. -Here is short example how to verify that your actual dataset matches the data contract: +Here is short example how to verify that your actual dataset matches the data contract: ```bash pip3 install datacontract-cli @@ -236,34 +253,16 @@ or, if you prefer Docker: docker run datacontract/cli test https://datacontract.com/examples/orders-latest/datacontract.yaml ``` -The Data Contract contains all required information to verify data: +The Data Contract contains all required information to verify data: - The _servers_ block has the connection details to the actual data set. -- The _models_ define the syntax, formats, and constraints. +- The _models_ define the syntax, formats, and constraints. - The _quality_ defined further quality checks. The Data Contract CLI chooses the appropriate engine, formulates test cases, connects to the server, and executes the tests, based on the server type. More information and configuration options on [cli.datacontract.com](https://cli.datacontract.com). -IDE Integration ---- -IntelliJ comes with a built-in YAML plugin which will show you autocompletions. 
For VS Code we recommend to install the [YAML](https://marketplace.visualstudio.com/items?itemName=redhat.vscode-yaml) plugin. No additional configuration is required. Autocompletion is then enabled for files following these patterns: - -``` -datacontract.yaml -datacontract.yml -*-datacontract.yaml -*-datacontract.yml -*.datacontract.yaml -*.datacontract.yml -datacontract-*.yaml -datacontract-*.yml -**/datacontract/*.yml -**/datacontract/*.yaml -**/datacontracts/*.yml -**/datacontracts/*.yaml -``` Specification --- @@ -299,14 +298,16 @@ It is _RECOMMENDED_ that the root document be named: `datacontract.yaml`. | dataContractSpecification | `string` | REQUIRED. Specifies the Data Contract Specification being used. | | id | `string` | REQUIRED. An organization-wide unique technical identifier, such as a UUID, URN, slug, string, or number | | info | [Info Object](#info-object) | REQUIRED. Specifies the metadata of the data contract. May be used by tooling. | -| servers | Map[string, [Server Object](#server-object)] | Specifies the servers of the data contract. | +| servers | Map[`string`, [Server Object](#server-object)] | Specifies the servers of the data contract. | | terms | [Terms Object](#terms-object) | Specifies the terms and conditions of the data contract. | -| models | Map[string, [Model Object](#model-object)] | Specifies the logical data model. | -| definitions | Map[string, [Definition Object](#definition-object)] | Specifies definitions. | +| models | Map[`string`, [Model Object](#model-object)] | Specifies the logical data model. | +| definitions | Map[`string`, [Definition Object](#definition-object)] | Specifies definitions. | | schema | [Schema Object](#schema-object) | Specifies the physical schema. The specification supports different schema format. | | examples | Array of [Example Objects](#example-object) | Specifies example data sets for the data model. The specification supports different example types. 
| | servicelevels | [Service Levels Object](#service-levels-object) | Specifies the service level of the provided data | | quality | [Quality Object](#quality-object) | Specifies the quality attributes and checks. The specification supports different quality check DSLs. | +| links | Map[`string`, `string`] | Additional external documentation links. | +| tags | Array of `string` | Custom metadata to provide additional context. | This object _MAY_ be extended with [Specification Extensions](#specification-extensions). @@ -322,7 +323,7 @@ Metadata and life cycle information about the data contract. |-------------|-----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | title | `string` | REQUIRED. The title of the data contract. | | version | `string` | REQUIRED. The version of the data contract document (which is distinct from the Data Contract Specification version or the Data Product implementation version). | -| status | `string` | The status of the data contract. Can be proposed, in development, active, retired. | +| status | `string` | The status of the data contract. Can be `proposed`, `in development`, `active`, `deprecated`, `retired`. | | description | `string` | A description of the data contract. | | owner | `string` | The owner or team responsible for managing the data contract and providing the data. | | contact | [Contact Object](#contact-object) | Contact information for the data contract. | @@ -346,10 +347,11 @@ This object _MAY_ be extended with [Specification Extensions](#specification-ext The fields are dependent on the defined type. 
-| Field | Type | Description | -|-------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| type | `string` | REQUIRED. The type of the data product technology that implements the data contract. Well-known server types are: `bigquery`, `s3`, `glue`, `redshift`, `azure`, `snowflake`, `databricks`, `postgres`, `oracle`, `kafka`, `pubsub`, `sftp`, `local` | -| description | `string` | An optional string describing the server. | +| Field | Type | Description | +|-------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| type | `string` | REQUIRED. The type of the data product technology that implements the data contract. Well-known server types are: `bigquery`, `s3`, `glue`, `redshift`, `azure`, `sqlserver`, `snowflake`, `databricks`, `postgres`, `oracle`, `kafka`, `pubsub`, `sftp`, `kinesis`, `trino`, `local` | +| description | `string` | An optional string describing the server. | +| environment | `string` | An optional string describing the environment, e.g., prod, sit, stg. | This object _MAY_ be extended with [Specification Extensions](#specification-extensions). 
@@ -422,6 +424,18 @@ servers: | delimiter | `string` | (Only for format = `json`), how multiple json documents are delimited within one file, e.g., `new_line`, `array` | +#### SQL-Server Server Object + +| Field | Type | Description | +|----------|-----------|------------------------------------------------------| +| type | `string` | `sqlserver` | +| host | `string` | The host to the database server | +| port | `integer` | The port to the database server, default: `1433` | +| database | `string` | The name of the database, e.g., `database`. | +| schema | `string` | The name of the schema in the database, e.g., `dbo`. | +| driver | `string` | The name of the supported driver, e.g., `ODBC Driver 18 for SQL Server`. | + + #### Snowflake Server Object | Field | Type | Description | @@ -485,6 +499,25 @@ servers: | format | `string` | Format of files, such as `parquet`, `delta`, `json`, `csv` | | delimiter | `string` | (Only for format = `json`), how multiple json documents are delimited within one file, e.g., `new_line`, `array` | +#### AWS Kinesis Data Streams Server Object + +| Field | Type | Description | +|--------|----------|---------------------------------------------------------------------------| +| type | `string` | `kinesis` | +| stream | `string` | The name of the Kinesis data stream. | +| region | `string` | AWS region, e.g., `eu-west-1`. | +| format | `string` | The format of the records. Examples: json, avro, protobuf. | + +#### Trino Server Object + +| Field | Type | Description | +|----------|-----------|-----------------------------------------------------------| +| type | `string` | `trino` | +| host | `string` | The Trino host | +| port | `integer` | The Trino port | +| catalog | `string` | The name of the catalog, e.g., `my_catalog`. | +| schema | `string` | The name of the schema in the catalog, e.g., `my_schema`. 
| + #### Local Server Object | Field | Type | Description | @@ -517,6 +550,8 @@ The name of the data model (table name) is defined by the key that refers to thi | description | `string` | An optional string describing the data model. | | title | `string` | An optional string for the title of the data model. Especially useful if the name of the model is cryptic or contains abbreviations. | | fields | Map[`string`, [Field Object](#field-object)] | The fields (e.g. columns) of the data model. | +| config | [Config Object](#config-object) | Any additional key-value pairs that might be useful for further tooling. | + @@ -548,9 +583,14 @@ The Field Objects describes one field (column, property, nested field) of a data | pii | `boolean` | An indication, if this field contains Personal Identifiable Information (PII). | | classification | `string` | The data class defining the sensitivity level for this field, according to the organization's classification scheme. Examples may be: `sensitive`, `restricted`, `internal`, `public`. | | tags | Array of `string` | Custom metadata to provide additional context. | +| links | Map[`string`,`string`] | Additional external documentation links. | | $ref | `string` | A reference URI to a definition in the specification, internally or externally. Properties will be inherited from the definition. | -| fields | Map[`string`, [Field Object](#field-object)] | The nested fields (e.g. columns) of the object, record, or struct. Use only when type is object, record, or struct. | -| items | [Field Object](#field-object) | The type of the elements in the array. Use only when type is array. | +| fields | Map[`string`, [Field Object](#field-object)] | The nested fields (e.g. columns) of the object, record, or struct. Use only when type is `object`, `record`, or `struct`. | +| items | [Field Object](#field-object) | The type of the elements in the array. Use only when type is `array`. 
| +| keys | [Field Object](#field-object) | Describes the key structure of a map. Defaults to `type: string` if a map is defined as type. Not all server types support different key types. Use only when type is `map`. | +| values | [Field Object](#field-object) | Describes the value structure of a map. Use only when type is `map`. | +| config | [Config Object](#config-object) | Any additional key-value pairs that might be useful for further tooling. | + ### Definition Object @@ -558,33 +598,39 @@ The Definition Object includes a clear and concise explanations of syntax, seman It serves as a reference for a common understanding of terminology, ensure consistent usage and to identify join-able fields. Models fields can refer to definitions using the `$ref` field to link to existing definitions and avoid duplicate documentations. -| Field | Type | Description | -|------------------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| name | `string` | REQUIRED. The technical name of this definition. | -| type | [Data Type](#data-types) | REQUIRED. The logical data type | -| domain | `string` | The domain in which this definition is valid. Default: `global`. | -| title | `string` | The business name of this definition. | -| description | `string` | Clear and concise explanations related to the domain | -| enum | array of `string` | A value must be equal to one of the elements in this array value. Only evaluated if the value is not null. | -| format | `string` | `email`: A value must be complaint to [RFC 5321, section 4.1.2](https://www.rfc-editor.org/info/rfc5321).
`uri`: A value must be complaint to [RFC 3986](https://www.rfc-editor.org/info/rfc3986).
`uuid`: A value must be complaint to [RFC 4122](https://www.rfc-editor.org/info/rfc4122). Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | -| precision | `number` | The maximum number of digits in a number. Only applies to numeric values. Defaults to 38. | -| scale | `number` | The maximum number of decimal places in a number. Only applies to numeric values. Defaults to 0. | -| minLength | `number` | A value must greater than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | -| maxLength | `number` | A value must less than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | -| pattern | `string` | A value must be valid according to the [ECMA-262](https://262.ecma-international.org/5.1/) regular expression dialect. Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | -| minimum | `number` | A value of a number must greater than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values. | -| exclusiveMinimum | `number` | A value of a number must greater than the value of this. Only evaluated if the value is not null. Only applies to numeric values. | -| maximum | `number` | A value of a number must less than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values. | -| exclusiveMaximum | `number` | A value of a number must less than the value of this. Only evaluated if the value is not null. Only applies to numeric values. | -| example | `string` | An example value. | -| pii | `boolean` | An indication, if this field contains Personal Identifiable Information (PII). 
| -| classification | `string` | The data class defining the sensitivity level for this field, according to the organization's classification scheme. | -| tags | Array of `string` | Custom metadata to provide additional context. | +| Field | Type | Description | +|------------------|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| name | `string` | REQUIRED. The technical name of this definition. | +| type | [Data Type](#data-types) | REQUIRED. The logical data type | +| domain | `string` | The domain in which this definition is valid. Default: `global`. | +| title | `string` | The business name of this definition. | +| description | `string` | Clear and concise explanations related to the domain | +| enum | array of `string` | A value must be equal to one of the elements in this array value. Only evaluated if the value is not null. | +| format | `string` | `email`: A value must be complaint to [RFC 5321, section 4.1.2](https://www.rfc-editor.org/info/rfc5321).
`uri`: A value must be compliant to [RFC 3986](https://www.rfc-editor.org/info/rfc3986).
`uuid`: A value must be compliant to [RFC 4122](https://www.rfc-editor.org/info/rfc4122). Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | +| precision | `number` | The maximum number of digits in a number. Only applies to numeric values. Defaults to 38. | +| scale | `number` | The maximum number of decimal places in a number. Only applies to numeric values. Defaults to 0. | +| minLength | `number` | A value must be greater than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | +| maxLength | `number` | A value must be less than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | +| pattern | `string` | A value must be valid according to the [ECMA-262](https://262.ecma-international.org/5.1/) regular expression dialect. Only evaluated if the value is not null. Only applies to unicode character sequences types (`string`, `text`, `varchar`). | +| minimum | `number` | A value of a number must be greater than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values. | +| exclusiveMinimum | `number` | A value of a number must be greater than the value of this. Only evaluated if the value is not null. Only applies to numeric values. | +| maximum | `number` | A value of a number must be less than, or equal to, the value of this. Only evaluated if the value is not null. Only applies to numeric values. | +| exclusiveMaximum | `number` | A value of a number must be less than the value of this. Only evaluated if the value is not null. Only applies to numeric values. | +| example | `string` | An example value. | +| pii | `boolean` | An indication, if this field contains Personal Identifiable Information (PII). 
| +| classification | `string` | The data class defining the sensitivity level for this field, according to the organization's classification scheme. | +| tags | Array of `string` | Custom metadata to provide additional context. | +| links | Map[`string`, `string`] | Additional external documentation links. | +| fields | Map[`string`, [Field Object](#field-object)] | The nested fields (e.g. columns) of the object, record, or struct. Use only when type is `object`, `record`, or `struct`. | +| items | [Field Object](#field-object) | The type of the elements in the array. Use only when type is `array`. | +| keys | [Field Object](#field-object) | Describes the key structure of a map. Defaults to `type: string` if a map is defined as type. Not all server types support different key types. Use only when type is `map`. | +| values | [Field Object](#field-object) | Describes the value structure of a map. Use only when type is `map`. | + ### Schema Object -The schema of the data contract describes the physical schema. +The schema of the data contract describes the physical schema. The type of the schema depends on the data platform. | Field | Type | Description | @@ -816,7 +862,7 @@ examples: ### Service Levels Object A service level is defined as an agreed-upon, measurable level of performance for provided the data. -Data Contract Specification defines well-known service levels. +Data Contract Specification defines well-known service levels. This list can be extended with custom service levels. One can either describe each service level informally using the `description` field, or make use of the predefined fields for automation support, e.g., via the [Data Contract CLI](https://cli.datacontract.com). 
@@ -825,7 +871,7 @@ One can either describe each service level informally using the `description` fi |--------------|-----------------------------------------------|-------------------------------------------------------------------------| | availability | [Availability Object](#availability-object) | The promised uptime of the system that provides the data | | retention | [Retention Object](#retention-object) | The period how long data will be available. | -| latency | [Latency Object](#latency-object) | The maximum amount of time from the from the source to its destination. | +| latency | [Latency Object](#latency-object) | The maximum amount of time from the source to its destination. | | freshness | [Freshness Object](#freshness-object) | The maximum age of the youngest entry. | | frequency | [Frequency Object](#frequency-object) | The update frequency. | | support | [Support Object](#support-object) | The times when support is provided. | @@ -907,7 +953,7 @@ Support describes the times when support will be available for contact. | description | `string` | An optional string describing the support service level. | | time | `string` | An optional string describing the times when support will be available for contact such as `24/7` or `business hours only`. | | responseTime | `string` | An optional string describing the time it takes for the support team to acknowledge a request. This does not mean the issue will be resolved immediately, but it assures users that their request has been received and will be dealt with. | - + This object _MAY_ be extended with [Specification Extensions](#specification-extensions). @@ -930,10 +976,10 @@ Backup specifies details about data backup procedures. The quality object contains quality attributes and checks. 
-| Field | Type | Description | -|---------------|---------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| -| type | `string` | REQUIRED. The type of the schema.
Typical values are: `SodaCL`, `montecarlo`, `great-expectations`, `custom` | -| specification | [SodaCL Quality Object](#sodacl-quality-object) \|
[Monte Carlo Schema Object](#monte-carlo-quality-object) \|
`string` | REQUIRED. The specification of the quality attributes. The quality specification can be encoded as a string or as inline YAML. | +| Field | Type | Description | +|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| type | `string` | REQUIRED. The type of the schema.
Typical values are: `SodaCL`, `montecarlo`, `great-expectations`, `custom` | +| specification | [SodaCL Quality Object](#sodacl-quality-object) \|
[Monte Carlo Quality Object](#monte-carlo-quality-object) \|
[Great Expectations Quality Object](#great-expectations-quality-object) \|
`string` | REQUIRED. The specification of the quality attributes. The quality specification can be encoded as a string or as inline YAML. | #### SodaCL Quality Object @@ -993,7 +1039,7 @@ quality: Quality attributes defined as Great Expectations [Expectations](https://greatexpectations.io/expectations/). -The `specification` represents a list of expectations on a specific model. +The `specification` represents a list of expectations on a specific model. Example (string): @@ -1015,6 +1061,47 @@ quality: ] ``` +### Config Object + +The config field can be used to set additional metadata that may be used by tools, e.g. to define a namespace for code generation, specify physical data types, toggle tests, etc. + +A config field can be added with any name. The value can be null, a primitive, an array or an object. + +For developer experience, a list of well-known field names is maintained here, as these fields are used in the Data Contract CLI: + + +| Field | Type | Description | +|-----------------|----------|----------------------------------------------------------------------------------------------------------------| +| avroNamespace | `string` | (Only on model level) The namespace to use when importing and exporting the data model from / to Apache Avro. | +| avroType | `string` | (Only on field level) Specify the field type to use when exporting the data model to Apache Avro. | +| avroLogicalType | `string` | (Only on field level) Specify the logical field type to use when exporting the data model to Apache Avro. 
| +| bigqueryType | `string` | (Only on field level) Specify the physical column type that is used in a BigQuery table, e.g., `NUMERIC(5, 2)` | +| snowflakeType | `string` | (Only on field level) Specify the physical column type that is used in a Snowflake table, e.g., `TIMESTAMP_LTZ` | +| redshiftType | `string` | (Only on field level) Specify the physical column type that is used in a Redshift table, e.g., `SMALLINT` | +| sqlserverType | `string` | (Only on field level) Specify the physical column type that is used in a SQL Server table, e.g., `DATETIME2` | +| databricksType | `string` | (Only on field level) Specify the physical column type that is used in a Databricks table | +| glueType | `string` | (Only on field level) Specify the physical column type that is used in an AWS Glue Data Catalog table | + +This object _MAY_ be extended with [Specification Extensions](#specification-extensions). + +Example: + +``` +models: + orders: + config: + avroNamespace: "my.namespace" + fields: + my_field_1: + description: Example for AVRO with Timestamp (millisecond precision) + type: timestamp + config: + avroType: long + avroLogicalType: timestamp-millis + snowflakeType: timestamp_tz +``` + + ### Data Types The following data types are supported for model fields and definitions: @@ -1030,6 +1117,7 @@ The following data types are supported for model fields and definitions: - Timestamp with no timezone: `timestamp_ntz` - Date with no time information: `date` - Array: `array` +- Map: `map` (may not be supported by some server types) - Sequence of 8-bit unsigned bytes: `bytes` - Complex type: `object`, `record`, `struct` - No value: `null` @@ -1038,32 +1126,40 @@ The following data types are supported for model fields and definitions: While the Data Contract Specification tries to accommodate most use cases, additional data can be added to extend the specification at certain points. -A custom fields can be added with any name. 
The value can be null, a primitive, an array or an object. +A custom field can be added with any name. The value can be null, a primitive, an array or an object. -### Design Principles - -The Data Contract Specification follows these design principles: - -- A free, open, and open-sourced standard -- Follow OpenAPI and AsyncAPI conventions so that it feels immediately familiar -- Support contract-first approaches -- Support code-first approaches -- Support tooling by being machine-readable Tooling --- -- [Data Contract CLI](https://github.com/datacontract/datacontract-cli) is a free CLI tool to help you create, develop, and maintain your data contracts. -- [Data Mesh Manager](https://www.datamesh-manager.com/) is a commercial tool to manage data products and data contracts. It supports the data contract specification and allows the user to import or export data contracts using this specification. - +- [Data Contract CLI](https://github.com/datacontract/datacontract-cli) is an open-source CLI tool to help you create, develop, and maintain your data contracts. +- [Data Contract Manager](https://www.datamesh-manager.com/) is a commercial tool to manage data contracts. It includes a data contract catalog, a Web-Editor, and a request and approval workflow to automate access to data products for a full enterprise data marketplace. +- [Data Contract GPT](https://gpt.datacontract.com) is a custom GPT that can help you write data contracts. +- [Data Contract Editor](https://editor.datacontract.com) is an open-source editor for Data Contracts, including a live html preview. 
-Other Data Contract Specifications +Code Completion --- -- [AIDA User Group's Open Data Contract Standard](https://github.com/AIDAUserGroup/open-data-contract-standard) -- [PayPal's Data Contract Template](https://github.com/paypal/data-contract-template/blob/main/docs/README.md) +The [JSON Schema](https://datacontract.com/datacontract.schema.json) of the current data contract specification is registered in [Schema Store](https://www.schemastore.org/), which brings code completion and syntax checks for all major IDEs. +IntelliJ comes with a built-in YAML plugin which will show you autocompletions. +For VS Code we recommend to install the [YAML](https://marketplace.visualstudio.com/items?itemName=redhat.vscode-yaml) plugin. +No additional configuration is required. + +Autocompletion is then enabled for files following these patterns: + +``` +datacontract.yaml +datacontract.yml +*-datacontract.yaml +*-datacontract.yml +*.datacontract.yaml +*.datacontract.yml +datacontract-*.yaml +datacontract-*.yml +**/datacontract/*.yml +**/datacontract/*.yaml +**/datacontracts/*.yml +**/datacontracts/*.yaml +``` -Literature ---- -- [Driving Data Quality with Data Contracts](https://www.amazon.com/dp/B0C37FPH3D) by Andrew Jones Authors --- @@ -1079,4 +1175,4 @@ License [MIT License](LICENSE) - + \ No newline at end of file diff --git a/versions/0.9.3/datacontract.init.yaml b/versions/0.9.3/datacontract.init.yaml index 382ad8b..29dbe19 100644 --- a/versions/0.9.3/datacontract.init.yaml +++ b/versions/0.9.3/datacontract.init.yaml @@ -1,109 +1,98 @@ -dataContractSpecification: 0.9.3 -id: my-data-contract-id -info: - title: My Data Contract - version: 0.0.1 -# description: -# owner: -# contact: -# name: -# url: -# email: - - -### servers - -#servers: -# production: -# type: s3 -# location: s3:// -# format: parquet -# delimiter: new_line - -### terms - -#terms: -# usage: -# limitations: -# billing: -# noticePeriod: - - -### models - -# models: -# my_model: -# description: -# type: -# 
fields: -# my_field: -# type: -# description: - - -### definitions - -# definitions: -# my_field: -# domain: -# name: -# title: -# type: -# description: -# example: -# pii: -# classification: - - -### examples - -#examples: -# - type: csv -# model: my_model -# data: |- -# id,timestamp,amount -# "1001","2023-09-09T08:30:00Z",2500 -# "1002","2023-09-08T15:45:00Z",1800 - -### servicelevels - -#servicelevels: -# availability: -# description: The server is available during support hours -# percentage: 99.9% -# retention: -# description: Data is retained for one year because! -# period: P1Y -# unlimited: false -# latency: -# description: Data is available within 25 hours after the order was placed -# threshold: 25h -# sourceTimestampField: orders.order_timestamp -# processedTimestampField: orders.processed_timestamp -# freshness: -# description: The age of the youngest row in a table. -# threshold: 25h -# timestampField: orders.order_timestamp -# frequency: -# description: Data is delivered once a day -# type: batch # or streaming -# interval: daily # for batch, either or cron -# cron: 0 0 * * * # for batch, either or interval -# support: -# description: The data is available during typical business hours at headquarters -# time: 9am to 5pm in EST on business days -# responseTime: 1h -# backup: -# description: Data is backed up once a week, every Sunday at 0:00 UTC. 
-# interval: weekly -# cron: 0 0 * * 0 -# recoveryTime: 24 hours -# recoveryPoint: 1 week - -### quality - -#quality: -# type: SodaCL -# specification: -# checks for my_model: |- -# - duplicate_count(id) = 0 +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "description": "Clear and concise explanations of syntax, semantic, and classification of business objects in a given domain.", + "properties": { + "domain": { + "type": "string", + "description": "The domain in which this definition is valid.", + "default": "global" + }, + "name": { + "type": "string", + "description": "The technical name of this definition." + }, + "title": { + "type": "string", + "description": "The business name of this definition." + }, + "description": { + "type": "string", + "description": "Clear and concise explanations related to the domain." + }, + "type": { + "type": "string", + "description": "The logical data type." + }, + "minLength": { + "type": "integer", + "description": "A value must be greater than or equal to this value. Applies only to string types." + }, + "maxLength": { + "type": "integer", + "description": "A value must be less than or equal to this value. Applies only to string types." + }, + "format": { + "type": "string", + "description": "Specific format requirements for the value (e.g., 'email', 'uri', 'uuid')." + }, + "precision": { + "type": "integer", + "examples": [ + 38 + ], + "description": "The maximum number of digits in a number. Only applies to numeric values. Defaults to 38." + }, + "scale": { + "type": "integer", + "examples": [ + 0 + ], + "description": "The maximum number of decimal places in a number. Only applies to numeric values. Defaults to 0." + }, + "pattern": { + "type": "string", + "description": "A regular expression pattern the value must match. Applies only to string types." + }, + "example": { + "type": "string", + "description": "An example value." 
+ }, + "pii": { + "type": "boolean", + "description": "Indicates if the field contains Personal Identifiable Information (PII)." + }, + "classification": { + "type": "string", + "description": "The data class defining the sensitivity level for this field." + }, + "tags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Custom metadata to provide additional context." + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } + } + }, + "required": [ + "name", + "type" + ] +} \ No newline at end of file diff --git a/versions/0.9.3/datacontract.schema.json b/versions/0.9.3/datacontract.schema.json index 9c65c5d..a0904be 100644 --- a/versions/0.9.3/datacontract.schema.json +++ b/versions/0.9.3/datacontract.schema.json @@ -36,6 +36,7 @@ "proposed", "in development", "active", + "deprecated", "retired" ] }, @@ -78,6 +79,16 @@ }, "servers": { "type": "object", + "properties": { + "description": { + "type": "string", + "description": "An optional string describing the servers." + }, + "environment": { + "type": "string", + "description": "The environment in which the servers are running. Examples: prod, sit, stg." + } + }, "additionalProperties": { "oneOf": [ { @@ -280,6 +291,55 @@ "format" ] }, + { + "type": "object", + "title": "SqlserverServer", + "properties": { + "type": { + "type": "string", + "enum": [ + "sqlserver" + ], + "description": "The type of the data product technology that implements the data contract." 
+ }, + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server.", + "default": 1433, + "examples": [ + 1433 + ] + }, + "database": { + "type": "string", + "description": "The name of the database.", + "examples": [ + "database" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "dbo" + ] + } + }, + "additionalProperties": true, + "required": [ + "type", + "host", + "database", + "schema" + ] + }, { "type": "object", "title": "SnowflakeServer", @@ -340,11 +400,25 @@ "additionalProperties": true, "required": [ "type", - "host", "catalog", "schema" ] }, + { + "type": "object", + "title": "DataframeServer", + "properties": { + "type": { + "type": "string", + "const": "dataframe", + "description": "The type of the data product technology that implements the data contract." + } + }, + "additionalProperties": true, + "required": [ + "type" + ] + }, { "type": "object", "title": "GlueServer", @@ -537,6 +611,89 @@ "topic" ] }, + { + "type": "object", + "title": "KinesisDataStreamsServer", + "description": "Kinesis Data Streams Server", + "properties": { + "type": { + "type": "string", + "enum": [ + "kinesis" + ], + "description": "The type of the data product technology that implements the data contract." + }, + "stream": { + "type": "string", + "description": "The name of the Kinesis data stream." 
+ }, + "region": { + "type": "string", + "description": "AWS region.", + "examples": [ + "eu-west-1" + ] + }, + "format": { + "type": "string", + "description": "The format of the record", + "examples": [ + "json", + "avro", + "protobuf" + ] + } + }, + "additionalProperties": true, + "required": [ + "type", + "stream" + ] + }, + { + "type": "object", + "title": "TrinoServer", + "properties": { + "type": { + "type": "string", + "const": "trino", + "description": "The type of the data product technology that implements the data contract." + }, + "host": { + "type": "string", + "description": "The host to the database server", + "examples": [ + "localhost" + ] + }, + "port": { + "type": "integer", + "description": "The port to the database server." + }, + "catalog": { + "type": "string", + "description": "The name of the catalog.", + "examples": [ + "hive" + ] + }, + "schema": { + "type": "string", + "description": "The name of the schema in the database.", + "examples": [ + "my_schema" + ] + } + }, + "additionalProperties": true, + "required": [ + "type", + "host", + "port", + "catalog", + "schema" + ] + }, { "type": "object", "title": "LocalServer", @@ -664,6 +821,12 @@ "items": { "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" }, + "keys": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "values": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, "primary": { "type": "boolean", "default": false, @@ -725,7 +888,7 @@ "type": "string", "description": "A regular expression the value must match. Only applies to string types.", "examples": [ - "^[a-zA-Z0-9_-]+$" + "^[a-zA-Z0-9_-]+$" ] }, "minimum": { @@ -769,12 +932,99 @@ }, "description": "Custom metadata to provide additional context." 
}, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } + }, "$ref": { "type": "string", "description": "A reference URI to a definition in the specification, internally or externally. Properties will be inherited from the definition." + }, + "config": { + "type": "object", + "description": "Additional metadata for field configuration.", + "additionalProperties": { + "type": [ + "string", + "number", + "boolean", + "object", + "array", + "null" + ] + }, + "properties": { + "avroType": { + "type": "string", + "description": "Specify the field type to use when exporting the data model to Apache Avro." + }, + "avroLogicalType": { + "type": "string", + "description": "Specify the logical field type to use when exporting the data model to Apache Avro." + }, + "bigqueryType": { + "type": "string", + "description": "Specify the physical column type that is used in a BigQuery table, e.g., `NUMERIC(5, 2)`." + }, + "snowflakeType": { + "type": "string", + "description": "Specify the physical column type that is used in a Snowflake table, e.g., `TIMESTAMP_LTZ`." + }, + "redshiftType": { + "type": "string", + "description": "Specify the physical column type that is used in a Redshift table, e.g., `SMALLINT`." + }, + "sqlserverType": { + "type": "string", + "description": "Specify the physical column type that is used in a SQL Server table, e.g., `DATETIME2`." + }, + "databricksType": { + "type": "string", + "description": "Specify the physical column type that is used in a Databricks Unity Catalog table." + }, + "glueType": { + "type": "string", + "description": "Specify the physical column type that is used in an AWS Glue Data Catalog table." 
+ } + } } } } + }, + "config": { + "type": "object", + "description": "Additional metadata for model configuration.", + "additionalProperties": { + "type": [ + "string", + "number", + "boolean", + "object", + "array", + "null" + + + ] + }, + "properties": { + "avroNamespace": { + "type": "string", + "description": "The namespace to use when importing and exporting the data model from / to Apache Avro." + } + } } } } @@ -809,6 +1059,22 @@ "type": { "$ref": "#/$defs/FieldType" }, + "fields": { + "description": "The nested fields (e.g. columns) of the object, record, or struct.", + "type": "object", + "additionalProperties": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + } + }, + "items": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "keys": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, + "values": { + "$ref": "#/properties/models/additionalProperties/properties/fields/additionalProperties" + }, "minLength": { "type": "integer", "description": "A value must be greater than or equal to this value. Applies only to string types." @@ -873,6 +1139,23 @@ "type": "string" }, "description": "Custom metadata to provide additional context." + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } } }, "required": [ @@ -1045,7 +1328,7 @@ }, "threshold": { "type": "string", - "description": "An optional maximum age of the youngest entry. Supported formats: Simple duration (e.g., `24 hours`, `5s`) and ISO 8601 duration (e.g, `PT24H`).", + "description": "An optional maximum age of the youngest entry. 
Supported formats: Simple duration (e.g., `24 hours`, `5s`) and ISO 8601 duration (e.g., `PT24H`).", "example": "25h" }, "timestampField": { @@ -1173,6 +1456,36 @@ "specification" ], "description": "The quality object contains quality attributes and checks." + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "description": "Tags to facilitate searching and filtering.", + "examples": [ + "databricks", + "pii", + "sensitive" + ] + }, + "description": "Tags to facilitate searching and filtering." } }, "required": [ @@ -1204,6 +1517,7 @@ "timestamp_ntz", "date", "array", + "map", "object", "record", "struct", @@ -1212,4 +1526,4 @@ ] } } -} +} \ No newline at end of file diff --git a/versions/0.9.3/definition.schema.json b/versions/0.9.3/definition.schema.json index 1cd561d..29dbe19 100644 --- a/versions/0.9.3/definition.schema.json +++ b/versions/0.9.3/definition.schema.json @@ -72,10 +72,27 @@ "type": "string" }, "description": "Custom metadata to provide additional context." + }, + "links": { + "type": "object", + "description": "Links to external resources.", + "minProperties": 1, + "propertyNames": { + "pattern": "^[a-zA-Z0-9_-]+$" + }, + "additionalProperties": { + "type": "string", + "title": "Link", + "description": "A URL to an external resource.", + "format": "uri", + "examples": [ + "https://example.com" + ] + } } }, "required": [ "name", "type" ] -} +} \ No newline at end of file