Skip to content

Commit

Permalink
Update quality
Browse files Browse the repository at this point in the history
  • Loading branch information
jochenchrist committed Jul 20, 2024
1 parent 3355948 commit b9a976e
Show file tree
Hide file tree
Showing 8 changed files with 756 additions and 647 deletions.
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

Please note, while the major version is zero (0.y.z), anything MAY change at any time.
The public API SHOULD NOT be considered stable.
## [1.0.1] - 2024-07-20

### Added
- Data quality attributes on model and field level
Expand All @@ -25,6 +24,12 @@ The public API SHOULD NOT be considered stable.
- Field `type: map` support with properties `keys` and `values`
- Definitions: `fields`, for type `object`, `record`, and `struct`

### Removed

- `quality` on top-level removed (now considered a specification extension)
- `schema` removed (now considered a specification extension)


## [0.9.3] - 2024-03-06

### Added
Expand Down
516 changes: 104 additions & 412 deletions README.md

Large diffs are not rendered by default.

10 changes: 1 addition & 9 deletions datacontract.init.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dataContractSpecification: 0.9.3
dataContractSpecification: 1.0.1
id: my-data-contract-id
info:
title: My Data Contract
Expand Down Expand Up @@ -99,11 +99,3 @@ info:
# cron: 0 0 * * 0
# recoveryTime: 24 hours
# recoveryPoint: 1 week

### quality

#quality:
# type: SodaCL
# specification:
# checks for my_model: |-
# - duplicate_count(id) = 0
28 changes: 16 additions & 12 deletions examples/orders-latest/datacontract.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dataContractSpecification: 0.9.3
dataContractSpecification: 1.0.1
id: urn:datacontract:checkout:orders-latest
info:
title: Orders Latest
Expand Down Expand Up @@ -62,19 +62,32 @@ models:
minLength: 10
maxLength: 20
customer_email_address:
description: The email address, as entered by the customer. The email address was not verified.
description: The email address, as entered by the customer.
type: text
format: email
required: true
pii: true
classification: sensitive
quality:
- type: text
name: The email address was verified by a user
processed_timestamp:
description: The timestamp when the record was processed by the data platform.
type: timestamp
required: true
config:
jsonType: string
jsonFormat: date-time
quality:
- type: sql
description: The maximum duration between two orders should be less than 3600 seconds
query: |
SELECT MAX(EXTRACT(EPOCH FROM (order_timestamp - LAG(order_timestamp) OVER (ORDER BY order_timestamp)))) AS max_duration
FROM orders
must_be_less_than: 3600
- type: row_count
engine: soda
must_be_greater_than: 5
line_items:
description: A single article that is part of an order.
type: table
Expand Down Expand Up @@ -180,13 +193,4 @@ servicelevels:
interval: weekly
cron: 0 0 * * 0
recoveryTime: 24 hours
recoveryPoint: 1 week
quality:
type: SodaCL # data quality check format: SodaCL, montecarlo, custom
specification: # expressed as string or inline yaml or via "$ref: checks.yaml"
checks for orders:
- row_count >= 5
- duplicate_count(order_id) = 0
checks for line_items:
- values in (order_id) must exist in orders (order_id)
- row_count >= 5
recoveryPoint: 1 week
292 changes: 194 additions & 98 deletions versions/0.9.3/README.md

Large diffs are not rendered by default.

207 changes: 98 additions & 109 deletions versions/0.9.3/datacontract.init.yaml
Original file line number Diff line number Diff line change
@@ -1,109 +1,98 @@
dataContractSpecification: 0.9.3
id: my-data-contract-id
info:
title: My Data Contract
version: 0.0.1
# description:
# owner:
# contact:
# name:
# url:
# email:


### servers

#servers:
# production:
# type: s3
# location: s3://
# format: parquet
# delimiter: new_line

### terms

#terms:
# usage:
# limitations:
# billing:
# noticePeriod:


### models

# models:
# my_model:
# description:
# type:
# fields:
# my_field:
# type:
# description:


### definitions

# definitions:
# my_field:
# domain:
# name:
# title:
# type:
# description:
# example:
# pii:
# classification:


### examples

#examples:
# - type: csv
# model: my_model
# data: |-
# id,timestamp,amount
# "1001","2023-09-09T08:30:00Z",2500
# "1002","2023-09-08T15:45:00Z",1800

### servicelevels

#servicelevels:
# availability:
# description: The server is available during support hours
# percentage: 99.9%
# retention:
# description: Data is retained for one year
# period: P1Y
# unlimited: false
# latency:
# description: Data is available within 25 hours after the order was placed
# threshold: 25h
# sourceTimestampField: orders.order_timestamp
# processedTimestampField: orders.processed_timestamp
# freshness:
# description: The age of the youngest row in a table.
# threshold: 25h
# timestampField: orders.order_timestamp
# frequency:
# description: Data is delivered once a day
# type: batch # or streaming
# interval: daily # for batch, either or cron
# cron: 0 0 * * * # for batch, either or interval
# support:
# description: The data is available during typical business hours at headquarters
# time: 9am to 5pm in EST on business days
# responseTime: 1h
# backup:
# description: Data is backed up once a week, every Sunday at 0:00 UTC.
# interval: weekly
# cron: 0 0 * * 0
# recoveryTime: 24 hours
# recoveryPoint: 1 week

### quality

#quality:
# type: SodaCL
# specification:
# checks for my_model: |-
# - duplicate_count(id) = 0
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"description": "Clear and concise explanations of syntax, semantic, and classification of business objects in a given domain.",
"properties": {
"domain": {
"type": "string",
"description": "The domain in which this definition is valid.",
"default": "global"
},
"name": {
"type": "string",
"description": "The technical name of this definition."
},
"title": {
"type": "string",
"description": "The business name of this definition."
},
"description": {
"type": "string",
"description": "Clear and concise explanations related to the domain."
},
"type": {
"type": "string",
"description": "The logical data type."
},
"minLength": {
"type": "integer",
"description": "The length of a value must be greater than or equal to this value. Applies only to string types."
},
"maxLength": {
"type": "integer",
"description": "The length of a value must be less than or equal to this value. Applies only to string types."
},
"format": {
"type": "string",
"description": "Specific format requirements for the value (e.g., 'email', 'uri', 'uuid')."
},
"precision": {
"type": "integer",
"examples": [
38
],
"description": "The maximum number of digits in a number. Only applies to numeric values. Defaults to 38."
},
"scale": {
"type": "integer",
"examples": [
0
],
"description": "The maximum number of decimal places in a number. Only applies to numeric values. Defaults to 0."
},
"pattern": {
"type": "string",
"description": "A regular expression pattern the value must match. Applies only to string types."
},
"example": {
"type": "string",
"description": "An example value."
},
"pii": {
"type": "boolean",
"description": "Indicates if the field contains Personally Identifiable Information (PII)."
},
"classification": {
"type": "string",
"description": "The data class defining the sensitivity level for this field."
},
"tags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Custom metadata to provide additional context."
},
"links": {
"type": "object",
"description": "Links to external resources.",
"minProperties": 1,
"propertyNames": {
"pattern": "^[a-zA-Z0-9_-]+$"
},
"additionalProperties": {
"type": "string",
"title": "Link",
"description": "A URL to an external resource.",
"format": "uri",
"examples": [
"https://example.com"
]
}
}
},
"required": [
"name",
"type"
]
}
Loading

0 comments on commit b9a976e

Please sign in to comment.