forked from pflooky/data-caterer-example
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update to 0.12.2, add in ODCS v3 example
- Loading branch information
Showing 5 changed files with 254 additions and 4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
-FROM datacatering/data-caterer:0.12.1 | ||
+FROM datacatering/data-caterer:0.12.2 | ||
|
||
COPY --chown=app:app build/libs/data-caterer-example-0.1.0.jar /opt/app/job.jar |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
# Open Data Contract Standard (ODCS) v3 example contract.
# Nesting below is reconstructed from the key order of the flattened listing;
# keys that cannot legally repeat at one level were placed under the nearest
# enclosing parent.

# What's this data contract about?
domain: seller # Domain
dataProduct: my quantum # Data product name
version: 1.1.0 # Version (follows semantic versioning)
status: active
id: 53581432-6c55-4ba2-a65f-72344a91553a

# Lots of information
description:
  purpose: Views built on top of the seller tables.
  limitations: Data based on seller perspective, no buyer information
  usage: Predict sales over time
  authoritativeDefinitions:
    type: privacy-statement
    url: https://example.com/gdpr.pdf
tenant: ClimateQuantumInc

kind: DataContract
apiVersion: v3.0.0 # Standard version (follows semantic versioning)

# Infrastructure & servers
servers:
  - server: my-postgres
    type: postgres
    host: localhost
    port: 5432
    database: pypl-edw
    schema: pp_access_views

# Dataset, schema and quality
schema:
  - name: tbl
    physicalName: tbl_1
    physicalType: table
    description: Provides core payment metrics
    authoritativeDefinitions:
      - url: https://catalog.data.gov/dataset/air-quality
        type: businessDefinition
      - url: https://youtu.be/jbY1BKFj9ec
        type: videoTutorial
    tags: [ ]
    dataGranularityDescription: Aggregation on columns txn_ref_dt, pmt_txn_id
    properties:
      - name: txn_ref_dt
        primaryKey: false
        primaryKeyPosition: -1
        businessName: transaction reference date
        logicalType: date
        physicalType: date
        required: false
        description: Reference date for transaction
        partitioned: true
        partitionKeyPosition: 1
        criticalDataElement: false
        tags: [ ]
        classification: public
        transformSourceObjects:
          - table_name_1
          - table_name_2
          - table_name_3
        transformLogic: sel t1.txn_dt as txn_ref_dt from table_name_1 as t1, table_name_2 as t2, table_name_3 as t3 where t1.txn_dt=date-3
        transformDescription: defines the logic in business terms; logic for dummies
        examples:
          - "2022-10-03"
          - "2020-01-28"
        customProperties:
          - property: anonymizationStrategy
            value: none
      - name: rcvr_id
        primaryKey: true
        primaryKeyPosition: 1
        businessName: receiver id
        logicalType: string
        physicalType: varchar(18)
        required: false
        description: A description for column rcvr_id.
        partitioned: false
        partitionKeyPosition: -1
        criticalDataElement: false
        tags: [ ]
        classification: restricted
      - name: rcvr_cntry_code
        primaryKey: false
        primaryKeyPosition: -1
        businessName: receiver country code
        logicalType: string
        physicalType: varchar(2)
        required: false
        description: Country code
        partitioned: false
        partitionKeyPosition: -1
        criticalDataElement: false
        tags: [ ]
        classification: public
        authoritativeDefinitions:
          - url: https://collibra.com/asset/742b358f-71a5-4ab1-bda4-dcdba9418c25
            type: businessDefinition
          - url: https://github.com/myorg/myrepo
            type: transformationImplementation
          - url: jdbc:postgresql://localhost:5432/adventureworks/tbl_1/rcvr_cntry_code
            type: implementation
        encryptedName: rcvr_cntry_code_encrypted
        # Quality rules attached to this property
        quality:
          - rule: nullCheck
            description: column should not contain null values
            dimension: completeness # dropdown 7 values
            type: library
            severity: error
            businessImpact: operational
            schedule: 0 20 * * *
            scheduler: cron
            customProperties:
              - property: FIELD_NAME
                value:
              - property: COMPARE_TO
                value:
              - property: COMPARISON_TYPE
                value: Greater than
    # Quality rules attached to the schema object itself
    # NOTE(review): placement inferred; a second `quality` key cannot sit beside the property-level one — confirm.
    quality:
      - rule: countCheck
        type: library
        description: Ensure row count is within expected volume range
        dimension: completeness
        method: reconciliation
        severity: error
        businessImpact: operational
        schedule: 0 20 * * *
        scheduler: cron
        customProperties:
          - property: business-key
            value:
              - txn_ref_dt
              - rcvr_id

# Pricing
price:
  priceAmount: 9.95
  priceCurrency: USD
  priceUnit: megabyte

# Team
team:
  - username: ceastwood
    role: Data Scientist
    dateIn: "2022-08-02"
    dateOut: "2022-10-01"
    replacedByUsername: mhopper
  - username: mhopper
    role: Data Scientist
    dateIn: "2022-10-01"
  - username: daustin
    role: Owner
    comment: Keeper of the grail
    dateIn: "2022-10-01"

# Roles
roles:
  - role: microstrategy_user_opr
    access: read
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: 'mandolorian'
  - role: bq_queryman_user_opr
    access: read
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: na
  - role: risk_data_access_opr
    access: read
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: 'dathvador'
  - role: bq_unica_user_opr
    access: write
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: 'mickey'

# SLA
# NOTE(review): element paths below use `tab1`, but the only schema object defined
# in this contract is named `tbl` — confirm which name is intended.
slaDefaultElement: tab1.txn_ref_dt
slaProperties:
  - property: latency # Property, see list of values in DP QoS
    value: 4
    unit: d # d, day, days for days; y, yr, years for years
    element: tab1.txn_ref_dt # This would not be needed as it is the same table.column as the default one
  - property: generalAvailability
    value: "2022-05-12T09:30:10-08:00"
  - property: endOfSupport
    value: "2032-05-12T09:30:10-08:00"
  - property: endOfLife
    value: "2042-05-12T09:30:10-08:00"
  - property: retention
    value: 3
    unit: y
    element: tab1.txn_ref_dt
  - property: frequency
    value: 1
    valueExt: 1
    unit: d
    element: tab1.txn_ref_dt
  - property: timeOfAvailability
    value: 09:00-08:00
    element: tab1.txn_ref_dt
    driver: regulatory # Describes the importance of the SLA: [regulatory|analytics|operational|...]
  - property: timeOfAvailability
    value: 08:00-08:00
    element: tab1.txn_ref_dt
    driver: analytics

# Support
support:
  - channel: '#product-help' # Simple Slack communication channel
    tool: slack
    url: https://aidaug.slack.com/archives/C05UZRSBKLY
  - channel: datacontract-ann # Simple distribution list
    tool: email
    # NOTE(review): address restored from an obfuscated "[email protected]" placeholder — confirm.
    url: mailto:datacontract-ann@bitol.io
  - channel: Feedback # Product Feedback
    description: General Product Feedback (Public)
    url: https://product-feedback.com

# Tags
tags:
  - transactions

# Custom properties
customProperties:
  - property: refRulesetName
    value: gcsc.ruleset.name
  - property: somePropertyName
    value: property.value
  - property: dataprocClusterName # Used for specific applications like Elevate
    value: [ cluster name ]

contractCreatedTs: "2022-11-15T02:59:43+00:00"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
src/main/scala/io/github/datacatering/plan/AdvancedODCSV3PlanRun.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package io.github.datacatering.plan | ||
|
||
import io.github.datacatering.datacaterer.api.PlanRun | ||
|
||
/**
 * Plan run whose CSV step takes its schema from an Open Data Contract Standard
 * (ODCS) file instead of declaring fields inline.
 *
 * All paths are hard-coded to locations under `/opt/app`.
 */
class AdvancedODCSV3PlanRun extends PlanRun {

  // CSV step named "customer_accounts" with the "header" option set to "true";
  // the schema is supplied by the ODCS contract file and 100 records are requested.
  val accountTask = {
    val accountsCsv = csv("customer_accounts", "/opt/app/data/customer/account-odcs-v3", Map("header" -> "true"))
    val contractSchema = metadataSource.openDataContractStandard("/opt/app/mount/odcs/full-example-v3.odcs.yaml")
    val requestedRecords = count.records(100)
    accountsCsv.schema(contractSchema).count(requestedRecords)
  }

  // Turns on plan/task generation and sets the folder used for generated reports.
  val conf = {
    val withPlanAndTaskGeneration = configuration.enableGeneratePlanAndTasks(true)
    withPlanAndTaskGeneration.generatedReportsFolderPath("/opt/app/data/report")
  }

  execute(conf, accountTask)
}