Skip to content

Commit

Permalink
Update to 0.12.2, add in ODCS v3 example
Browse files Browse the repository at this point in the history
  • Loading branch information
pflooky committed Nov 30, 2024
1 parent c3c2b76 commit 7edf562
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
FROM datacatering/data-caterer:0.12.1
FROM datacatering/data-caterer:0.12.2

COPY --chown=app:app build/libs/data-caterer-example-0.1.0.jar /opt/app/job.jar
236 changes: 236 additions & 0 deletions docker/mount/odcs/full-example-v3.odcs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# Fundamentals: what this data contract is about
domain: seller  # Domain
dataProduct: my quantum  # Data product name
version: 1.1.0  # Version of this contract (follows semantic versioning)
status: active
id: 53581432-6c55-4ba2-a65f-72344a91553a

# Human-readable description; the keys below are nested under `description`
description:
  purpose: Views built on top of the seller tables.
  limitations: Data based on seller perspective, no buyer information
  usage: Predict sales over time
  # NOTE(review): written as a single mapping here, not a list - confirm the
  # consumer accepts this shape.
  authoritativeDefinitions:
    type: privacy-statement
    url: https://example.com/gdpr.pdf
tenant: ClimateQuantumInc

kind: DataContract
apiVersion: v3.0.0  # Version of the standard itself (follows semantic versioning)

# Infrastructure & servers: one list entry per server
servers:
  - server: my-postgres
    type: postgres
    host: localhost
    port: 5432
    database: pypl-edw
    schema: pp_access_views

# Dataset, schema and quality.
# Each `schema` entry is one object (here a table); its columns are listed
# under `properties`. `quality` appears at two levels: inside a property
# (column-level rule) and on the schema object itself (table-level rule).
schema:
  - name: tbl
    physicalName: tbl_1
    physicalType: table
    description: Provides core payment metrics
    authoritativeDefinitions:
      - url: https://catalog.data.gov/dataset/air-quality
        type: businessDefinition
      - url: https://youtu.be/jbY1BKFj9ec
        type: videoTutorial
    tags: [ ]
    dataGranularityDescription: Aggregation on columns txn_ref_dt, pmt_txn_id
    properties:
      - name: txn_ref_dt
        primaryKey: false
        primaryKeyPosition: -1
        businessName: transaction reference date
        logicalType: date
        physicalType: date
        required: false
        description: Reference date for transaction
        partitioned: true
        partitionKeyPosition: 1
        criticalDataElement: false
        tags: [ ]
        classification: public
        transformSourceObjects:
          - table_name_1
          - table_name_2
          - table_name_3
        transformLogic: sel t1.txn_dt as txn_ref_dt from table_name_1 as t1, table_name_2 as t2, table_name_3 as t3 where t1.txn_dt=date-3
        transformDescription: defines the logic in business terms; logic for dummies
        examples:
          - "2022-10-03"
          - "2020-01-28"
        customProperties:
          - property: anonymizationStrategy
            value: none
      - name: rcvr_id
        primaryKey: true
        primaryKeyPosition: 1
        businessName: receiver id
        logicalType: string
        physicalType: varchar(18)
        required: false
        description: A description for column rcvr_id.
        partitioned: false
        partitionKeyPosition: -1
        criticalDataElement: false
        tags: [ ]
        classification: restricted
      - name: rcvr_cntry_code
        primaryKey: false
        primaryKeyPosition: -1
        businessName: receiver country code
        logicalType: string
        physicalType: varchar(2)
        required: false
        description: Country code
        partitioned: false
        partitionKeyPosition: -1
        criticalDataElement: false
        tags: [ ]
        classification: public
        authoritativeDefinitions:
          - url: https://collibra.com/asset/742b358f-71a5-4ab1-bda4-dcdba9418c25
            type: businessDefinition
          - url: https://github.com/myorg/myrepo
            type: transformationImplementation
          - url: jdbc:postgresql://localhost:5432/adventureworks/tbl_1/rcvr_cntry_code
            type: implementation
        encryptedName: rcvr_cntry_code_encrypted
        # Column-level quality rule (applies to rcvr_cntry_code only)
        quality:
          - rule: nullCheck
            description: column should not contain null values
            dimension: completeness  # one of 7 allowed dimension values
            type: library
            severity: error
            businessImpact: operational
            schedule: 0 20 * * *
            scheduler: cron
            customProperties:
              - property: FIELD_NAME
                value:  # no value supplied (parses as null)
              - property: COMPARE_TO
                value:  # no value supplied (parses as null)
              - property: COMPARISON_TYPE
                value: Greater than
    # Table-level quality rule (applies to the whole `tbl` object)
    quality:
      - rule: countCheck
        type: library
        description: Ensure row count is within expected volume range
        dimension: completeness
        method: reconciliation
        severity: error
        businessImpact: operational
        schedule: 0 20 * * *
        scheduler: cron
        customProperties:
          - property: business-key
            value:
              - txn_ref_dt
              - rcvr_id


# Pricing: amount charged per unit of data
price:
  priceAmount: 9.95
  priceCurrency: USD
  priceUnit: megabyte


# Team: one entry per member; dateOut/replacedByUsername mark a hand-over
team:
  - username: ceastwood
    role: Data Scientist
    dateIn: "2022-08-02"
    dateOut: "2022-10-01"
    replacedByUsername: mhopper
  - username: mhopper
    role: Data Scientist
    dateIn: "2022-10-01"
  - username: daustin
    role: Owner
    comment: Keeper of the grail
    dateIn: "2022-10-01"


# Roles: access level and approvers for each role
roles:
  - role: microstrategy_user_opr
    access: read
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: 'mandolorian'
  - role: bq_queryman_user_opr
    access: read
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: na
  - role: risk_data_access_opr
    access: read
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: 'dathvador'
  - role: bq_unica_user_opr
    access: write
    firstLevelApprovers: Reporting Manager
    secondLevelApprovers: 'mickey'

# SLA
# NOTE(review): the elements below reference `tab1`, while the schema object in
# this contract is named `tbl` - confirm which name is intended.
slaDefaultElement: tab1.txn_ref_dt
slaProperties:
  - property: latency  # Property, see list of values in DP QoS
    value: 4
    unit: d  # d, day, days for days; y, yr, years for years
    element: tab1.txn_ref_dt  # Optional here: same table.column as slaDefaultElement
  - property: generalAvailability
    value: "2022-05-12T09:30:10-08:00"
  - property: endOfSupport
    value: "2032-05-12T09:30:10-08:00"
  - property: endOfLife
    value: "2042-05-12T09:30:10-08:00"
  - property: retention
    value: 3
    unit: y
    element: tab1.txn_ref_dt
  - property: frequency
    value: 1
    valueExt: 1
    unit: d
    element: tab1.txn_ref_dt
  - property: timeOfAvailability
    # Quoted so the digits-and-colons value is always read as a string
    value: "09:00-08:00"
    element: tab1.txn_ref_dt
    driver: regulatory  # Importance of the SLA: [regulatory|analytics|operational|...]
  - property: timeOfAvailability
    value: "08:00-08:00"
    element: tab1.txn_ref_dt
    driver: analytics


# Support: channels through which consumers can get help
support:
  - channel: '#product-help'  # Simple Slack communication channel
    tool: slack
    url: https://aidaug.slack.com/archives/C05UZRSBKLY
  - channel: datacontract-ann  # Simple distribution list
    tool: email
    # NOTE(review): the address was replaced by an "[email protected]"
    # obfuscation placeholder; restored from the upstream ODCS example - confirm.
    url: mailto:datacontract-ann@bitol.io
  - channel: Feedback  # Product Feedback
    description: General Product Feedback (Public)
    url: https://product-feedback.com

# Tags applied to the contract as a whole
tags:
  - transactions


# Custom properties: free-form property/value pairs attached to the contract
customProperties:
  - property: refRulesetName
    value: gcsc.ruleset.name
  - property: somePropertyName
    value: property.value
  - property: dataprocClusterName  # Used for specific applications like Elevate
    value: [ cluster name ]

contractCreatedTs: "2022-11-15T02:59:43+00:00"
3 changes: 1 addition & 2 deletions gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,5 @@ version=0.1.0

scalaVersion=2.12
scalaSpecificVersion=2.12.19
dataCatererVersion=0.12.1
sparkVersion=3.5.1
dataCatererVersion=0.12.2
sparkMajorVersion=3.5
2 changes: 1 addition & 1 deletion helm/data-caterer/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ image:
repository: "datacatering/data-caterer"
pullPolicy: "IfNotPresent"
# Overrides the image tag whose default is the chart appVersion.
tag: "0.12.1"
tag: "0.12.2"

imagePullSecrets: []
nameOverride: ""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package io.github.datacatering.plan

import io.github.datacatering.datacaterer.api.PlanRun

/**
 * Example plan that writes customer account records to CSV, taking the
 * schema from an Open Data Contract Standard (ODCS) v3 contract file.
 */
class AdvancedODCSV3PlanRun extends PlanRun {

  // CSV task (header option set to "true") whose schema comes from the ODCS
  // metadata source and whose record count is set to 100.
  val accountTask = {
    val csvTarget = csv("customer_accounts", "/opt/app/data/customer/account-odcs-v3", Map("header" -> "true"))
    val withContractSchema = csvTarget.schema(
      metadataSource.openDataContractStandard("/opt/app/mount/odcs/full-example-v3.odcs.yaml")
    )
    withContractSchema.count(count.records(100))
  }

  // Configuration: turns on the generate-plan-and-tasks flag and sets the
  // folder path for generated reports.
  val conf = {
    val planGenerationEnabled = configuration.enableGeneratePlanAndTasks(true)
    planGenerationEnabled.generatedReportsFolderPath("/opt/app/data/report")
  }

  execute(conf, accountTask)
}

0 comments on commit 7edf562

Please sign in to comment.