Skip to content

Commit

Permalink
Add in Data Contract CLI example
Browse files Browse the repository at this point in the history
  • Loading branch information
pflooky committed Oct 9, 2024
1 parent e314a8d commit b0f03fb
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
53 changes: 53 additions & 0 deletions docker/mount/datacontract-cli/datacontract.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
dataContractSpecification: 0.9.3
id: covid_cases
info:
title: COVID-19 cases
description: Johns Hopkins University Consolidated data on COVID-19 cases, sourced from Enigma
version: "0.0.1"
links:
blog: https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/
data-explorer: https://dj2taa9i652rf.cloudfront.net/
data: https://covid19-lake.s3.us-east-2.amazonaws.com/enigma-jhu/json/part-00000-adec1cd2-96df-4c6b-a5f2-780f092951ba-c000.json
servers:
s3-json:
type: s3
location: s3://covid19-lake/enigma-jhu/json/*.json
format: json
delimiter: new_line
models:
covid_cases:
description: the number of confirmed covid cases reported for a specified region, with location and county/province/country information.
fields:
fips:
type: string
description: state and county two digits code
admin2:
type: string
description: county name
province_state:
type: string
description: province name or state name
country_region:
type: string
description: country name or region name
last_update:
type: timestamp_ntz
description: last update timestamp
latitude:
type: double
description: location (latitude)
longitude:
type: double
description: location (longitude)
confirmed:
type: int
description: number of confirmed cases
combined_key:
type: string
description: county name+state name
quality:
type: SodaCL
specification:
checks for covid_cases:
- freshness(last_update::datetime) < 5000d # large threshold because the dataset is no longer updated
- row_count > 1000
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package io.github.datacatering.plan

import io.github.datacatering.datacaterer.api.PlanRun

/**
 * Example plan that writes generated CSV records using a schema sourced from a
 * Data Contract CLI file, with a few fields defined explicitly on top of it.
 *
 * The plan is executed as part of constructing this class.
 */
class AdvancedDataContractCliPlanRun extends PlanRun {

  /** CSV generation task: contract-based schema, three explicit fields, 100 records. */
  val accountTask = {
    // CSV output location; the "header" option is passed through as given.
    val csvTarget = csv(
      "customer_accounts",
      "/opt/app/data/customer/account-datacontract-cli",
      Map("header" -> "true")
    )

    // Schema is read from the data contract file mounted into the container.
    val withContractSchema = csvTarget.schema(
      metadataSource.dataContractCli("/opt/app/mount/datacontract-cli/datacontract.yaml")
    )

    // Explicit field definitions supplied alongside the contract-based schema
    // (presumably overriding the matching contract fields -- confirm against the library docs).
    val latitudeField = field.name("latitude").min(-90).max(90)
    val longitudeField = field.name("longitude").min(-180).max(180)
    // NOTE(review): "#{Address.state}" looks like a faker-style expression -- confirm.
    val countryRegionField = field.name("country_region").expression("#{Address.state}")

    withContractSchema
      .schema(latitudeField, longitudeField, countryRegionField)
      .count(count.records(100))
  }

  /** Run configuration: enables plan/task generation and sets the report output folder. */
  val conf = configuration
    .enableGeneratePlanAndTasks(true)
    .generatedReportsFolderPath("/opt/app/data/report")

  execute(conf, accountTask)
}

0 comments on commit b0f03fb

Please sign in to comment.