diff --git a/docker/mount/datacontract-cli/datacontract.yaml b/docker/mount/datacontract-cli/datacontract.yaml new file mode 100644 index 0000000..ef1a291 --- /dev/null +++ b/docker/mount/datacontract-cli/datacontract.yaml @@ -0,0 +1,53 @@ +dataContractSpecification: 0.9.3 +id: covid_cases +info: + title: COVID-19 cases + description: Johns Hopkins University Consolidated data on COVID-19 cases, sourced from Enigma + version: "0.0.1" + links: + blog: https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/ + data-explorer: https://dj2taa9i652rf.cloudfront.net/ + data: https://covid19-lake.s3.us-east-2.amazonaws.com/enigma-jhu/json/part-00000-adec1cd2-96df-4c6b-a5f2-780f092951ba-c000.json +servers: + s3-json: + type: s3 + location: s3://covid19-lake/enigma-jhu/json/*.json + format: json + delimiter: new_line +models: + covid_cases: + description: the number of confirmed covid cases reported for a specified region, with location and county/province/country information. + fields: + fips: + type: string + description: state and county two digits code + admin2: + type: string + description: county name + province_state: + type: string + description: province name or state name + country_region: + type: string + description: country name or region name + last_update: + type: timestamp_ntz + description: last update timestamp + latitude: + type: double + description: location (latitude) + longitude: + type: double + description: location (longitude) + confirmed: + type: int + description: number of confirmed cases + combined_key: + type: string + description: county name+state name +quality: + type: SodaCL + specification: + checks for covid_cases: + - freshness(last_update::datetime) < 5000d # dataset is not updated anymore + - row_count > 1000 \ No newline at end of file diff --git a/src/main/scala/io/github/datacatering/plan/AdvancedDataContractCliPlanRun.scala b/src/main/scala/io/github/datacatering/plan/AdvancedDataContractCliPlanRun.scala new file mode 100644 index 0000000..9811dfd --- /dev/null +++ b/src/main/scala/io/github/datacatering/plan/AdvancedDataContractCliPlanRun.scala @@ -0,0 +1,20 @@ +package io.github.datacatering.plan + +import io.github.datacatering.datacaterer.api.PlanRun + +class AdvancedDataContractCliPlanRun extends PlanRun { + + val accountTask = csv("customer_accounts", "/opt/app/data/customer/account-datacontract-cli", Map("header" -> "true")) + .schema(metadataSource.dataContractCli("/opt/app/mount/datacontract-cli/datacontract.yaml")) + .schema( + field.name("latitude").min(-90).max(90), + field.name("longitude").min(-180).max(180), + field.name("country_region").expression("#{Address.state}") + ) + .count(count.records(100)) + + val conf = configuration.enableGeneratePlanAndTasks(true) + .generatedReportsFolderPath("/opt/app/data/report") + + execute(conf, accountTask) +}