diff --git a/transform/dbt_project.yml b/transform/dbt_project.yml index 09d1f5ed..00c03239 100644 --- a/transform/dbt_project.yml +++ b/transform/dbt_project.yml @@ -27,3 +27,7 @@ models: +materialized: view clearinghouse: +schema: clearinghouse + intermediate: + +materialized: view + vds: + +schema: vds diff --git a/transform/models/_sources.yml b/transform/models/_sources.yml index ec26811d..3dd64453 100644 --- a/transform/models/_sources.yml +++ b/transform/models/_sources.yml @@ -59,7 +59,7 @@ sources: depending on the measurement capabilities of the detectors. columns: - name: FILENAME - description: The name of the file in the clearinghouse where the metdata was obtained from. + description: The name of the file in the clearinghouse where the data was obtained from. - name: ID description: Unique VDS (station identifier) value. Use this value to cross-reference with Metadata files. - name: SAMPLE_DATE diff --git a/transform/models/intermediate/vds/_vds.yml b/transform/models/intermediate/vds/_vds.yml new file mode 100644 index 00000000..52cbdc6e --- /dev/null +++ b/transform/models/intermediate/vds/_vds.yml @@ -0,0 +1,157 @@ +version: 2 + +models: + - name: int_vds__station_meta + description: | + Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/). + columns: + - name: META_DATE + description: Date of the metadata file. + - name: ID + description: | + An integer value that uniquely indenties the Station Metadata. + Use this value to 'join' other clearinghouse files that contain Station Metadata + tests: + - not_null + - name: ABSOLUTE_POSTMILE + description: Absolute postmile value along a partilcuar state route where the VDS is located + - name: STATE_POSTMILE + description: State Postmile value + - name: CITY + description: The city number where the VDS is located, if available + - name: COUNTY + description: The unique number that identifies the county that contains a specific VDS within PeMS. + - name: DIRECTION + description: A string indicating the freeway direction of a specific VDS. Directions are N, E, S or W. + - name: DISTRICT + description: The district in which the VDS is located. Values are 1-12 + - name: FREEWAY + description: The freeway where the VDS is located + - name: LANES + description: Total number of lanes for a specific VDS + - name: LATITUDE + description: Latitude of a specific VDS + - name: LENGTH + description: The length associated with a VDS + - name: LONGITUDE + description: Longitude of a specific VDS + - name: NAME + description: A descriptive name assocaited with a VDS + - name: TYPE + description: Two character string identify the VDS type + - name: _VALID_FROM + description: | + The date that this metadata became valid. Should be the same as `META_DATE`. + Finding the right metadata for a date should involve a check with + `_VALID_FROM <= date < _VALID_TO`. + - name: _VALID_TO + description: | + The date that this metadata became *invalid*. + Finding the right metadata for a date should involve a check with + `_VALID_FROM <= date < _VALID_TO`. + - name: int_vds__station_status + description: | + TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector + ID to station ID mapping. + columns: + - name: META_DATE + description: The date of the XML metadata file + - name: STATION_ID + description: The station ID + - name: DETECTOR_ID + description: | + The detector ID. In general, there can be several detectors for a single + station, corresponding to different lanes of traffic. + - name: DETECTOR_NAME + description: The name of the detector + - name: DETECTOR_STATUS + description: | + Detector status as reported by the XML file. Probably not useful when determining + *actual* status of the detector. + - name: LAST_UPDATE_TIME + description: The last updated time. This seems to be empty for the whole dataset. + - name: LANE_NUMBER + description: | + The lane number for the detector. This seems to be reported as + 1, 10, 100, 1000, 10000, etc for each successive lane, rather than counting up + from one. + - name: _VALID_FROM + description: | + The date that this metadata became valid. Should be the same as `META_DATE`. + Finding the right metadata for a date should involve a check with + `_VALID_FROM <= date < _VALID_TO`. + - name: _VALID_TO + description: | + The date that this metadata became *invalid*. + Finding the right metadata for a date should involve a check with + `_VALID_FROM <= date < _VALID_TO`. + - name: int_vds__most_recent_station_meta + description: | + Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/). + + This table is restricted to the most recent update for each district, any prior metadata + is ignored. + columns: + - name: META_DATE + description: Date of the metadata file. + - name: ID + description: | + An integer value that uniquely indenties the Station Metadata. + Use this value to 'join' other clearinghouse files that contain Station Metadata + tests: + - not_null + - name: ABSOLUTE_POSTMILE + description: Absolute postmile value along a partilcuar state route where the VDS is located + - name: STATE_POSTMILE + description: State Postmile value + - name: CITY + description: The city number where the VDS is located, if available + - name: COUNTY + description: The unique number that identifies the county that contains a specific VDS within PeMS. + - name: DIRECTION + description: A string indicating the freeway direction of a specific VDS. Directions are N, E, S or W. + - name: DISTRICT + description: The district in which the VDS is located. Values are 1-12 + - name: FREEWAY + description: The freeway where the VDS is located + - name: LANES + description: Total number of lanes for a specific VDS + - name: LATITUDE + description: Latitude of a specific VDS + - name: LENGTH + description: The length associated with a VDS + - name: LONGITUDE + description: Longitude of a specific VDS + - name: NAME + description: A descriptive name assocaited with a VDS + - name: TYPE + description: Two character string identify the VDS type + - name: int_vds__most_recent_station_status + description: | + TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector + ID to station ID mapping. + + This table is restricted to the most recent update for each district, any prior metadata + is ignored. + columns: + - name: META_DATE + description: The date of the XML metadata file + - name: STATION_ID + description: The station ID + - name: DETECTOR_ID + description: | + The detector ID. In general, there can be several detectors for a single + station, corresponding to different lanes of traffic. + - name: DETECTOR_NAME + description: The name of the detector + - name: DETECTOR_STATUS + description: | + Detector status as reported by the XML file. Probably not useful when determining + *actual* status of the detector. + - name: LAST_UPDATE_TIME + description: The last updated time. This seems to be empty for the whole dataset. + - name: LANE_NUMBER + description: | + The lane number for the detector. This seems to be reported as + 1, 10, 100, 1000, 10000, etc for each successive lane, rather than counting up + from one. diff --git a/transform/models/intermediate/vds/int_vds__most_recent_station_meta.sql b/transform/models/intermediate/vds/int_vds__most_recent_station_meta.sql new file mode 100644 index 00000000..ad72d1b2 --- /dev/null +++ b/transform/models/intermediate/vds/int_vds__most_recent_station_meta.sql @@ -0,0 +1,13 @@ +{{ config(materialized="ephemeral") }} + +with station_meta as ( + select * from {{ ref("int_vds__station_meta") }} +), + +most_recent_station_meta as ( + select * exclude (filename, _valid_from, _valid_to) + from station_meta + where _valid_to is null +) + +select * from most_recent_station_meta diff --git a/transform/models/intermediate/vds/int_vds__most_recent_station_status.sql b/transform/models/intermediate/vds/int_vds__most_recent_station_status.sql new file mode 100644 index 00000000..aee15e1e --- /dev/null +++ b/transform/models/intermediate/vds/int_vds__most_recent_station_status.sql @@ -0,0 +1,13 @@ +{{ config(materialized="ephemeral") }} + +with station_status as ( + select * from {{ ref("int_vds__station_status") }} +), + +most_recent_station_status as ( + select * exclude (filename, _valid_from, _valid_to) + from station_status + where _valid_to is null +) + +select * from most_recent_station_status diff --git a/transform/models/intermediate/vds/int_vds__station_meta.sql b/transform/models/intermediate/vds/int_vds__station_meta.sql new file mode 100644 index 00000000..fdcc3b86 --- /dev/null +++ b/transform/models/intermediate/vds/int_vds__station_meta.sql @@ -0,0 +1,33 @@ +with station_meta as ( + select * from {{ ref("stg_clearinghouse__station_meta") }} +), + +meta_dates as ( + select distinct + district, + meta_date + from station_meta +), + +validity_dates as ( + select + district, + meta_date, + meta_date as _valid_from, + lead(meta_date) over (partition by district order by meta_date asc) as _valid_to + from meta_dates +), + +station_meta_scd as ( + select + station_meta.*, + validity_dates._valid_from, + validity_dates._valid_to + from station_meta + inner join validity_dates + on + station_meta.meta_date = validity_dates.meta_date + and station_meta.district = validity_dates.district +) + +select * from station_meta_scd diff --git a/transform/models/intermediate/vds/int_vds__station_status.sql b/transform/models/intermediate/vds/int_vds__station_status.sql new file mode 100644 index 00000000..3b8c2524 --- /dev/null +++ b/transform/models/intermediate/vds/int_vds__station_status.sql @@ -0,0 +1,33 @@ +with station_status as ( + select * from {{ ref("stg_clearinghouse__station_status") }} +), + +status_dates as ( + select distinct + district, + meta_date + from station_status +), + +validity_dates as ( + select + district, + meta_date, + meta_date as _valid_from, + lead(meta_date) over (partition by district order by meta_date asc) as _valid_to + from status_dates +), + +station_status_scd as ( + select + station_status.*, + validity_dates._valid_from, + validity_dates._valid_to + from station_status + inner join validity_dates + on + station_status.meta_date = validity_dates.meta_date + and station_status.district = validity_dates.district +) + +select * from station_status_scd diff --git a/transform/models/staging/clearinghouse/_clearinghouse.yml b/transform/models/staging/clearinghouse/_clearinghouse.yml index d2c9d996..adb1c482 100644 --- a/transform/models/staging/clearinghouse/_clearinghouse.yml +++ b/transform/models/staging/clearinghouse/_clearinghouse.yml @@ -5,6 +5,8 @@ models: description: | Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/). columns: + - name: FILENAME + description: The name of the file in the clearinghouse where the metdata was obtained from. - name: META_DATE description: Date of the metadata file. - name: ID @@ -135,6 +137,8 @@ models: TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector ID to station ID mapping. columns: + - name: FILENAME + description: The name of the file in the clearinghouse where the metdata was obtained from. - name: META_DATE description: The date of the XML metadata file - name: STATION_ID diff --git a/transform/models/staging/clearinghouse/stg_clearinghouse__station_meta.sql b/transform/models/staging/clearinghouse/stg_clearinghouse__station_meta.sql index b40ed077..d988b463 100644 --- a/transform/models/staging/clearinghouse/stg_clearinghouse__station_meta.sql +++ b/transform/models/staging/clearinghouse/stg_clearinghouse__station_meta.sql @@ -1,6 +1,7 @@ -{% set date_re = 'clhouse/meta/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d\\\\d{2}_text_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).txt' %} +{% set date_re='clhouse/meta/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d\\\\d{2}_text_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).txt' %} SELECT + FILENAME, DATE_FROM_PARTS( REGEXP_SUBSTR(FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT, REGEXP_SUBSTR(FILENAME, '{{ date_re }}', 1, 1, '', 2)::INT, diff --git a/transform/models/staging/clearinghouse/stg_clearinghouse__station_raw.sql b/transform/models/staging/clearinghouse/stg_clearinghouse__station_raw.sql index 16caa8d7..151e3b94 100644 --- a/transform/models/staging/clearinghouse/stg_clearinghouse__station_raw.sql +++ b/transform/models/staging/clearinghouse/stg_clearinghouse__station_raw.sql @@ -1,4 +1,7 @@ +{% set district_re='clhouse/meta/d(\\\\d{2})' %} + SELECT + SUBSTR(FILENAME, 14, 2)::INT AS DISTRICT, ID, SAMPLE_DATE, SAMPLE_TIMESTAMP, @@ -22,5 +25,8 @@ SELECT SPEED_6, FLOW_7, OCCUPANCY_7, - SPEED_7 + SPEED_7, + FLOW_8, + OCCUPANCY_8, + SPEED_8 FROM {{ source('clearinghouse', 'station_raw') }} diff --git a/transform/models/staging/clearinghouse/stg_clearinghouse__station_status.sql b/transform/models/staging/clearinghouse/stg_clearinghouse__station_status.sql index a0cca51b..eb2bf0dd 100644 --- a/transform/models/staging/clearinghouse/stg_clearinghouse__station_status.sql +++ b/transform/models/staging/clearinghouse/stg_clearinghouse__station_status.sql @@ -1,14 +1,18 @@ -{% set date_re='clhouse/status/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d\\\\d{2}_tmdd_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).xml' %} +{% set + date_re='clhouse/status/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d(\\\\d{2})_tmdd_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).xml' +%} /* Helpful article for flattening XML: https://community.snowflake.com/s/article/HOW-TO-QUERY-NESTED-XML-DATA-IN-SNOWFLAKE */ SELECT + STATUS.FILENAME AS FILENAME, DATE_FROM_PARTS( - REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT, REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 2)::INT, - REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 3)::INT + REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 3)::INT, + REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 4)::INT ) AS META_DATE, + REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT AS DISTRICT, XMLGET(STATUS.CONTENT, 'station-id'):"$" AS STATION_ID, XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-id'):"$"::VARCHAR AS DETECTOR_ID, XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-name'):"$"::VARCHAR AS DETECTOR_NAME,