Skip to content

Commit

Permalink
Merge pull request #95 from cagov/clearinghouse-most-recent-station-meta
Browse files Browse the repository at this point in the history
Model for most recent clearinghouse metadata from each district
  • Loading branch information
ian-r-rose authored Mar 5, 2024
2 parents 59b9155 + bbdc330 commit 64d34cb
Show file tree
Hide file tree
Showing 11 changed files with 274 additions and 6 deletions.
4 changes: 4 additions & 0 deletions transform/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ models:
+materialized: view
clearinghouse:
+schema: clearinghouse
intermediate:
+materialized: view
vds:
+schema: vds
2 changes: 1 addition & 1 deletion transform/models/_sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ sources:
depending on the measurement capabilities of the detectors.
columns:
- name: FILENAME
description: The name of the file in the clearinghouse where the metdata was obtained from.
description: The name of the file in the clearinghouse where the data was obtained from.
- name: ID
description: Unique VDS (station identifier) value. Use this value to cross-reference with Metadata files.
- name: SAMPLE_DATE
Expand Down
157 changes: 157 additions & 0 deletions transform/models/intermediate/vds/_vds.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
version: 2

models:
- name: int_vds__station_meta
description: |
Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/).
columns:
- name: META_DATE
description: Date of the metadata file.
- name: ID
description: |
An integer value that uniquely indenties the Station Metadata.
Use this value to 'join' other clearinghouse files that contain Station Metadata
tests:
- not_null
- name: ABSOLUTE_POSTMILE
description: Absolute postmile value along a partilcuar state route where the VDS is located
- name: STATE_POSTMILE
description: State Postmile value
- name: CITY
description: The city number where the VDS is located, if available
- name: COUNTY
description: The unique number that identifies the county that contains a specific VDS within PeMS.
- name: DIRECTION
description: A string indicating the freeway direction of a specific VDS. Directions are N, E, S or W.
- name: DISTRICT
description: The district in which the VDS is located. Values are 1-12
- name: FREEWAY
description: The freeway where the VDS is located
- name: LANES
description: Total number of lanes for a specific VDS
- name: LATITUDE
description: Latitude of a specific VDS
- name: LENGTH
description: The length associated with a VDS
- name: LONGITUDE
description: Longitude of a specific VDS
- name: NAME
description: A descriptive name assocaited with a VDS
- name: TYPE
description: Two character string identify the VDS type
- name: _VALID_FROM
description: |
The date that this metadata became valid. Should be the same as `META_DATE`.
Finding the right metadata for a date should involve a check with
`_VALID_FROM <= date < _VALID_TO`.
- name: _VALID_TO
description: |
The date that this metadata became *invalid*.
Finding the right metadata for a date should involve a check with
`_VALID_FROM <= date < _VALID_TO`.
- name: int_vds__station_status
description: |
TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector
ID to station ID mapping.
columns:
- name: META_DATE
description: The date of the XML metadata file
- name: STATION_ID
description: The station ID
- name: DETECTOR_ID
description: |
The detector ID. In general, there can be several detectors for a single
station, corresponding to different lanes of traffic.
- name: DETECTOR_NAME
description: The name of the detector
- name: DETECTOR_STATUS
description: |
Detector status as reported by the XML file. Probably not useful when determining
*actual* status of the detector.
- name: LAST_UPDATE_TIME
description: The last updated time. This seems to be empty for the whole dataset.
- name: LANE_NUMBER
description: |
The lane number for the detector. This seems to be reported as
1, 10, 100, 1000, 10000, etc for each successive lane, rather than counting up
from one.
- name: _VALID_FROM
description: |
The date that this metadata became valid. Should be the same as `META_DATE`.
Finding the right metadata for a date should involve a check with
`_VALID_FROM <= date < _VALID_TO`.
- name: _VALID_TO
description: |
The date that this metadata became *invalid*.
Finding the right metadata for a date should involve a check with
`_VALID_FROM <= date < _VALID_TO`.
- name: int_vds__most_recent_station_meta
description: |
Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/).
This table is restricted to the most recent update for each district, any prior metadata
is ignored.
columns:
- name: META_DATE
description: Date of the metadata file.
- name: ID
description: |
An integer value that uniquely indenties the Station Metadata.
Use this value to 'join' other clearinghouse files that contain Station Metadata
tests:
- not_null
- name: ABSOLUTE_POSTMILE
description: Absolute postmile value along a partilcuar state route where the VDS is located
- name: STATE_POSTMILE
description: State Postmile value
- name: CITY
description: The city number where the VDS is located, if available
- name: COUNTY
description: The unique number that identifies the county that contains a specific VDS within PeMS.
- name: DIRECTION
description: A string indicating the freeway direction of a specific VDS. Directions are N, E, S or W.
- name: DISTRICT
description: The district in which the VDS is located. Values are 1-12
- name: FREEWAY
description: The freeway where the VDS is located
- name: LANES
description: Total number of lanes for a specific VDS
- name: LATITUDE
description: Latitude of a specific VDS
- name: LENGTH
description: The length associated with a VDS
- name: LONGITUDE
description: Longitude of a specific VDS
- name: NAME
description: A descriptive name assocaited with a VDS
- name: TYPE
description: Two character string identify the VDS type
- name: int_vds__most_recent_station_status
description: |
TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector
ID to station ID mapping.
This table is restricted to the most recent update for each district, any prior metadata
is ignored.
columns:
- name: META_DATE
description: The date of the XML metadata file
- name: STATION_ID
description: The station ID
- name: DETECTOR_ID
description: |
The detector ID. In general, there can be several detectors for a single
station, corresponding to different lanes of traffic.
- name: DETECTOR_NAME
description: The name of the detector
- name: DETECTOR_STATUS
description: |
Detector status as reported by the XML file. Probably not useful when determining
*actual* status of the detector.
- name: LAST_UPDATE_TIME
description: The last updated time. This seems to be empty for the whole dataset.
- name: LANE_NUMBER
description: |
The lane number for the detector. This seems to be reported as
1, 10, 100, 1000, 10000, etc for each successive lane, rather than counting up
from one.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{{ config(materialized="ephemeral") }}

with station_meta as (
select * from {{ ref("int_vds__station_meta") }}
),

most_recent_station_meta as (
select * exclude (filename, _valid_from, _valid_to)
from station_meta
where _valid_to is null
)

select * from most_recent_station_meta
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{{ config(materialized="ephemeral") }}

with station_status as (
select * from {{ ref("int_vds__station_status") }}
),

most_recent_station_status as (
select * exclude (filename, _valid_from, _valid_to)
from station_status
where _valid_to is null
)

select * from most_recent_station_status
33 changes: 33 additions & 0 deletions transform/models/intermediate/vds/int_vds__station_meta.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
with station_meta as (
select * from {{ ref("stg_clearinghouse__station_meta") }}
),

meta_dates as (
select distinct
district,
meta_date
from station_meta
),

validity_dates as (
select
district,
meta_date,
meta_date as _valid_from,
lead(meta_date) over (partition by district order by meta_date asc) as _valid_to
from meta_dates
),

station_meta_scd as (
select
station_meta.*,
validity_dates._valid_from,
validity_dates._valid_to
from station_meta
inner join validity_dates
on
station_meta.meta_date = validity_dates.meta_date
and station_meta.district = validity_dates.district
)

select * from station_meta_scd
33 changes: 33 additions & 0 deletions transform/models/intermediate/vds/int_vds__station_status.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
with station_status as (
select * from {{ ref("stg_clearinghouse__station_status") }}
),

status_dates as (
select distinct
district,
meta_date
from station_status
),

validity_dates as (
select
district,
meta_date,
meta_date as _valid_from,
lead(meta_date) over (partition by district order by meta_date asc) as _valid_to
from status_dates
),

station_status_scd as (
select
station_status.*,
validity_dates._valid_from,
validity_dates._valid_to
from station_status
inner join validity_dates
on
station_status.meta_date = validity_dates.meta_date
and station_status.district = validity_dates.district
)

select * from station_status_scd
4 changes: 4 additions & 0 deletions transform/models/staging/clearinghouse/_clearinghouse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ models:
description: |
Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/).
columns:
- name: FILENAME
description: The name of the file in the clearinghouse where the metdata was obtained from.
- name: META_DATE
description: Date of the metadata file.
- name: ID
Expand Down Expand Up @@ -135,6 +137,8 @@ models:
TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector
ID to station ID mapping.
columns:
- name: FILENAME
description: The name of the file in the clearinghouse where the metdata was obtained from.
- name: META_DATE
description: The date of the XML metadata file
- name: STATION_ID
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{% set date_re = 'clhouse/meta/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d\\\\d{2}_text_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).txt' %}
{% set date_re='clhouse/meta/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d\\\\d{2}_text_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).txt' %}

SELECT
FILENAME,
DATE_FROM_PARTS(
REGEXP_SUBSTR(FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT,
REGEXP_SUBSTR(FILENAME, '{{ date_re }}', 1, 1, '', 2)::INT,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
{% set district_re='clhouse/meta/d(\\\\d{2})' %}

SELECT
SUBSTR(FILENAME, 14, 2)::INT AS DISTRICT,
ID,
SAMPLE_DATE,
SAMPLE_TIMESTAMP,
Expand All @@ -22,5 +25,8 @@ SELECT
SPEED_6,
FLOW_7,
OCCUPANCY_7,
SPEED_7
SPEED_7,
FLOW_8,
OCCUPANCY_8,
SPEED_8
FROM {{ source('clearinghouse', 'station_raw') }}
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
{% set date_re='clhouse/status/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d\\\\d{2}_tmdd_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).xml' %}
{% set
date_re='clhouse/status/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d(\\\\d{2})_tmdd_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).xml'
%}
/*
Helpful article for flattening XML:
https://community.snowflake.com/s/article/HOW-TO-QUERY-NESTED-XML-DATA-IN-SNOWFLAKE
*/
SELECT
STATUS.FILENAME AS FILENAME,
DATE_FROM_PARTS(
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT,
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 2)::INT,
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 3)::INT
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 3)::INT,
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 4)::INT
) AS META_DATE,
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT AS DISTRICT,
XMLGET(STATUS.CONTENT, 'station-id'):"$" AS STATION_ID,
XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-id'):"$"::VARCHAR AS DETECTOR_ID,
XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-name'):"$"::VARCHAR AS DETECTOR_NAME,
Expand Down

0 comments on commit 64d34cb

Please sign in to comment.