Skip to content

Commit

Permalink
Merge pull request #124 from cagov/unpivot-vds-data
Browse files Browse the repository at this point in the history
Unpivot VDS Data
  • Loading branch information
britt-allen authored Mar 22, 2024
2 parents d610b69 + d1b3542 commit 95c7a5f
Show file tree
Hide file tree
Showing 10 changed files with 81 additions and 170 deletions.
4 changes: 2 additions & 2 deletions transform/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ models:
+schema: clearinghouse
intermediate:
+materialized: view
vds:
+schema: vds
clearinghouse:
+schema: clearinghouse
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: 2

models:
- name: int_vds__station_meta
- name: int_clearinghouse__station_meta
description: |
Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/).
columns:
Expand Down Expand Up @@ -49,7 +49,7 @@ models:
The date that this metadata became *invalid*.
Finding the right metadata for a date should involve a check with
`_VALID_FROM <= date < _VALID_TO`.
- name: int_vds__station_status
- name: int_clearinghouse__station_status
description: |
TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector
ID to station ID mapping.
Expand Down Expand Up @@ -85,7 +85,7 @@ models:
The date that this metadata became *invalid*.
Finding the right metadata for a date should involve a check with
`_VALID_FROM <= date < _VALID_TO`.
- name: int_vds__most_recent_station_meta
- name: int_clearinghouse__most_recent_station_meta
description: |
Vehicle detector station metadata from the [PeMS Clearinghouse feeds](https://pems.dot.ca.gov/feeds/clhouse/).
Expand Down Expand Up @@ -126,7 +126,7 @@ models:
description: A descriptive name assocaited with a VDS
- name: TYPE
description: Two character string identify the VDS type
- name: int_vds__most_recent_station_status
- name: int_clearinghouse__most_recent_station_status
description: |
TMDD Status data from PeMS Clearinghouse. This seems to be the main source of detector
ID to station ID mapping.
Expand Down Expand Up @@ -154,48 +154,22 @@ models:
description: |
The lane number for the detector. This seems to be reported as
1, 10, 100, 1000, 10000, etc for each successive lane, rather than counting up
from one.
- name: int_vds__five_minute_station_agg
from one
- name: int_clearinghouse__five_minute_station_agg
description: |
Raw 30 second sample data aggregated to the 5 minute level.
Flow/volume numbers are summed, occupancy and speed numbers are averaged.
Volume values are summed, occupancy is averaged.
columns:
- name: ID
description: The station ID
- name: SAMPLE_DATE
description: The date on which the sample was taken
- name: SAMPLE_TIMESTAMP
description: The timestamp of the start for the 5 minute aggregated samples.
- name: FLOW_1
description: The sum of the flow values over the sample period.
- name: FLOW_2
description: The sum of the flow values over the sample period.
- name: FLOW_3
description: The sum of the flow values over the sample period.
- name: FLOW_4
description: The sum of the flow values over the sample period.
- name: FLOW_5
description: The sum of the flow values over the sample period.
- name: FLOW_6
description: The sum of the flow values over the sample period.
- name: FLOW_7
description: The sum of the flow values over the sample period.
- name: FLOW_8
description: The sum of the flow values over the sample period.
- name: OCCUPANCY_1
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_2
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_3
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_4
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_5
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_1
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_7
description: The average of the occupancy values over the sample period.
- name: OCCUPANCY_8
- name: LANE
description: |
The lane number of the detector, starting from 1
- name: VOLUME
description: The sum of the flow values for a detector over the sample period.
- name: OCCUPANCY
description: The average of the occupancy values over the sample period.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{ config(
materialized="incremental",
cluster_by=["sample_date"],
unique_key=["ID", "SAMPLE_TIMESTAMP"],
unique_key=["ID", "LANE", "SAMPLE_TIMESTAMP"],
snowflake_warehouse=get_snowflake_refresh_warehouse()
) }}

Expand Down Expand Up @@ -33,26 +33,11 @@ aggregated as (
id,
sample_date,
sample_timestamp_trunc as sample_timestamp,
-- Sum of all the flow values
sum(flow_1) as flow_1,
sum(flow_2) as flow_2,
sum(flow_3) as flow_3,
sum(flow_4) as flow_4,
sum(flow_5) as flow_5,
sum(flow_6) as flow_6,
sum(flow_7) as flow_7,
sum(flow_8) as flow_8,
-- Average of all the occupancy values
avg(occupancy_1) as occupancy_1,
avg(occupancy_2) as occupancy_2,
avg(occupancy_3) as occupancy_3,
avg(occupancy_4) as occupancy_4,
avg(occupancy_5) as occupancy_5,
avg(occupancy_6) as occupancy_6,
avg(occupancy_7) as occupancy_7,
avg(occupancy_8) as occupancy_8
lane,
sum(volume) as volume, -- Sum of all the flow values
avg(occupancy) as occupancy -- Average of all the occupancy values
from station_raw
group by id, sample_date, sample_timestamp_trunc
group by id, lane, sample_date, sample_timestamp_trunc
)

select * from aggregated
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{ config(materialized="ephemeral") }}

with station_meta as (
select * from {{ ref("int_vds__station_meta") }}
select * from {{ ref("int_clearinghouse__station_meta") }}
),

most_recent_station_meta as (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{ config(materialized="ephemeral") }}

with station_status as (
select * from {{ ref("int_vds__station_status") }}
select * from {{ ref("int_clearinghouse__station_status") }}
),

most_recent_station_status as (
Expand Down
84 changes: 8 additions & 76 deletions transform/models/staging/clearinghouse/_clearinghouse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,85 +52,17 @@ models:
description: 30 second raw data sample time as reported by the field element as YYYY/MM/DD.
- name: SAMPLE_TIMESTAMP
description: 30 second raw data sample time as reported by the field element as YYYY/MM/DD HH24:MI:SS.
- name: FLOW_1
- name: LANE
description: The lane number for the detector, starting from 1.
- name: VOLUME
description: |
Number of vehicle that passed over the detector during the sample period for lane 1.
Number of vehicle that passed over the detector during the sample period for the lane.
Units are Vehicles/sample period.
- name: FLOW_2
- name: OCCUPANCY
description: Occupancy of the lane during the sample period expressed as a decimal number between 0 and 1.
- name: SPEED
description: |
Number of vehicle that passed over the detector during the sample period for lane 2.
Units are Vehicles/sample period.
- name: FLOW_3
description: |
Number of vehicle that passed over the detector during the sample period for lane 3.
Units are Vehicles/sample period.
- name: FLOW_4
description: |
Number of vehicle that passed over the detector during the sample period for lane 4.
Units are Vehicles/sample period.
- name: FLOW_5
description: |
Number of vehicle that passed over the detector during the sample period for lane 5.
Units are Vehicles/sample period.
- name: FLOW_6
description: |
Number of vehicle that passed over the detector during the sample period for lane 6.
Units are Vehicles/sample period.
- name: FLOW_7
description: |
Number of vehicle that passed over the detector during the sample period for lane 7.
Units are Vehicles/sample period.
- name: FLOW_8
description: |
Number of vehicle that passed over the detector during the sample period for lane 8.
Units are Vehicles/sample period.
- name: OCCUPANCY_1
description: Occupancy of the lane 1 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_2
description: Occupancy of the lane 2 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_3
description: Occupancy of the lane 3 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_4
description: Occupancy of the lane 4 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_5
description: Occupancy of the lane 5 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_6
description: Occupancy of the lane 6 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_7
description: Occupancy of the lane 7 during the sample period expressed as a decimal number between 0 and 1.
- name: OCCUPANCY_8
description: Occupancy of the lane 8 during the sample period expressed as a decimal number between 0 and 1.
- name: SPEED_1
description: |
Speed in lane 1 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_2
description: |
Speed in lane 2 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_3
description: |
Speed in lane 3 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_4
description: |
Speed in lane 4 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_5
description: |
Speed in lane 5 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_6
description: |
Speed in lane 6 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_7
description: |
Speed in lane 7 as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: SPEED_8
description: |
Speed in lane 8 as measured by the detector in miles/hour.
Speed in the lane as measured by the detector in miles/hour.
Empty if the detector does not report speed.
- name: stg_clearinghouse__station_status
description: |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,32 +1,51 @@
{% set district_re='clhouse/meta/d(\\\\d{2})' %}

SELECT
SUBSTR(FILENAME, 14, 2)::INT AS DISTRICT,
ID,
SAMPLE_DATE,
SAMPLE_TIMESTAMP,
FLOW_1,
OCCUPANCY_1,
SPEED_1,
FLOW_2,
OCCUPANCY_2,
SPEED_2,
FLOW_3,
OCCUPANCY_3,
SPEED_3,
FLOW_4,
OCCUPANCY_4,
SPEED_4,
FLOW_5,
OCCUPANCY_5,
SPEED_5,
FLOW_6,
OCCUPANCY_6,
SPEED_6,
FLOW_7,
OCCUPANCY_7,
SPEED_7,
FLOW_8,
OCCUPANCY_8,
SPEED_8
FROM {{ source('clearinghouse', 'station_raw') }}
select
substr(s.filename, 14, 2)::int as district,
s.id,
s.sample_date,
s.sample_timestamp,
lane.value as lane,
-- neat trick to flatten multiple columns at once rather than using UNPIVOT:
-- https://stackoverflow.com/questions/36798558/lateral-flatten-two-columns-without-repetition-in-snowflake
[
s.flow_1,
s.flow_2,
s.flow_3,
s.flow_4,
s.flow_5,
s.flow_6,
s.flow_7,
s.flow_8
][
lane.index
] as volume,
[
s.occupancy_1,
s.occupancy_2,
s.occupancy_3,
s.occupancy_4,
s.occupancy_5,
s.occupancy_6,
s.occupancy_7,
s.occupancy_8
][
lane.index
] as occupancy,
[
s.speed_1,
s.speed_2,
s.speed_3,
s.speed_4,
s.speed_5,
s.speed_6,
s.speed_7,
s.speed_8
][
lane.index
] as speed
from {{ source('clearinghouse', 'station_raw') }} as s,
lateral flatten([1, 2, 3, 4, 5, 6, 7, 8]) as lane
{% if target.name == 'dev' %}
where s.sample_date >= dateadd('day', 31, current_date())
{% endif %}
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{{ config(materialized="table") }}
{% set
date_re='clhouse/status/d\\\\d{2}/\\\\d{4}/\\\\d{2}/d(\\\\d{2})_tmdd_meta_(\\\\d{4})_(\\\\d{2})_(\\\\d{2}).xml'
%}
Expand All @@ -13,7 +14,7 @@ SELECT
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 4)::INT
) AS META_DATE,
REGEXP_SUBSTR(STATUS.FILENAME, '{{ date_re }}', 1, 1, '', 1)::INT AS DISTRICT,
XMLGET(STATUS.CONTENT, 'station-id'):"$" AS STATION_ID,
XMLGET(STATUS.CONTENT, 'station-id'):"$"::VARCHAR AS STATION_ID,
XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-id'):"$"::VARCHAR AS DETECTOR_ID,
XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-name'):"$"::VARCHAR AS DETECTOR_NAME,
XMLGET(XMLGET(DETECTOR.VALUE, 'detector'), 'detector-status'):"$"::VARCHAR AS DETECTOR_STATUS,
Expand Down

0 comments on commit 95c7a5f

Please sign in to comment.