Skip to content

Commit

Permalink
Update group query to be consistent between runs
Browse files Browse the repository at this point in the history
- avoid repo clogging up with variations on same parquet
  • Loading branch information
ajparsons committed Oct 13, 2023
1 parent c1686f0 commit 87a3127
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 15 deletions.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
title: Register of Members Interests (2000-)
description: Register of members interests with basic NLP extraction
custom:
row_count: 114243
row_count: 114372
path: register_of_interests.parquet
name: register_of_interests
profile: data-resource
Expand All @@ -22,7 +22,7 @@ schema:
description: Name of member
constraints:
unique: false
example: A J Beith
example: Aaron Bell
- name: category_name
type: string
description: Category of interest
Expand Down Expand Up @@ -77,4 +77,4 @@ schema:
constraints:
unique: false
example: ''
hash: aa9de9d58878cb7eddefcb009928ac8c
hash: c44f3dc4b0411ff1ee6fe7580b458cdf
Binary file modified data/data_packages/parliament_2019/register_of_interests.parquet
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
title: Register of Members Interests (latest register)
description: Register of members interests with basic NLP extraction
custom:
row_count: 17673
row_count: 17802
path: register_of_interests.parquet
name: register_of_interests
profile: data-resource
Expand Down Expand Up @@ -97,4 +97,4 @@ schema:
constraints:
unique: false
example: ''
hash: f6137e1b8ad79fd9a8cc63c42699ac51
hash: 6e5d4876fe1dae7abba816d2193ba93b
Binary file modified data/interim/nlp.cache.parquet
Binary file not shown.
Binary file modified data/interim/processed_regmem.parquet
Binary file not shown.
47 changes: 37 additions & 10 deletions src/parl_register_interests/sql/group_same_entry.sql
Original file line number Diff line number Diff line change
@@ -1,10 +1,37 @@
-- One output row per (public_whip_id, category_name, free_text), carrying the
-- first and last registry_date on which that entry was declared.
-- NOTE(review): LAST() presumably returns the name from whichever row of the
-- group happens to be read last, so member_name may vary with input order --
-- confirm against the engine's aggregate documentation.
SELECT
    src.public_whip_id,
    src.category_name,
    src.free_text,
    LAST(src.member_name) AS member_name,
    MIN(src.registry_date) AS earliest_declaration,
    MAX(src.registry_date) AS latest_declaration
FROM {{ parquet_path }} AS src
GROUP BY
    src.public_whip_id,
    src.category_name,
    src.free_text
ORDER BY
    src.public_whip_id,
    latest_declaration
-- Collapse repeated declarations of the same entry into one row per
-- (public_whip_id, category_name, free_text), recording the earliest and
-- latest registry_date on which that entry appears.
--
-- We want the same name on every entry for the same person, but the source
-- is not like that. So we take the name from each person's most recent
-- declaration and join it back onto the original data.
WITH last_member_names AS (
    -- Exactly one (public_whip_id, member_name) row per person.
    SELECT
        ranked.public_whip_id,
        ranked.member_name
    FROM (
        SELECT
            public_whip_id,
            member_name,
            -- registry_date alone is not a total order: if several rows share
            -- a person's latest date but spell the name differently, the row
            -- numbered 1 would be arbitrary. The member_name tie-breaker makes
            -- the choice repeatable between runs; NULL names sort last so a
            -- real name wins when one exists.
            ROW_NUMBER() OVER (
                PARTITION BY public_whip_id
                ORDER BY registry_date DESC, member_name ASC NULLS LAST
            ) AS rn
        FROM {{ parquet_path }}
    ) AS ranked
    WHERE ranked.rn = 1
)

SELECT
    t.public_whip_id AS public_whip_id,
    t.category_name AS category_name,
    t.free_text AS free_text,
    -- Already a grouping key (one value per person), so no aggregate is needed.
    lm.member_name AS member_name,
    MIN(t.registry_date) AS earliest_declaration,
    MAX(t.registry_date) AS latest_declaration
FROM {{ parquet_path }} AS t
-- NOTE(review): an equality join never matches NULL keys, so rows with a NULL
-- public_whip_id are dropped here. Confirm that is intended.
INNER JOIN last_member_names AS lm
    ON t.public_whip_id = lm.public_whip_id
GROUP BY
    t.public_whip_id,
    lm.member_name,
    t.category_name,
    t.free_text
-- Sort on every grouping key so the written row order is fully determined.
ORDER BY
    t.public_whip_id,
    latest_declaration,
    t.category_name,
    t.free_text,
    lm.member_name,
    earliest_declaration;

0 comments on commit 87a3127

Please sign in to comment.