diff --git a/data/data_packages/all_time_register/register_of_interests.parquet b/data/data_packages/all_time_register/register_of_interests.parquet
index 50205409..d45cef8b 100644
Binary files a/data/data_packages/all_time_register/register_of_interests.parquet and b/data/data_packages/all_time_register/register_of_interests.parquet differ
diff --git a/data/data_packages/all_time_register/register_of_interests.resource.yaml b/data/data_packages/all_time_register/register_of_interests.resource.yaml
index 42b5fc17..056b8714 100644
--- a/data/data_packages/all_time_register/register_of_interests.resource.yaml
+++ b/data/data_packages/all_time_register/register_of_interests.resource.yaml
@@ -1,7 +1,7 @@
 title: Register of Members Interests (2000-)
 description: Register of members interests with basic NLP extraction
 custom:
-  row_count: 114243
+  row_count: 114372
 path: register_of_interests.parquet
 name: register_of_interests
 profile: data-resource
@@ -22,7 +22,7 @@ schema:
     description: Name of member
     constraints:
       unique: false
-    example: A J Beith
+    example: Aaron Bell
   - name: category_name
     type: string
     description: Category of interest
@@ -77,4 +77,4 @@ schema:
     constraints:
       unique: false
     example: ''
-hash: aa9de9d58878cb7eddefcb009928ac8c
+hash: c44f3dc4b0411ff1ee6fe7580b458cdf
diff --git a/data/data_packages/parliament_2019/register_of_interests.parquet b/data/data_packages/parliament_2019/register_of_interests.parquet
index dad82c3b..157de597 100644
Binary files a/data/data_packages/parliament_2019/register_of_interests.parquet and b/data/data_packages/parliament_2019/register_of_interests.parquet differ
diff --git a/data/data_packages/parliament_2019/register_of_interests.resource.yaml b/data/data_packages/parliament_2019/register_of_interests.resource.yaml
index c515699e..14efb16f 100644
--- a/data/data_packages/parliament_2019/register_of_interests.resource.yaml
+++ b/data/data_packages/parliament_2019/register_of_interests.resource.yaml
@@ -1,7 +1,7 @@
 title: Register of Members Interests (latest register)
 description: Register of members interests with basic NLP extraction
 custom:
-  row_count: 17673
+  row_count: 17802
 path: register_of_interests.parquet
 name: register_of_interests
 profile: data-resource
@@ -97,4 +97,4 @@ schema:
     constraints:
       unique: false
     example: ''
-hash: f6137e1b8ad79fd9a8cc63c42699ac51
+hash: 6e5d4876fe1dae7abba816d2193ba93b
diff --git a/data/interim/nlp.cache.parquet b/data/interim/nlp.cache.parquet
index e57ddc35..95f83c8d 100644
Binary files a/data/interim/nlp.cache.parquet and b/data/interim/nlp.cache.parquet differ
diff --git a/data/interim/processed_regmem.parquet b/data/interim/processed_regmem.parquet
index 50205409..d45cef8b 100644
Binary files a/data/interim/processed_regmem.parquet and b/data/interim/processed_regmem.parquet differ
diff --git a/src/parl_register_interests/sql/group_same_entry.sql b/src/parl_register_interests/sql/group_same_entry.sql
index 29137e30..84918fe4 100644
--- a/src/parl_register_interests/sql/group_same_entry.sql
+++ b/src/parl_register_interests/sql/group_same_entry.sql
@@ -1,10 +1,49 @@
-select
-    public_whip_id,
-    category_name,
-    free_text,
-    last(member_name) as member_name,
-    min(registry_date) as earliest_declaration,
-    max(registry_date) as latest_declaration
-from {{ parquet_path }}
-group by public_whip_id, category_name, free_text
-order by public_whip_id, latest_declaration
\ No newline at end of file
+-- The source data can spell the same member's name differently across registers,
+-- so every row for a person is relabelled with one canonical name: the
+-- member_name on that person's most recent registry_date.
+WITH ranked_names AS (
+    -- Rank each member's rows newest-first. member_name is a tie-breaker so the
+    -- chosen name is deterministic when several names share the latest date.
+    SELECT
+        public_whip_id,
+        member_name,
+        ROW_NUMBER() OVER (
+            PARTITION BY public_whip_id
+            ORDER BY registry_date DESC, member_name NULLS LAST
+        ) AS rn
+    FROM {{ parquet_path }}
+),
+
+last_member_names AS (
+    -- One row per public_whip_id: the top-ranked name.
+    SELECT
+        public_whip_id,
+        member_name
+    FROM ranked_names
+    WHERE rn = 1
+)
+
+-- Collapse repeated declarations of the same entry (same member, category and
+-- free text) into one row spanning its earliest and latest registry_date.
+SELECT
+    t.public_whip_id AS public_whip_id,
+    t.category_name AS category_name,
+    t.free_text AS free_text,
+    lm.member_name AS member_name,
+    MIN(t.registry_date) AS earliest_declaration,
+    MAX(t.registry_date) AS latest_declaration
+FROM {{ parquet_path }} AS t
+INNER JOIN last_member_names AS lm
+    ON t.public_whip_id = lm.public_whip_id
+GROUP BY
+    t.public_whip_id,
+    lm.member_name,
+    t.category_name,
+    t.free_text
+ORDER BY
+    t.public_whip_id,
+    latest_declaration,
+    t.category_name,
+    t.free_text,
+    lm.member_name,
+    earliest_declaration;