Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WiP #1

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ node_modules

# CDK
cdk.out

# Python
**/__pycache__/**
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ Use [`uv`](https://github.com/astral-sh/uv) or [`pip-tools`](https://github.com/
Commands given using `uv`. See [`uv` equivalence]() for comparison.

```shell
uv venv --prompt . .venv
uv pip sync requirements/dev.txt
PYENV_VERSION=3.10.14 uv venv --prompt . .venv
uv pip sync requirements/base.txt requirements/dev.txt
```

### Commit hooks
Expand Down Expand Up @@ -66,7 +66,7 @@ npm install

#### dev dependencies
* Add a dependency to `requirements/dev.in`
* Run `uv pip compile --generate-hashes --output-file requirements/dev.txt requirements/base.in requirements/dev.in`
* Run `uv pip compile --generate-hashes --output-file requirements/dev.txt requirements/dev.in`


## CDK Setup
Expand Down
58 changes: 58 additions & 0 deletions cdk/lambdas/partition-addressbase-by-first-letter/handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import boto3


def handler(event, context):
    """Start the AddressBase partitioning Athena query for an S3 upload event.

    The event must contain exactly one record, originating from S3, whose
    event name starts with ``ObjectCreated``. Any other event is ignored and
    a dict with an explanatory ``"message"`` is returned instead.

    Args:
        event: Lambda event payload; must have a ``"Records"`` list.
        context: Lambda context object (unused).

    Returns:
        The ``start_query_execution`` response from Athena, or a
        ``{"message": ...}`` dict when the event is not acted upon.

    Raises:
        LookupError: If the named query cannot be found in the workgroup.
    """
    if len(event["Records"]) != 1:
        return {"message": "More than one object creation event sent"}
    record = event["Records"][0]
    if record["eventSource"] != "aws:s3":
        return {"message": "Not an s3 Event"}

    if not record["eventName"].startswith("ObjectCreated"):
        return {"message": "Not an object creation event"}

    # TODO: validate the source bucket and object key before running the
    # query. In an S3 event record these live at
    # record["s3"]["bucket"]["name"] and record["s3"]["object"]["key"].

    athena_client = boto3.client("athena")
    workgroup = "dc-data-baker"
    database = "dc_data_baker"
    query_name = "partition-addressbase-cleaned-query"

    query = get_query(athena_client, query_name, workgroup)
    if query is None:
        # Fail loudly: a missing named query is a deployment problem, not an
        # event we can safely ignore.
        raise LookupError(
            f"Named query {query_name!r} not found in workgroup {workgroup!r}"
        )

    # start_query_execution takes the SQL text itself (QueryString); it has no
    # parameter for running a saved query by id, so pass the stored SQL.
    return athena_client.start_query_execution(
        QueryString=query["NamedQuery"]["QueryString"],
        QueryExecutionContext={"Database": database},
        WorkGroup=workgroup,
    )

# Update glue table description after query has run - this should probably be another lambda...


def get_query(client, name, workgroup):
    """Find a saved Athena named query by its name within a workgroup.

    ``list_named_queries`` returns results in pages, so every page is walked
    (following ``NextToken``) until a match is found or the listing ends.

    Args:
        client: Athena client exposing ``list_named_queries`` and
            ``get_named_query``.
        name: Exact name of the named query to look for.
        workgroup: Workgroup whose named queries are searched.

    Returns:
        The full ``get_named_query`` response for the first query whose
        ``NamedQuery.Name`` equals ``name``, or ``None`` if there is no match.
    """
    list_kwargs = {"WorkGroup": workgroup}
    while True:
        page = client.list_named_queries(**list_kwargs)
        for qid in page.get("NamedQueryIds", []):
            query = client.get_named_query(NamedQueryId=qid)
            if query["NamedQuery"]["Name"] == name:
                return query
        next_token = page.get("NextToken")
        if not next_token:
            # No further pages: the name does not exist in this workgroup.
            return None
        list_kwargs["NextToken"] = next_token


# {'NamedQuery': {'Name': 'partition-addressbase-cleaned-query',
# 'Database': 'dc_data_baker',
# 'QueryString': "\n UNLOAD (SELECT\n\tsplit_part(postcode, ' ', 1) as outcode,\n\tuprn,\n\taddress,\n\tpostcode,\n\tST_X(ST_GeometryFromText(split_part(location, ';', 2))) as longitude,\n\tST_Y(ST_GeometryFromText(split_part(location, ';', 2))) as latitude,\n\tsubstr(postcode, 1,1) as first_letter\nFROM addressbase_cleaned_raw\n\n) \n TO 's3://pollingstations.private.data/addressbase/testing/addressbase_partitioned/' \n WITH(\n format = 'PARQUET',\n compression = 'SNAPPY',\n partioned_by = ARRAY['first_letter'] \n )\n ",
# 'NamedQueryId': '3a08bb81-4049-4825-a952-aeaf1b94c6f1',
# 'WorkGroup': 'data-baker-workgroup'},
# 'ResponseMetadata': {'RequestId': '50e5e5bc-a7da-42b7-97d1-e46e7a5f4644',
# 'HTTPStatusCode': 200,
# 'HTTPHeaders': {'date': 'Fri, 26 Apr 2024 06:16:35 GMT',
# 'content-type': 'application/x-amz-json-1.1',
# 'content-length': '780',
# 'connection': 'keep-alive',
# 'x-amzn-requestid': '50e5e5bc-a7da-42b7-97d1-e46e7a5f4644'},
# 'RetryAttempts': 0}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
boto3
polars
40 changes: 40 additions & 0 deletions cdk/lambdas/partition-addressbase-by-first-letter/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This file was autogenerated by uv via the following command:
# uv pip compile --generate-hashes --output-file cdk/lambdas/partition-addressbase-by-first-letter/requirements.txt cdk/lambdas/partition-addressbase-by-first-letter/requirements.in
boto3==1.34.91 \
--hash=sha256:5077917041adaaae15eeca340289547ef905ca7e11516e9bd22d394fb5057d2a \
--hash=sha256:97fac686c47647db4b44e4789317e4aeecd38511d71e84f8d20abe33eb630ff1
botocore==1.34.91 \
--hash=sha256:4d1b13f2b1c28ce1743b1e5895ae62bb7e67f892b51882164ea19c27a130852b \
--hash=sha256:93ef7071292a1b2b9fc26537f8ae3a8227da1177969241939ea3fbdb1a1a1d0c
# via
# boto3
# s3transfer
jmespath==1.0.1 \
--hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \
--hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe
# via
# boto3
# botocore
polars==0.20.22 \
--hash=sha256:08ee57946f34e2de3ebfc7853d21a14eb92e3993e71d788a6aaaa0757e7bd3e2 \
--hash=sha256:15d8807828f9c3ddbab60b4aa17ea1dea7743a3dddebfd1c6186826257a687ca \
--hash=sha256:2f7b08e1725d1a7c522aa316304e8ddb835c69b579577249764c7fa4eeb97305 \
--hash=sha256:abc5da1f6f7e2ee15bdab74cd19939948a0910799b27ee3eb0768bb95f0e9aff \
--hash=sha256:ceeb767bb944605539db63c528fe074708f0e23ece2f78f3dfc5132ac2e84d64 \
--hash=sha256:d211aed6ae34845e1a9766e3b487f73ee9d5044927cc748f7498a72a5a0c8805
python-dateutil==2.9.0.post0 \
--hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
--hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
# via botocore
s3transfer==0.10.1 \
--hash=sha256:5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19 \
--hash=sha256:ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d
# via boto3
six==1.16.0 \
--hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
# via python-dateutil
urllib3==2.2.1 \
--hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \
--hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19
# via botocore
9 changes: 9 additions & 0 deletions cdk/queries/partition-addressbase-cleaned.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- Projects address rows with derived columns:
--   outcode      : the part of postcode before the first space
--   longitude/
--   latitude     : X/Y of the geometry parsed from the text after ';' in location
--   first_letter : first character of postcode
-- NOTE(review): $$from_table$$ looks like a template placeholder that is
-- substituted with a real table name before the query runs - confirm.
SELECT
split_part(postcode, ' ', 1) as outcode,
uprn,
address,
postcode,
ST_X(ST_GeometryFromText(split_part(location, ';', 2))) as longitude,
ST_Y(ST_GeometryFromText(split_part(location, ';', 2))) as latitude,
substr(postcode, 1,1) as first_letter
FROM $$from_table$$
80 changes: 40 additions & 40 deletions cdk/queries/uprn-to-ballots-first-letter.sql
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
SELECT
combined_results.uprn,
combined_results.address,
combined_results.postcode,
ARRAY_SORT(ARRAY_AGG(distinct combined_results.election_id)) AS ballot_ids,
combined_results.first_letter as first_letter
FROM (
SELECT
ab.uprn,
ab.address,
ab.postcode,
ab.first_letter,
cb.election_id
FROM
current_ballots cb
CROSS JOIN addressbase_partitioned ab
WHERE
ST_CONTAINS(
ST_POLYGON(cb.geometry),
ST_POINT(ab.longitude, ab.latitude)
)
AND cb.source_table = 'Organisation'
UNION
SELECT
ab.uprn,
ab.address,
ab.postcode,
ab.first_letter,
cb.election_id
FROM
current_ballots cb
CROSS JOIN addressbase_partitioned ab
WHERE
ST_CONTAINS(
ST_Polygon(cb.geometry),
ST_POINT(ab.longitude, ab.latitude)
)
AND cb.source_table = 'Division'
) AS combined_results
GROUP BY
combined_results.uprn, combined_results.address, combined_results.postcode, combined_results.first_letter
combined_results.uprn,
combined_results.address,
combined_results.postcode,
array_sort(array_agg(distinct combined_results.election_id)) AS ballot_ids,
combined_results.first_letter as first_letter
FROM (
SELECT
ab.uprn,
ab.address,
ab.postcode,
ab.first_letter,
cb.election_id
FROM
$$from_table$$ cb
CROSS JOIN addressbase_partitioned ab
WHERE
ST_CONTAINS(
ST_Polygon(cb.geometry),
ST_POINT(ab.longitude, ab.latitude)
)
AND cb.source_table = 'Organisation'
UNION
SELECT
ab.uprn,
ab.address,
ab.postcode,
ab.first_letter,
cb.election_id
FROM
$$from_table$$ cb
CROSS JOIN addressbase_partitioned ab
WHERE
ST_CONTAINS(
ST_Polygon(cb.geometry),
ST_POINT(ab.longitude, ab.latitude)
)
AND cb.source_table = 'Division'
) AS combined_results
GROUP BY
combined_results.uprn, combined_results.address, combined_results.postcode, combined_results.first_letter
Loading