From 2f7bc48f3d6f830e7b6193f383ba667e12bcb89a Mon Sep 17 00:00:00 2001 From: Jigsaw Date: Wed, 11 Dec 2024 09:42:50 -0800 Subject: [PATCH] Internal change GitOrigin-RevId: 4607c9bbbea5e5d4afa7f15f12a85d1d0ac2efd2 --- bin/process_polis_data.py | 97 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100755 bin/process_polis_data.py diff --git a/bin/process_polis_data.py b/bin/process_polis_data.py new file mode 100755 index 0000000..3640e6e --- /dev/null +++ b/bin/process_polis_data.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +import pandas as pd +import argparse as arg + + +print("starting program") + +# argparse setup with arguments for two input files +def getargs(): + parser = arg.ArgumentParser(description="Process Polis data from the openData export data.") + parser.add_argument("export_directory", help="Path to export directory.") + parser.add_argument("--participants-votes", help="Participants votes file (override).") + parser.add_argument("--comments", help="Path to the comments file (override).") + parser.add_argument("-o", "--output_file", help="Path to the output CSV file.", required=True) + args = parser.parse_args() + args.participants_votes = args.participants_votes or f"{args.export_directory}/participants-votes.csv" + args.comments = args.comments or f"{args.export_directory}/comments.csv" + return args + + +print("processing args") +args = getargs() + +# Read the CSV files into pandas DataFrames +try: + votes = pd.read_csv(args.participants_votes) + comments = pd.read_csv(args.comments) +except FileNotFoundError as e: + print(f"Error: One or both input files not found: {e}") + exit(1) +except pd.errors.EmptyDataError as e: + print(f"Error: One or both input files are empty: {e}") + exit(1) +except pd.errors.ParserError as e: + print(f"Error parsing CSV file: {e}") + exit(1) + +print("args processed") + +# make sure to cast comment ids as ints +comments['comment-id'] = comments['comment-id'].astype(int) + + +# filter out votes rows where group-id is nan, and make ints +votes = votes[votes['group-id'].notna()] +votes['group-id'] = votes['group-id'].astype(int) +group_ids = votes['group-id'].unique() + +# prompt: find all of the column names in the votes df that match a numeric regex +import re +comment_ids = [col for col in votes.columns if re.match(r'^\d+$', col)] +print(comment_ids) + + +# Melt the DataFrame +melted_votes = votes.melt(id_vars=["group-id"], value_vars=comment_ids, var_name='comment-id', value_name='value') +melted_votes['comment-id'] = melted_votes['comment-id'].astype(int) +print(melted_votes) +# Group, count, unstack, and fill missing values +result = ( + melted_votes.groupby(['comment-id','group-id'])['value'] + .value_counts() + .unstack(fill_value=0) + .reset_index() +) + +# Rename columns +result = result.rename(columns={-1: 'disagree-count', 0: 'pass-count', 1: 'agree-count'}) + +# pivot out the group-id column so that each of the vote count columns look like "group-N-VOTE-count" +pivoted = result.pivot(index="comment-id", columns='group-id') + +# ...? +for_merge = pd.DataFrame({'comment-id': pivoted['disagree-count'][0.0].index}) +for group_id in group_ids: + for count_col in ["disagree-count", "pass-count", "agree-count"]: + for_merge["group-" + str(group_id) + "-" + count_col] = pivoted[count_col][group_id] + +# zero out total vote tallies since incorrect from filtering or database caching +comments["agrees"] = 0 +comments["disagrees"] = 0 +comments["passes"] = 0 + +# merge in the per group tallies above +comments = comments.merge(for_merge, on='comment-id') + +# add up from the votes matrix for consistency +for group_id in group_ids: + comments["disagrees"] += comments["group-" + str(group_id) + "-disagree-count"] + comments["agrees"] += comments["group-" + str(group_id) + "-agree-count"] + comments["passes"] += comments["group-" + str(group_id) + "-pass-count"] + +# prompt: write out to a CSV file +comments = comments.rename(columns={'comment-body': 'comment_text'}) +comments.to_csv(args.output_file, index=False) +