Skip to content

Commit

Permalink
Temporarily disable saving clean tags to disk (#2557)
Browse files: browse the repository at this point in the history
  • Loading branch information
obulat authored Jul 5, 2023
1 parent ba2081e commit 680122f
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ def _clean_data_worker(rows, temp_table, sources_config, all_fields: list[str]):
update_field_expressions = []
for field, clean_value in cleaned_data.items():
update_field_expressions.append(f"{field} = {clean_value}")
# Save cleaned values for later
# (except for tags, which take up too much space)
if field == "tags":
continue
cleaned_values[field].append((identifier, clean_value))

if len(update_field_expressions) > 0:
Expand All @@ -272,6 +276,9 @@ def save_cleaned_data(result: dict) -> dict[str, int]:

cleanup_counts = {field: len(items) for field, items in result.items()}
for field, cleaned_items in result.items():
# Skip the tag field because the file is too large and fills up the disk
if field == "tag":
continue
if cleaned_items:
with open(f"{field}.tsv", "a") as f:
csv_writer = csv.writer(f, delimiter="\t")
Expand Down

0 comments on commit 680122f

Please sign in to comment.