Skip to content

Commit

Permalink
Prevent pandas from converting/formatting values
Browse files Browse the repository at this point in the history
  • Loading branch information
marius-mather committed Mar 25, 2024
1 parent 5327db2 commit c617458
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 3 deletions.
26 changes: 24 additions & 2 deletions src/scripts/find_dadis_local_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def full_local_match_workflow(
"dadis_update_date",
],
)
matched_breeds = clean_output(matched_breeds)
matched_breeds.to_csv(temp_out, sep="\t", index=False, header=False)
temp_out.close()
logger.info("Output written to temp file.")
Expand All @@ -61,8 +62,17 @@ def full_local_match_workflow(

def read_vbo_data(filename: str) -> pd.DataFrame:
    """
    Read the VBO breed table from a tab-separated file.

    The line directly below the header row is skipped. The ``obsolete`` and
    ``description_of_origin`` columns are read as plain strings so pandas
    does not reinterpret their contents (e.g. as booleans or numbers).

    :param filename: path of the TSV file to read
    :return: dataframe in which numeric columns use pandas' nullable
        dtypes, while object/string/boolean columns are left as parsed
    """
    return pd.read_table(
        filename,
        sep="\t",
        # Line 1 (0-based) is the row right after the header; it is not data.
        skiprows=[1],
        dtype={"obsolete": str, "description_of_origin": str},
        # NOTE(review): with keep_default_na left at its default, an empty
        # na_values list adds nothing to pandas' built-in NA markers, so
        # strings such as "NA" are still parsed as missing. Confirm whether
        # keep_default_na=False was intended.
        na_values=[],
        low_memory=False,
    ).convert_dtypes(
        # Only numeric columns are converted (so integer columns with
        # gaps become nullable integers rather than floats); text and
        # boolean-looking columns keep the dtype they were parsed with.
        infer_objects=False,
        convert_string=False,
        convert_boolean=False,
    )


Expand Down Expand Up @@ -155,6 +165,18 @@ def write_tsv_header(
csv_out.writerow(header)


def clean_output(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the dataframe that is ready to be written out.

    * Missing values (``None``/``NaN``) in object-dtype columns are
      replaced with empty strings
    * Columns of any other dtype are passed through unchanged

    The dataframe passed in is not modified; use the returned one.
    """
    # Work on a copy so the caller's frame is never altered as a side
    # effect, and so a frame that is a slice of another one cannot trigger
    # chained-assignment warnings.
    cleaned = df.copy()
    for column in cleaned.select_dtypes(include="object").columns:
        cleaned[column] = cleaned[column].fillna("")
    return cleaned


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Find DADIS entries matching VBO breeds"
Expand Down
24 changes: 23 additions & 1 deletion src/scripts/find_dadis_transboundary_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def full_matching_workflow(
output_file=temp_out,
extra_cols=["dadis_transboundary_id"],
)
matched_breeds = clean_output(matched_breeds)
# Write the actual data below the headers
matched_breeds.to_csv(temp_out, sep="\t", index=False, header=False)
temp_out.close()
Expand All @@ -48,7 +49,16 @@ def full_matching_workflow(


def read_vbo_data(filename: str) -> pd.DataFrame:
    """
    Read the VBO breed table from a tab-separated file.

    The line directly below the header row is skipped, and the ``obsolete``
    column is read as plain strings so pandas does not reinterpret its
    contents (e.g. as booleans).

    :param filename: path of the TSV file to read
    :return: dataframe in which numeric columns use pandas' nullable
        dtypes, while object/string/boolean columns are left as parsed
    """
    return pd.read_table(
        filename,
        # Line 1 (0-based) is the row right after the header; it is not data.
        skiprows=[1],
        # NOTE(review): with keep_default_na left at its default, an empty
        # na_values list adds nothing to pandas' built-in NA markers, so
        # strings such as "NA" are still parsed as missing. Confirm whether
        # keep_default_na=False was intended.
        na_values=[],
        dtype={"obsolete": str},
    ).convert_dtypes(
        # Only numeric columns are converted (so integer columns with
        # gaps become nullable integers rather than floats); text and
        # boolean-looking columns keep the dtype they were parsed with.
        infer_objects=False,
        convert_string=False,
        convert_boolean=False,
    )


Expand Down Expand Up @@ -215,6 +225,18 @@ def write_tsv_header(
csv_out.writerow(header)


def clean_output(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the dataframe that is ready to be written out.

    * Missing values (``None``/``NaN``) in object-dtype columns are
      replaced with empty strings
    * Columns of any other dtype are passed through unchanged

    The dataframe passed in is not modified; use the returned one.
    """
    # Work on a copy so the caller's frame is never altered as a side
    # effect, and so a frame that is a slice of another one cannot trigger
    # chained-assignment warnings.
    cleaned = df.copy()
    for column in cleaned.select_dtypes(include="object").columns:
        cleaned[column] = cleaned[column].fillna("")
    return cleaned


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Find DADIS entries matching VBO breeds"
Expand Down

0 comments on commit c617458

Please sign in to comment.