From d71440ccc74baa9b4cbedc554a221dc966984038 Mon Sep 17 00:00:00 2001
From: Laurens de Bruin <96109694+laurens88@users.noreply.github.com>
Date: Mon, 8 Apr 2024 17:26:14 +0200
Subject: [PATCH] Add Sample Datatool (#41)

Adds a new tool to datatools that samples old, new, and random records from a dataset.
---
 README.md                               | 12 ++++-
 asreviewcontrib/datatools/entrypoint.py |  8 ++-
 asreviewcontrib/datatools/sample.py     | 70 +++++++++++++++++++++++++
 tests/demo_data/sample_data.csv         |  7 +++
 tests/test_sample.py                    | 17 ++++++
 5 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 asreviewcontrib/datatools/sample.py
 create mode 100644 tests/demo_data/sample_data.csv
 create mode 100644 tests/test_sample.py

diff --git a/README.md b/README.md
index eeb3218..7f108ac 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ LAB](https://github.com/asreview/asreview) that can be used to:
 - [**Deduplicate**](#data-dedup) data
 - [**Stack**](#data-vstack-experimental) multiple datasets
 - [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets
-- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations.
+- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations
+- [**Sample**](#sample) old, random, and new records to check whether terminology has changed over time
 
 Several [tutorials](Tutorials.md) are available that show how
 `ASReview-Datatools` can be used in different scenarios.
@@ -288,6 +289,15 @@ One thing to note is that OpenAlex will handle data requests faster if the sende
 ```
 asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com
 ```
+## Sample
+
+This datatool samples the oldest, the newest, and a random selection of records from your dataset via the `asreview data sample` command, and writes the sampled records to an output file. This can be useful for detecting concept drift, i.e. a change over time in the words used for certain concepts. The dataset is expected to contain a column named `publication_year`; a different column can be passed with the `--year_column` option. For example:
+
+```bash
+asreview data sample input_dataset.xlsx output_dataset.xlsx 50
+```
+This samples the `50` oldest and the `50` newest records from `input_dataset.xlsx`, plus `50` records drawn at random (without overlap with the old and new partitions!). The resulting 150 records are written to `output_dataset.xlsx`.
+
 ## License
 
 This extension is published under the [MIT license](/LICENSE).
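For reviewers who want the selection logic at a glance before reading the diff of `sample.py` below, here is a minimal pandas sketch of what `asreview data sample` does. The file names and the `nr_records` value are placeholders; the actual implementation below additionally validates its input and reads/writes through ASReview's data loaders.

```python
import pandas as pd

nr_records = 50  # placeholder for the positional CLI argument
df = pd.read_csv("input_dataset.csv")  # placeholder input file

# The old and new partitions come from the records that have a year.
dated = df[df["publication_year"].notnull()].sort_values("publication_year")
old_records = dated.head(nr_records)
new_records = dated.tail(nr_records)

# The random partition is drawn from all remaining records (including
# undated ones), so the three partitions never overlap.
excluded = pd.concat([old_records, new_records]).index
random_records = df[~df.index.isin(excluded)].sample(nr_records)

pd.concat([old_records, random_records, new_records]).to_csv(
    "output_dataset.csv", index=False
)
```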
diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index 562bea7..647bc6a 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -10,12 +10,14 @@
 from asreviewcontrib.datatools.convert import convert
 from asreviewcontrib.datatools.describe import _parse_arguments_describe
 from asreviewcontrib.datatools.describe import describe
+from asreviewcontrib.datatools.sample import _parse_arguments_sample
+from asreviewcontrib.datatools.sample import sample
 from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
 from asreviewcontrib.datatools.snowball import snowball
 from asreviewcontrib.datatools.stack import _parse_arguments_vstack
 from asreviewcontrib.datatools.stack import vstack
 
-DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball"]
+DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
 
 
 class DataEntryPoint(BaseEntryPoint):
@@ -104,6 +106,10 @@ def execute(self, argv):
                 args_snowballing_parser = _parse_arguments_snowball()
                 args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:]))
                 snowball(**args_snowballing)
+            if argv[0] == "sample":
+                args_sample_parser = _parse_arguments_sample()
+                args_sample = vars(args_sample_parser.parse_args(argv[1:]))
+                sample(**args_sample)
             if argv[0] == "vstack":
                 args_vstack_parser = _parse_arguments_vstack()
                 args_vstack = args_vstack_parser.parse_args(argv[1:])
diff --git a/asreviewcontrib/datatools/sample.py b/asreviewcontrib/datatools/sample.py
new file mode 100644
index 0000000..a9d253f
--- /dev/null
+++ b/asreviewcontrib/datatools/sample.py
@@ -0,0 +1,70 @@
+import argparse
+
+import pandas as pd
+from asreview import ASReviewData
+from asreview.data.base import load_data
+
+
+def sample(input_path, output_path, nr_records, year_column="publication_year"):
+    df_input = load_data(input_path).df
+
+    # Check that the year column is present
+    if year_column not in df_input.columns:
+        raise ValueError(f"• The input file should have a {year_column} column.")
+
+    # Check that nr_records is not too large
+    if nr_records * 3 > len(df_input):
+        raise ValueError(
+            f"• The number of records to sample is too large. "
+            f"Only {len(df_input)} records are present in the input file."
+            f" You are trying to sample {nr_records * 3} records."
+        )
+
+    if nr_records < 1:
+        raise ValueError("• The number of records to sample should be at least 1.")
+
+    # Keep only the records that have a publication year
+    dated_records = df_input[df_input[year_column].notnull()]
+
+    if dated_records.empty:
+        raise ValueError(f"• The input file has no {year_column} values.")
+
+    if len(dated_records) < nr_records * 2:
+        raise ValueError("• Not enough dated records to sample from.")
+
+    sorted_records = dated_records.sort_values(year_column, ascending=True)
+
+    # Take the nr_records oldest and the nr_records newest records
+    old_records = sorted_records.head(nr_records)
+    new_records = sorted_records.tail(nr_records)
+
+    # Sample nr_records records without overlap with the old/new partitions
+    records_to_exclude = pd.concat([old_records, new_records]).index
+    remaining_records = df_input[~df_input.index.isin(records_to_exclude)]
+
+    sampled_records = remaining_records.sample(nr_records)
+
+    # Combine the old, randomly sampled, and new records
+    df_out = pd.concat([old_records, sampled_records, new_records])
+
+    asdata = ASReviewData(df=df_out)
+    asdata.to_file(output_path)
+
+
+def _parse_arguments_sample():
+    parser = argparse.ArgumentParser(prog="asreview data sample")
+    parser.add_argument("input_path", type=str, help="The input file path.")
+    parser.add_argument("output_path", type=str, help="The output file path.")
+    parser.add_argument(
+        "nr_records",
+        type=int,
+        help="The number of records to sample for each of old, random, and new.",
+    )
+    parser.add_argument(
+        "--year_column",
+        default="publication_year",
+        type=str,
+        help="The name of the column containing the publication year.",
+    )
+
+    return parser
diff --git a/tests/demo_data/sample_data.csv b/tests/demo_data/sample_data.csv
new file mode 100644
index 0000000..2019c34
--- /dev/null
+++ b/tests/demo_data/sample_data.csv
@@ -0,0 +1,7 @@
+title, doi, publication_year
+title1, doi1, 2005
+title2, doi2, 2001
+title3, doi3,
+title4, doi4, 2003
+title5, doi5, 2004
+title6, doi6, 2000
\ No newline at end of file
diff --git a/tests/test_sample.py b/tests/test_sample.py
new file mode 100644
index 0000000..19d2c8c
--- /dev/null
+++ b/tests/test_sample.py
@@ -0,0 +1,17 @@
+# Unit tests for sample.py
+from pathlib import Path
+
+import pandas as pd
+
+from asreviewcontrib.datatools.sample import sample
+
+INPUT_DIR = Path(__file__).parent / "demo_data" / "sample_data.csv"
+
+
+def test_sample(tmpdir):
+    sample(INPUT_DIR, tmpdir / "output.csv", 1, "publication_year")
+    df = pd.read_csv(tmpdir / "output.csv")
+    assert len(df) == 3
+    assert "publication_year" in df.columns
+    assert df.iloc[0]["publication_year"] == 2000
+    assert df.iloc[2]["publication_year"] == 2005
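The new test module covers only the happy path. Not part of this patch, but a sketch of how the validation errors raised by `sample` could be covered with pytest against the same demo file (test names are illustrative):

```python
from pathlib import Path

import pytest

from asreviewcontrib.datatools.sample import sample

INPUT_DIR = Path(__file__).parent / "demo_data" / "sample_data.csv"


def test_sample_too_many_records(tmpdir):
    # The demo file holds 6 records, so 3 per partition (9 in total) must fail.
    with pytest.raises(ValueError):
        sample(INPUT_DIR, tmpdir / "output.csv", 3, "publication_year")


def test_sample_zero_records(tmpdir):
    # Fewer than one record per partition is rejected.
    with pytest.raises(ValueError):
        sample(INPUT_DIR, tmpdir / "output.csv", 0, "publication_year")
```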