diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml index a9567293..ac7c28d7 100644 --- a/.github/workflows/checks.yaml +++ b/.github/workflows/checks.yaml @@ -39,7 +39,16 @@ jobs: load: true - name: Run prediction - run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest predict /data/fake_data.csv --out=/data/predictions.csv + run: docker run --rm -v "$(pwd)/.:/data" eyra-rank:latest /data/PreFer_fake_data.csv /data/PreFer_fake_background_data.csv --out=/data/predictions.csv + + - name: Build Docker scoring image + uses: docker/build-push-action@v4 + with: + context: . + file: python.Dockerfile + tags: eyra-rank:scoring + load: true - name: Run scoring - run: docker run --rm -v "$(pwd)/data:/data" eyra-rank:latest score /data/predictions.csv /data/fake_data_ground_truth.csv + # --entrypoint accepts only the executable; its arguments must follow the image name + run: docker run --rm -v "$(pwd):/data" --entrypoint conda eyra-rank:scoring run -n eyra-rank python /app/score.py /data/predictions.csv /data/PreFer_fake_data.csv diff --git a/.gitignore b/.gitignore index af67e4d1..822b2b4c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ .DS_Store .AppleDouble .LSOverride +__pycache__/ +.tool-versions diff --git a/environment.yml b/environment.yml index 65f3a303..753f6ff7 100644 --- a/environment.yml +++ b/environment.yml @@ -1,8 +1,9 @@ name: eyra-rank channels: - defaults + - conda-forge dependencies: - - pandas=1.5 - - scikit-learn=1.2 - - joblib=1.1 - - matplotlib=3.7 + - pandas=2.2.1 + - scikit-learn=1.4.1.post1 + - joblib=1.3.2 + - matplotlib=3.8.3 \ No newline at end of file diff --git a/model.joblib b/model.joblib index 56cd6c46..21a679bd 100644 Binary files a/model.joblib and b/model.joblib differ diff --git a/model.rds b/model.rds index aa96beb4..5a37c01e 100644 Binary files a/model.rds and b/model.rds differ diff --git a/python.Dockerfile b/python.Dockerfile index ef32f759..8978711c 100644 --- a/python.Dockerfile +++ b/python.Dockerfile @@ -4,10 +4,11 @@ COPY environment.yml / RUN conda env create -f 
/environment.yml RUN mkdir /app +WORKDIR /app -COPY data /data -COPY *.py / -COPY *.joblib / +COPY *.csv /app +COPY *.py /app +COPY *.joblib /app -ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/run.py"] +ENTRYPOINT ["conda", "run", "-n", "eyra-rank", "python", "/app/run.py"] CMD ["predict", "/data/fake_data.csv"] \ No newline at end of file diff --git a/run.R b/run.R index 1ffa7c78..e8dce752 100644 --- a/run.R +++ b/run.R @@ -16,32 +16,19 @@ source("submission.R") print_usage <- function() { cat("Usage:\n") - cat(" Rscript script.R predict INPUT_FILE [--output OUTPUT_FILE]\n") - cat(" Rscript script.R score --prediction PREDICTION_FILE --ground_truth GROUND_TRUTH_FILE [--output OUTPUT_FILE]\n") + cat(" Rscript script.R DATA_FILE BACKGROUND_DATA_FILE [--output OUTPUT_FILE]\n") } parse_arguments <- function() { args <- list() command_args <- commandArgs(trailingOnly = TRUE) - if (length(command_args) > 0) { - args$command <- command_args[1] + if (length(command_args) < 2) { + return(args) + } - if (is.null(args$command)) { - stop("Error: No command provided.") - } - - if (args$command == "predict") { - args$input <- commandArgs(trailingOnly = TRUE)[2] - args$output <- get_argument("--output") - } else if (args$command == "score") { - args$prediction <- get_argument("--prediction") - args$ground_truth <- get_argument("--ground_truth") - args$output <- get_argument("--output") - } - } else { - stop("Error: No command provided. 
Run the script with predict or score.") - } - + args$data <- commandArgs(trailingOnly = TRUE)[1] + args$background_data <- commandArgs(trailingOnly = TRUE)[2] + args$output <- get_argument("--output") return(args) } @@ -56,41 +43,25 @@ get_argument <- function(arg_name) { } parse_and_run_predict <- function(args) { - if (is.null(args$input)) { - stop("Error: Please provide --input argument for prediction.") + if (is.null(args$data)||is.null(args$background_data)) { + stop("Error: Please provide data and background_data argument for prediction.") } - cat("Processing input data for prediction from:", args$input, "\n") + cat("Processing input data for prediction from:", args$data, " ", args$background_data, "\n") if (!is.null(args$output)) { cat("Output will be saved to:", args$output, "\n") } - run_predict(args$input, args$output) -} - -run_score <- function(args) { - if (is.null(args$prediction) || is.null(args$ground_truth)) { - stop("Error: Please provide --prediction and --ground_truth arguments for scoring.") - } - - cat("Scoring predictions from:", args$prediction, "\n") - cat("Ground truth data from:", args$ground_truth, "\n") - if (!is.null(args$output)) { - cat("Evaluation score will be saved to:", args$output, "\n") - } - # Call your submission function for scoring here + run_predict(args$data, args$background_data, args$output) } -run_predict <- function(input_path, output=NULL) { +run_predict <- function(data_path, background_data_path, output=NULL) { if (is.null(output)) { output <- stdout() } + df <- read.csv(data_path, encoding="latin1") + background_df <- read.csv(background_data_path, encoding="latin1") - - # Read data from input file - df <- read.csv(input_path, encoding="latin1") - - # Make predictions - predictions <- predict_outcomes(df) # Assuming predict_outcomes is a function in the submission package + predictions <- predict_outcomes(df, background_df) # Check if predictions have the required format stopifnot(ncol(predictions) == 2, @@ 
-105,13 +76,7 @@ run_predict <- function(input_path, output=NULL) { main <- function() { args <- parse_arguments() - if (args$command == "predict") { - parse_and_run_predict(args) - } else if (args$command == "score") { - run_score(args) - } else { - stop("Error: Invalid command. Use 'predict' or 'score'.") - } + parse_and_run_predict(args) } # Call main function diff --git a/run.py b/run.py index 50d0ba54..92369b34 100644 --- a/run.py +++ b/run.py @@ -20,31 +20,21 @@ import pandas as pd import submission -parser = argparse.ArgumentParser(description="Process and score data.") -subparsers = parser.add_subparsers(dest="command") +parser = argparse.ArgumentParser(description="Process data.") -# Process subcommand -process_parser = subparsers.add_parser( - "predict", help="Process input data for prediction." +parser.add_argument("data_path", help="Path to data CSV file.") +parser.add_argument( + "background_data_path", help="Path to background data CSV file." ) -process_parser.add_argument("input_path", help="Path to input data CSV file.") -process_parser.add_argument("--output", help="Path to prediction output CSV file.") - -# Score subcommand -score_parser = subparsers.add_parser("score", help="Score (evaluate) predictions.") -score_parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.") -score_parser.add_argument( - "ground_truth_path", help="Path to ground truth outcome CSV file." -) -score_parser.add_argument("--output", help="Path to evaluation score output CSV file.") +parser.add_argument("--output", help="Path to prediction output CSV file.") args = parser.parse_args() -def predict(input_path, output): +def predict(data_path, background_data_path, output): """Predict Score (evaluate) the predictions and write the metrics. - This function takes the path to an input CSV file containing the input data. + This function takes the paths to the data CSV file and the background data CSV file. 
It calls submission.py clean_df and predict_outcomes writes the predictions to a new output CSV file. @@ -53,10 +43,17 @@ def predict(input_path, output): if output is None: output = sys.stdout - df = pd.read_csv( - input_path, encoding="latin-1", encoding_errors="replace", low_memory=False + data_df = pd.read_csv( + data_path, encoding="latin-1", encoding_errors="replace", low_memory=False + ) + background_data_df = pd.read_csv( + background_data_path, + encoding="latin-1", + encoding_errors="replace", + low_memory=False, ) - predictions = submission.predict_outcomes(df) + + predictions = submission.predict_outcomes(data_df, background_data_df) assert ( predictions.shape[1] == 2 ), "Predictions must have two columns: nomem_encr and prediction" @@ -131,11 +128,4 @@ def score(prediction_path, ground_truth_path, output): if __name__ == "__main__": args = parser.parse_args() - if args.command == "predict": - predict(args.input_path, args.output) - elif args.command == "score": - score(args.prediction_path, args.ground_truth_path, args.output) - else: - parser.print_help() - predict(args.input_path, args.output) - sys.exit(1) + predict(args.data_path, args.background_data_path, args.output) diff --git a/score.py b/score.py new file mode 100644 index 00000000..5d7ce735 --- /dev/null +++ b/score.py @@ -0,0 +1,97 @@ +""" +This script scores predictions against the ground truth outcomes and writes +accuracy, precision, recall and F1 score as CSV. + +To score your predictions use the following command: + +python score.py PREDICTION_FILE GROUND_TRUTH_FILE [--output OUTPUT_FILE] + +For example: + +python score.py predictions.csv PreFer_fake_data.csv + +Both files are matched on nomem_encr; the ground truth file must contain the +new_child column and the predictions file the prediction column. 
+ +""" + +import sys +import argparse +import pandas as pd +import submission + +parser = argparse.ArgumentParser(description="Score data.") +# Predictions to evaluate +parser.add_argument("prediction_path", help="Path to predicted outcome CSV file.") +# Ground truth to evaluate against +parser.add_argument("ground_truth_path", help="Path to ground truth outcome CSV file.") +# Optional output path; metrics go to stdout when omitted +parser.add_argument("--output", help="Path to evaluation score output CSV file.") + +args = parser.parse_args() + + +def score(prediction_path, ground_truth_path, output): + """Score (evaluate) the predictions and write the metrics. + + This function takes the path to a CSV file containing predicted outcomes and the + path to a CSV file containing the ground truth outcomes. It calculates the overall + prediction accuracy, and precision, recall, and F1 score for having a child + and writes these scores to a new output CSV file. + + This function should not be modified. + """ + + if output is None: + output = sys.stdout + # Load predictions and ground truth into dataframes + predictions_df = pd.read_csv(prediction_path) + ground_truth_df = pd.read_csv(ground_truth_path) + + # Right-merge on 'nomem_encr' so that every ground truth row is kept + merged_df = pd.merge(predictions_df, ground_truth_df, on="nomem_encr", how="right") + + # Calculate accuracy + accuracy = len(merged_df[merged_df["prediction"] == merged_df["new_child"]]) / len( + merged_df + ) + + # Calculate true positives, false positives, and false negatives + true_positives = len( + merged_df[(merged_df["prediction"] == 1) & (merged_df["new_child"] == 1)] + ) + false_positives = len( + merged_df[(merged_df["prediction"] == 1) & (merged_df["new_child"] == 0)] + ) + false_negatives = len( + merged_df[(merged_df["prediction"] == 0) & (merged_df["new_child"] == 1)] + ) + + # Calculate precision, recall, and F1 score + try: + precision = true_positives / (true_positives + false_positives) + except ZeroDivisionError: + precision = 0 + try: + recall = 
true_positives / (true_positives + false_negatives) + except ZeroDivisionError: + recall = 0 + try: + f1_score = 2 * (precision * recall) / (precision + recall) + except ZeroDivisionError: + f1_score = 0 + # Write metric output to a new CSV file + metrics_df = pd.DataFrame( + { + "accuracy": [accuracy], + "precision": [precision], + "recall": [recall], + "f1_score": [f1_score], + } + ) + metrics_df.to_csv(output, index=False) + + +if __name__ == "__main__": + args = parser.parse_args() + score(args.prediction_path, args.ground_truth_path, args.output) diff --git a/submission.R b/submission.R index b1acb03a..0936c212 100644 --- a/submission.R +++ b/submission.R @@ -16,7 +16,7 @@ # List your packages here. Don't forget to update packages.R! library(dplyr) # as an example, not used here -clean_df <- function(df, background = NULL){ +clean_df <- function(df, background_df){ # Preprocess the input dataframe to feed the model. ### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command @@ -45,7 +45,7 @@ clean_df <- function(df, background = NULL){ return(df) } -predict_outcomes <- function(df, model_path = "./model.rds"){ +predict_outcomes <- function(df, background_df, model_path = "./model.rds"){ # Generate predictions using the saved model and the input dataframe. # The predict_outcomes function accepts a dataframe as an argument @@ -58,7 +58,8 @@ predict_outcomes <- function(df, model_path = "./model.rds"){ # they did. # Parameters: - # df (dataframe): The input dataframe for which predictions are to be made. + # df (dataframe): The data dataframe for which predictions are to be made. + # background_df (dataframe): The background data dataframe, passed on to clean_df. # model_path (str): The path to the saved model file (which is the output of training.R). 
# Returns: @@ -73,7 +74,7 @@ predict_outcomes <- function(df, model_path = "./model.rds"){ model <- readRDS(model_path) # Preprocess the fake / holdout data - df <- clean_df(df) + df <- clean_df(df, background_df) # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards # get list of variables *without* the outcome: @@ -87,9 +88,9 @@ predict_outcomes <- function(df, model_path = "./model.rds"){ predictions <- ifelse(predictions > 0.5, 1, 0) # Output file should be data.frame with two columns, nomem_enc and predictions - df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "predictions" = predictions) + df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "prediction" = predictions) # Force columnnames (overrides names that may be given by `predict`) - names(df_predict) <- c("nomem_encr", "predictions") + names(df_predict) <- c("nomem_encr", "prediction") # Return only dataset with predictions and identifier return( df_predict ) diff --git a/submission.py b/submission.py index fda659b6..b58f9037 100644 --- a/submission.py +++ b/submission.py @@ -15,14 +15,13 @@ run.py can be used to test your submission. """ - # List your libraries and modules here. Don't forget to update environment.yml! import pandas as pd from sklearn.linear_model import LogisticRegression import joblib -def clean_df(df, background=None): +def clean_df(df, background_df): """ Preprocess the input dataframe to feed the model. # If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command @@ -34,31 +33,33 @@ def clean_df(df, background=None): Returns: pd.DataFrame: The cleaned dataframe with only the necessary columns and processed variables. 
""" - + ## This script contains a bare minimum working example # Create new variable with age - df['age'] = 2024 - df['birthyear_bg'] - + df["age"] = 2024 - df["birthyear_bg"] + # Imputing missing values in age with the mean - df['age'] = df['age'].fillna(df['age'].mean()) + df["age"] = df["age"].fillna(df["age"].mean()) # Filter cases for whom the outcome is not available - df = df[~df['new_child'].isna()] - + df = df[~df["new_child"].isna()] + # Selecting variables for modelling - keepcols = ['nomem_encr', # ID variable required for predictions, - 'age', # newly created variable - 'new_child'] # outcome variable - + keepcols = [ + "nomem_encr", # ID variable required for predictions, + "age", # newly created variable + "new_child", + ] # outcome variable + # Keeping data with variables selected df = df[keepcols] return df -def predict_outcomes(df, model_path="model.joblib"): +def predict_outcomes(df, background_df, model_path="model.joblib"): """Generate predictions using the saved model and the input dataframe. - + The predict_outcomes function accepts a Pandas DataFrame as an argument and returns a new DataFrame with two columns: nomem_encr and prediction. The nomem_encr column in the new DataFrame replicates the @@ -67,34 +68,37 @@ def predict_outcomes(df, model_path="model.joblib"): prediction is represented as a binary value: '0' indicates that the individual did not have a child during 2021-2023, while '1' implies that they did. - + Parameters: df (pd.DataFrame): The input dataframe for which predictions are to be made. + background_df (pd.DataFrame): The background dataframe for which predictions are to be made. model_path (str): The path to the saved model file (which is the output of training.py). Returns: pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions. 
""" - + ## This script contains a bare minimum working example - if 'nomem_encr' not in df.columns: + if "nomem_encr" not in df.columns: print("The identifier variable 'nomem_encr' should be in the dataset") # Load the model model = joblib.load(model_path) - + # Preprocess the fake / holdout data - df = clean_df(df) + df = clean_df(df, background_df) # IMPORTANT: the outcome `new_child` should NOT be in the data from this point onwards # get list of variables *without* the outcome: - vars_without_outcome = df.columns[df.columns != 'new_child'] - + vars_without_outcome = df.columns[~df.columns.isin(["new_child", "nomem_encr"])] + # Generate predictions from model, should be 0 (no child) or 1 (had child) predictions = model.predict(df[vars_without_outcome]) # Output file should be DataFrame with two columns, nomem_encr and predictions - df_predict = pd.DataFrame({'nomem_encr': df['nomem_encr'], 'prediction': predictions}) + df_predict = pd.DataFrame( + {"nomem_encr": df["nomem_encr"], "prediction": predictions} + ) # Return only dataset with predictions and identifier return df_predict