summarise_snp_differences.R

################################################################################
#### Summarise pairwise SNP differences                                    #####
####                                                                       #####
####  An R script to summarise pairwise SNP differences from nullabor      #####
####  output.                                                              #####
####                                                                       #####
####  Minimum input:                                                       #####
####    - cat: a CSV or tab-delimited file with a column of sequence IDs   #####
####        that match the IDs in the FASTA file, and one or more columns  #####
####        indicating to which group the sequence belongs to.             #####
####    - seqs: a FASTA alignment file with all the SNP differences        #####
####        outputted by nullabor <optional>                               #####
####    - diff: a CSV file containing the pairwise SNP difference count    #####
####        outputted by nullabor <optional>                               #####
####                                                                       #####
####  Output:                                                              #####  
####    - tab: a table with summary information for eqch possible          #####
####      pairwise category comparison, including:                         #####
####          -mu: mean differences                                        #####
####          -sd: standard deviation                                      #####
####          -min: minimum difference observed                            #####
####          -max: maximum difference observed                            #####
####    - fig: a figure depicting the information in the table             #####
####                                                                       #####
####  Running from the command line:                                       #####
####                                                                       #####
####    Rscript summarise_snp_differences.R cat <seqs | diff>              #####
####                                                                       #####
####  Running from within R:                                               #####
####                                                                       #####
####    Change the appropriate parameters below to point to the            #####  
####    necessary files.                                                   #####
####                                                                       #####
####  History:                                                             #####
####    - version 0.1: 27 August 2015                                      #####
################################################################################


################################################################################
# If not running off the command line, change these parameters to point to the 
# approriate files, e.g.:
#   cat = '/home/user/cat.csv'
#   
#   To test the script substitute below as follows:
#   
#     cat_file = 'test/woodm_grouping.csv'
#     seq_file = 'test/woodm.fa'

cat_file = NULL

# Only one of these files needs to be specified. If both are specified, the
# diff file will have precedence
seq_file = NULL
diff_file = NULL

# Options:
#   Change the following options to set output
#   
out_basename = 'snp_diff'
model = 'N'   # model to use when calculating pairwise differences between
# sequences when an alignment is supplied. Options: 
#     'N': count of SNP differences
#     'raw': proportion of SNP differences
#     'JC69': Jukes-Cantor 1969
#     'K80': Kimura 1980
#     additional options specified in ape::dist.dna() function
tab_fmt = "csv" # options are "csv" or "md"
tab_type = "raw" # options "pretty" or "raw" --- "pretty" formats numbers in
# scientific format (e.g., 1.05e-9), while "raw" gives
# raw value outputs
fig_fmt = "png"     # options are "png" or "pdf"
fig_type = "median" # option to print out mean +/- sd and range ("mean") or
# median +/- interquartile range ("median")
fig_minmax = TRUE # whether to plot the min/max in the plot. Plotting 
# the min/max can lead to difficult to read plots,
# espectially if the max is very different from 
# the median.
fig_filter = "both" # produce plot with either "inter" or "intra" or "both"
# comparisons
# it will also accept "specific". If this flag is set
# then comparison below must also be set. This flag 
# will limit to plotting only comparisons that include
# the group specified in "comp" below.
fig_comp = NULL     # A string that specifies a unique cat:group pair found in your
# categories file.
# here cat refers to a column name heading, and group refers
# to one of the grouping units within that column.
# (e.g., 'clade:clade_a' in the test file). Note it must be
# specified with the colon mark.
exclude_ids = NULL # a string to a path to a file with one sequence ID per
# line. these sequences will be excluded from the 
# analysis.
save_long = FALSE  # 

################################################################################

################################################################################
# for testing purposes
#      cat_file = 'test/woodm_grouping.csv'
#      seq_file = 'test/woodm.fa'
#      fig_filter = 'specific'
#      fig_comp = 'clade:clade_a'
#      save_long = TRUE

################################################################################

################################################################################
# The function that summarises the data
#  
summ_distances <- function(categories, dist_obj, save_long, outfile){
  #dist_obj is a distance object produced by using the dist.dna() function of ape
  #categories is a data.frame with two columns:
  #   - seq_id: that matches the sequence ids in dist_obj
  #   - groups: that assigns the individual seq_ids to a group
  
  # some sanity checks
  if(class(dist_obj) != 'dist' & class(dist_obj) != 'matrix') {
    stop("dist_obj is not an object of type dist or matrix! 
         Please use dist.dna() to create a distance object first OR
         input a CSV file with count of differences produced by 
         nullabor")
  }
  
  if(!is.data.frame(categories)){
    stop("categories must be a data.frame! 
         Please create a data.frame with the metadata first.")
  }
  
  if(ncol(categories) > 2) {
    warning("Number of colums in categories is >2, 
            taking the first two columns only")
    categories <- categories[,c(1,2)]
  }
  
  if(!all(sort(names(categories)) == sort(c("seq_id", "groups")))) {
    warning("The columns of categories do not have names this function 
            recognizes. It will assume that the first column contains seq_ids, 
            and the second column the relevant categories.")
    names(categories) <- c("seq_id", "groups")
  }
  
  # calculations
  dat <- as.matrix(dist_obj)
  taxa <- unique(as.character(categories[,'groups']))
  n_taxa <- length(taxa)
  total_comp <- (n_taxa^2 + n_taxa)/2
  out <- data.frame(grp1 = character(total_comp), 
                    grp2 = character(total_comp),
                    comp = character(total_comp),
                    N = numeric(total_comp),
                    type = rep("inter-group", total_comp),
                    mu = numeric(total_comp), # store the mean
                    sd = numeric(total_comp), # store the standard deviation
                    med = numeric(total_comp), # store the median
                    lqr = numeric(total_comp), # store the lower quartile
                    uqr = numeric(total_comp), # store the upper quartile
                    min_dist = numeric(total_comp), # store the min value
                    max_dist = numeric(total_comp), # store the maximum value
                    stringsAsFactors = F)
  if (save_long) {
    n_inds <- nrow(dat)
    n_entries = n_inds^2
    out_long <- data.frame(taxa1 = character(n_entries),
                           taxa2 = character(n_entries),
                           grp1 = character(n_entries),
                           grp2 = character(n_entries),
                           type = rep("inter-group", n_entries),
                           count = numeric(n_entries),
                           stringsAsFactors = F)
    outlong_fn = paste(outfile, "_long.csv", sep = "")
    count_entries = 1
  }
  n_comp = 1
  for(i in 1:n_taxa){
    g1 <- taxa[i]
    seq_1 <- as.character(categories[categories$groups == g1, 'seq_id'])
    for(j in i:n_taxa){
      g2 <- taxa[j]
      seq_2 <- as.character(categories[categories$groups == g2, 'seq_id'])
      tmp_dat <- dat[seq_1, seq_2]
      if (save_long) {
        for (ii in 1:length(seq_1)) {
          for (jj in 1:length(seq_2)) {
            out_long[count_entries,'taxa1'] = seq_1[ii]
            out_long[count_entries,'taxa2'] = seq_2[jj]
            out_long[count_entries,'grp1'] = g1
            out_long[count_entries,'grp2'] = g2
            if (i == j) {
              out_long[count_entries,'type'] = 'intra-group'
            }
            out_long[count_entries,'count'] = dat[seq_1[ii], seq_2[jj]]
            count_entries = count_entries + 1
          }
        }
        if (n_entries - count_entries < 100) {
          # in case we start to run out of space
          tmp_long <- data.frame(taxa1 = character(n_entries),
                                 taxa2 = character(n_entries),
                                 grp1 = character(n_entries),
                                 grp2 = character(n_entries),
                                 type = rep("inter-group", n_entries),
                                 count = numeric(n_entries),
                                 stringsAsFactors = F)
          out_long <- rbind(out_long, tmp_long)
        }
      }
      out[n_comp, "grp1"] <- g1
      out[n_comp, "grp2"] <- g2
      out[n_comp, "N"] <- length(tmp_dat)
      out[n_comp, "comp"] <- paste(g1, g2, sep='_')
      if(i == j) {
        if(length(tmp_dat) > 1){
          #if length is one, this results in a empty set. 
          #so, added this condition to fix the problem
          tmp_dat <- tmp_dat[lower.tri(tmp_dat)]
        }
        out[n_comp, 'type'] <- 'intra-group'
        out[n_comp, "comp"] <- g1
      }
      if (length(tmp_dat) > 1 & max(tmp_dat) > min(tmp_dat)) {
        out[n_comp, "mu"] <- mean(tmp_dat)
        out[n_comp, "sd"] <- sd(tmp_dat)
        out[n_comp, "med"] <- quantile(tmp_dat, p = 0.50)
        out[n_comp, "lqr"] <- quantile(tmp_dat, p = 0.25)
        out[n_comp, "uqr"] <- quantile(tmp_dat, p = 0.75)
        out[n_comp, "min_dist"] <- min(tmp_dat)
        out[n_comp, "max_dist"] <- max(tmp_dat)
      } else {
        out[n_comp, "mu"] <- mean(tmp_dat)
        out[n_comp, "sd"] <- 0
        out[n_comp, "med"] <- quantile(tmp_dat, p = 0.50)
        out[n_comp, "lqr"] <- quantile(tmp_dat, p = 0.25)
        out[n_comp, "uqr"] <- quantile(tmp_dat, p = 0.75)
        out[n_comp, "min_dist"] <- min(tmp_dat)
        out[n_comp, "max_dist"] <- max(tmp_dat)
      }
      n_comp = n_comp + 1
    }
  }
  out$type <- factor(out$type, levels = c("intra-group", "inter-group"))
  out$comp <- factor(out$comp, levels = out$comp[order(out$type, out$comp)])
  if (save_long) {
    out_long <- out_long[1:(count_entries-1),]
    write.table(x = out_long, file = outlong_fn, quote = F, sep = ",", row.names = F)
  }
  return(out)
  }

################################################################################

################################################################################
# read the categories table
#

read_cat_file <- function(categories, exclude_ids = NULL) {
  if(!file.exists(categories)) {
    stop(paste("Could not find file:", categories, "\n"))
  }
  if(!is.null(exclude_ids) && !file.exists(exclude_ids)) {
    stop(paste("Could not find file:", exclude_ids, "\n"))
  }
  file_sep = ','
  if(!grepl(pattern = 'csv', x = tolower(categories))) {
    file_sep = '\t'
  }
  cat_df <- read.table(file = categories, 
                       header = TRUE, 
                       check.names = F, 
                       sep = file_sep,
                       stringsAsFactors = F,
                       ## added to remove any white spaces that might
                       ## cause problems when comparing with sequence data
                       strip.white = TRUE)
  if(!is.null(exclude_ids)) {
    exclude_ids <- read.table(file = exclude_ids, 
                              header = F, 
                              stringsAsFactors = F)
  }
  #if file is an mlst.tab output from nullabor
  if(all(c("FILE", "ST") %in% names(cat_df))) {
    seq_id <- gsub(pattern = "\\/contigs\\.fa", replacement = "", cat_df$FILE)
    ST <- cat_df$ST
    ix_calls <- which(ST != "-")
    cat_df <- data.frame(seq_id = seq_id[ix_calls], ST = ST[ix_calls], stringsAsFactors = F)
    if(!is.null(exclude_ids)) {
      cat_df <- cat_df[!(cat_df[,1] %in% exclude_ids), ]
    }
    cat_list <- list(ST = cat_df)
    return(cat_list)
  }
  #otherwise, treat as a file prepared by the user
  if(!is.null(exclude_ids)) {
    cat_df <- cat_df[!(cat_df[,1] %in% exclude_ids), ]
  }
  if(ncol(cat_df) == 2) {
    cat_list <- list(cat_df)
    names(cat_list) <- names(cat_df)[2]
  } else if(ncol(cat_df) > 2) {
    warning("Number of identified columns in category file is >2, 
            assuming that the first column contains the sequence IDs")
    cats <- names(cat_df)[-1]
    # added gsub to strip any white spaces. found it causes error when
    # matching cats to sequence data
    seqid <- names(cat_df)[1]
    cat_list <- lapply(cats, function(cat) subset(cat_df, select = c(seqid, cat)))
    names(cat_list) <- cats
  }
  return(cat_list)
}

################################################################################

################################################################################
# if running off a FASTA file, it is necessary to calculate the pairwise distance
# matrix.

calc_pairwise_distance <- function(seq_file, model = 'N') {
  # this function takes as input a string defining a path to a FASTA file
  # and a string to pass on to the function dna.dist() specifying the distance
  # model to use. 
  # The model is normally assumed to be 'raw', which means it takes the 
  # proportional pairwise differences. In most cases at MDU, this is a reasonable
  # measure if, however, finite mutation models are required to account for 
  # mutation saturation, other methods can be used. Type ?dna.dist to read the 
  # manual page.
  
  if(!file.exists(seq_file)) {
    stop(paste("Could not find file:", seq_file, "\n"))  
  }
  seq_data <- read.FASTA(file = seq_file)
  raw_dist <- dist.dna(x = seq_data, model = model)
  return(raw_dist)
}

################################################################################
# if running off a CSV/TSV file with counts of SNP differences
#

read_diff_file <- function(diff_file) {
  if(!file.exists(diff_file)) {
    stop(paste("Could not find file:", diff_file, "\n"))
  }
  file_sep = ','
  if(!grepl(pattern = 'csv', x = tolower(diff_file))) {
    file_sep = '\t'
  }
  diff_mat <- as.matrix(
    read.table(file = diff_file, 
               header = TRUE, 
               row.names = 1, 
               check.names = F, 
               sep = file_sep)
  )
  return(diff_mat)
}

################################################################################

################################################################################
# output the results to a pretty table
#

write_summ_table <- function(summ_table, 
                             outfile, 
                             file_type = 'csv', 
                             method = "pretty") {
  # here, we take the output from the summarise_snp_differences function and 
  # make a CSV table, which can be read into EXCEL, or someother spreadsheet
  # application. 
  # If the file_type is 'md', the table will be outputted in markdown format.
  
  outfile = paste(outfile, file_type, sep = ".")
  if(method == 'pretty') {
    pretty_column_names <- c("Group 1", "Group 2", "N", "Comparison", "Mean (±SD)", "Median", "Inter-Quartile Range", "Range")
    pretty_mean <- format(summ_table[,'mu'], scientific = T, digits = 3)
    pretty_sd <- format(summ_table[,'sd'], scientific = T, digits = 3)
    pretty_musd <- paste(pretty_mean, " (±", pretty_sd,")", sep = "")
    pretty_median <- format(summ_table[,'med'], scientific = T, digits = 3)
    pretty_lqr <- format(summ_table[,'lqr'], scientific = T, digits = 3)
    pretty_uqr <- format(summ_table[,'uqr'], scientific = T, digits = 3)
    pretty_iqr <- paste(pretty_lqr, pretty_uqr, sep = "; ")
    pretty_min <- format(summ_table[,'min_dist'], scientific = T, digits = 3)
    pretty_max <- format(summ_table[,'max_dist'], scientific = T, digits = 3)
    pretty_range <- paste(pretty_min, pretty_max, sep = "; ")
    tab <- data.frame(summ_table$grp1, summ_table$grp2, summ_table$type, summ_table$N)
    tab$musd <- pretty_musd
    tab$median <- pretty_median
    tab$iqr <- pretty_iqr
    tab$range <- pretty_range
    names(tab) <- pretty_column_names
  } else {
    column_names <- c("Group 1", "Group 2", "Comparison", "N", "Mean", "SD", "Median", "Lower IQR", "Upper IQR", "Min", "Max")
    tab <- summ_table[,c(1,2,5,4,6,7,8,9,10,11,12)]
    names(tab) <- column_names
  }
  if(file_type == 'md') {
    require(pander)
    capture.output(pander(tab, split.tables = Inf), file = outfile)
  } else {
    write.table(x = tab, file = outfile, sep = ",", quote = F, row.names = F)
  }
}

################################################################################

################################################################################
# output the results to a pretty figure
#

plot_figure <- function(summ_table, outfile = NULL, 
                        file_type = 'png', 
                        fig_type = "mean", 
                        fig_filter = "both",
                        fig_minmax = TRUE,
                        fig_comp = NULL) {
  # here, we take the output from the summarise_snp_differences function and 
  # make a plot that includes the mean, the sd, and the min/max for each of 
  # the possible comparisons
  # If outfile is specified, the figure is outputted as a png or pdf.
  require(ggplot2)
  tmp_tab <- summ_table
  if(fig_filter == 'inter') {
    tmp_tab <- summ_table[summ_table$type == "inter-group",]
    outfile = paste(outfile, "inter", sep = "_")
  } else if (fig_filter == 'intra') {
    tmp_tab <- summ_table[summ_table$type == "intra-group",]
    outfile = paste(outfile, "intra", sep = "_")
  } else if (fig_filter == "specific" && !is.null(fig_comp)) {
    tmp_tab <- summ_table[grepl(pattern = fig_comp, x = summ_table$comp),]
    outfile = paste(outfile, fig_comp, sep = "_")
  }
  
  if(fig_type == 'mean'){
    p1 <- ggplot(tmp_tab, aes(x = comp, y = mu, colour = type)) + 
      geom_point(size = 3 ,shape = 18) + 
      geom_errorbar(aes(ymax = mu + sd, ymin = mu - sd, width = 0.05)) + 
      #             geom_point(aes(x = comp, min_dist), size = 2) +
      #             geom_point(aes(x = comp, max_dist), size = 2) +
      xlab("Pairwise comparisons") + 
      #      ylab("Mean SNP distance\n(errorbars: sd; points: min/max)") +
      scale_colour_discrete(name = "Comparison type") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3), 
            legend.position="bottom")
    ylab <- "Mean SNP distance\n(errorbars: sd"
  } else if(fig_type == 'median') {
    p1 <- ggplot(tmp_tab, aes(x = comp, y = med, colour = type)) + 
      geom_point(size = 3 ,shape = 18) + 
      geom_errorbar(aes(ymax = uqr, ymin = lqr, width = 0.05)) + 
      #       geom_point(aes(x = comp, min_dist), size = 2) +
      #       geom_point(aes(x = comp, max_dist), size = 2) +
      xlab("Pairwise comparisons") + 
      #      ylab("Median SNP distance\n(errorbars: inter-quartile range; points: min/max)") +
      scale_colour_discrete(name = "Comparison type") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), 
            legend.position="bottom")
    ylab <- "Median SNP distance\n(errorbars: inter-quartile range"
  }
  
  if (fig_minmax) {
    p1 <- p1 +
      geom_point(aes(x = comp, min_dist), size = 2) +
      geom_point(aes(x = comp, max_dist), size = 2) +
      ylab(paste(ylab," ; points: min/max)", sep = ""))
  } else {
    outfile <- paste(outfile, "_no_minmax", sep = "")
    p1 <- p1 +
      ylab(paste(ylab, ")", sep = ""))
  }
  
  if(is.null(outfile)) {
    print(p1)
  } else {
    outfile = paste(outfile, file_type, sep = ".")
    if (file_type == 'png') {
      png(filename = outfile, width = 2048, height = 1536, res = 300)
    } else {
      pdf(file = outfile, width = 7, height = 5.5)
    }
    print(p1)
    dev.off()
  }
}

################################################################################

################################################################################
# the main() function
# 
main <- function(categories, 
                 seq_file = NULL, 
                 diff_file = NULL, 
                 out_base = NULL,
                 model = NULL,
                 tab_fmt = NULL,
                 tab_type = NULL,
                 fig_filter = NULL,
                 fig_type = NULL,
                 fig_fmt = NULL,
                 fig_comp = NULL,
                 fig_minmax = NULL,
                 exclude_ids = NULL,
                 save_long = NULL) {
  ################################################################################
  ## check dependencies
  
  miss_packages = c()
  
  if(!require(ggplot2, quietly = T)) {
    miss_packages = c(miss_packages, "ggplot2")
  }
  
  if(!require(ape, quietly = T)){
    miss_packages = c(miss_packages, "ape")
  }
  
  #   if(!require(DT, quietly = T)) {
  #     miss_packages = c(miss_packages, "DT")
  #   }
  
  if (length(miss_packages) >= 1) {
    mp <- paste(miss_packages, sep = ",", collapse = "")
    stop(paste("It seems we are missing some dependencies: ", mp, ". To install type the following:\n
               install.packages(c(\'",mp,"\'))", sep = ""))
  }
  # Load some necessary libraries
  library(ape)
  library(ggplot2)
  
  #load the categories
  cats_list <- read_cat_file(categories = categories, exclude_ids = exclude_ids)
  
  #load the data
  if(is.null(diff_file)) {
    dist_obj <- calc_pairwise_distance(seq_file = seq_file, model = model)
  } else {
    dist_obj <- read_diff_file(diff_file = diff_file)
  }
  
  #summarise the information
  cats <- names(cats_list)
  #check fig.comp, and parse
  if (!is.null(fig_comp)) {
    tmp <- strsplit(x = fig_comp, ":")
    cat_keep <- tmp[[1]][1]
    fig_comp <- tmp[[1]][2]
    cats <- cats[which(cats %in% cat_keep)]
  }
  for(cat in cats) {
    outf_b <- paste(out_base, cat, sep = "_")
    results <- summ_distances(categories = cats_list[[cat]], 
                              dist_obj = dist_obj, 
                              save_long = save_long,
                              outfile = outf_b)
    
    #output table
    write_summ_table(summ_table = results, 
                     file_type = tab_fmt, 
                     method = tab_type,
                     outfile = outf_b)
    
    #output figure
    plot_figure(summ_table = results,
                fig_filter = fig_filter,
                fig_type = fig_type,
                file_type = fig_fmt,
                fig_comp = fig_comp,
                fig_minmax = fig_minmax,
                outfile = outf_b)
  }
  }

################################################################################
# If running from the command line:
# 

if(!interactive()) {
  # collect the arguments
  args <- commandArgs(trailingOnly = TRUE)
  
  # check the argument length
  if(length(args) < 2 | length(args) > 7) {
    args <- c("--help")
  }
  
  ## Help section
  if("--help" %in% args) {
    cat("
        Summarise SNP differences
        
        Necessary arguments:
        filename    - a string defining the path to the categories file
        
        Optional arguments:
        --seq=filename    - a string defining the path to
        the FASTA file (ignored if --diff is defined)
        --diff=filename   - a string defining the path to 
        the SNP differences matrix (must be defined if no
        --seq is defined)
        --out_basename    - basename for output files (default: snp_diff)
        --tab_fmt         - output format for table (default: \"csv\")
        \"csv\" or \"md\" for CSV or Markdown, respectively
        --tab_type        - make table \"pretty\" by formatting numbers or 
        or output \"raw\" numbers (default: \"pretty\")
        --fig_filter      - one of \"both\", \"intra\", \"inter\", or \"specific\".
        use \"both\", \"intra\" or \"inter\" if wanting to plot 
        both inter and intra distance comparisons on 
        the same plot, or only intra or inter, respectively.
        if wanting to plot just comparisons that include a 
        single category, use \"specific\", and then specify
        the group name in --fig_comp.
        (default: \"both\")
        --fig_comp        - when specifying --fig_filter=\"specific\", this must
        be specified. A string identifying one category in the 
        comp file to plot (e.g., \"clade:clade_a\"). To avoid saving
        over previous analyses, the group name is added as an
        extension. Note the use of the colon to specify the 
        category (i.e., column in the cat file) and group 
        (i.e., name of the grouping unit within that column).
        --fig_type        - output figure of mean +/- sd and range (\"mean\") or
        median +/- inter-quartile range (\"median\")
        --fig_minmax      - whether or not to plot the mix/max points. Either
        \"TRUE\" or \"FALSE\" (default: \"TRUE\")
        --fig_fmt         - output format for figure (default: \"png\")
        \"png\" or \"pdf\" for PNG or PDF, respectively
        --save_long       - \"TRUE\" or \"FALSE\" if wanting to save the raw data
        in long format, along with with all the additional
        metadata (default: \"FALSE\")
        --exclude_ids     - string defining the path to a file with sequence
        ids to be excluded, one per line (default: None)
        --help            - print this text
        
        Example:
        ./summarise_snp_differences.R \"/home/user/cat.csv\" --seq=\"/home/user/seq.fa\"\n\n")
    
    q(save="no")
  }
  
  #parse arguments
  cat_file = args[1]
  args <- args[2:length(args)]
  parse_args <- function(x) strsplit(sub("^--", "", x), "=")
  args_df <- as.data.frame(
    do.call("rbind", parse_args(args)), 
    stringsAsFactors = F)
  names(args_df) <- c("key", "value")
  if('diff' %in% args_df$key) {
    diff_file = args_df[args_df$key == 'diff', 'value']
  } else {
    seq_file = args_df[args_df$key == 'seq', 'value']
  }
  for(arg in c("out_basename", "tab_fmt", "tab_type", "fig_fmt", "exclude_ids")) {
    if (arg %in% args_df$key) {
      assign(arg, args_df[args_df$key == arg, 'value'])
    }
  }
}

#run the main function
main(categories = cat_file, 
     seq_file = seq_file, 
     diff_file = diff_file, 
     out_base = out_basename, 
     model = model,
     tab_fmt = tab_fmt, 
     tab_type = tab_type,
     fig_filter = fig_filter,
     fig_type = fig_type,
     fig_fmt = fig_fmt,
     fig_comp = fig_comp,
     fig_minmax = fig_minmax,
     save_long = save_long,
     exclude_ids = exclude_ids)

cat("The script has ended successfully!\n")
################################################################################