diff --git a/R/neural_network_predictor.R b/R/neural_network_predictor.R deleted file mode 100644 index dac95d2..0000000 --- a/R/neural_network_predictor.R +++ /dev/null @@ -1,49 +0,0 @@ -#' #' Title -#' #' -#' #' @param train_features xy -#' #' @param train_labels xy -#' #' @param test_features xy -#' #' @param test_labels xy -#' #' @param epochs xy -#' #' -#' #' @return xy -#' #' @export -#' #' -#' #' @examples xy -#' predict_stock <- function(train_features, train_labels, test_features, test_labels, epochs = 50, batches = 32) { -#' # Normalize the data -#' mean <- apply(train_features, 2, mean) -#' std <- apply(train_features, 2, sd) -#' train_features <- scale(train_features, center = mean, scale = std) -#' test_features <- scale(test_features, center = mean, scale = std) -#' -#' # Define the model architecture !!!!!!!!activation functions chosen by user or predefine !!!!!!! -#' model <- keras_model_sequential() %>% -#' layer_dense(units = ncol(train_features)-1, activation = 'relu', input_shape = ncol(train_features)) %>% -#' layer_dense(units = (ncol(train_features)-1)/4, activation = 'relu') %>% -#' layer_dense(units = 1, activation = 'sigmoid') # Assuming binary classification (0 = not invest, 1 = invest) -#' -#' # Compile the model -#' model %>% compile( -#' loss = 'binary_crossentropy', -#' optimizer = optimizer_rmsprop(), -#' metrics = c('accuracy') -#' ) -#' -#' # Fit the model -#' history <- model %>% fit( -#' train_features, -#' train_labels, -#' epochs = epochs, -#' batch_size = batches, #ask sebastian -#' validation_split = 0.2 -#' ) -#' -#' # Evaluate the model -#' model %>% evaluate(test_features, test_labels, verbose = 0) -#' -#' # Make predictions -#' predictions <- model %>% predict_classes(test_features) -#' -#' return(list(model = model, history = history, predictions = predictions)) -#' } diff --git a/R/penreg_lasso.R b/R/penreg_lasso.R deleted file mode 100644 index 0c7702e..0000000 --- a/R/penreg_lasso.R +++ /dev/null @@ -1,18 +0,0 @@ -#' #change the name to penreg without lasso -#' #' Title -#' #' -#' #' @param y_penalized xy -#' #' @param x_penalized xy -#' #' -#' #' @return xy -#' #' @export -#' #' -#' #' @examples xy -#' -#' penreg <- function(y_penalized, x_penalized) { -#' -#' for(i in seq(0,1,by = 0.1)){ -#' fit_penreg <- glmnet(x_penalized, y_penalized, alpha) -#' } -#' return(fit_lasso, fit_ridge) -#' } diff --git a/R/random_forest_predictor.R b/R/random_forest_predictor.R deleted file mode 100644 index c423490..0000000 --- a/R/random_forest_predictor.R +++ /dev/null @@ -1,33 +0,0 @@ -#' -#' #' Title -#' #' -#' #' @param y xy -#' #' @param features xy -#' #' @param data xy -#' #' @param ntree xy -#' #' -#' #' @return xy -#' #' -#' #' @importFrom randomForest randomForest -#' #' -#' #' -#' #' @export -#' #' -#' #' -#' #' @examples xy -#' #' -#' random_foreest_predictor <- function(y, features, data, ntree ) { -#' formula <- paste(y , paste(features, collapse = " + ")) # Defines the model -#' formula <- as.formula(formula) # Forcing formula object -#' fit_RF <- randomForest(formula_C, # New formula! -#' data = training_sample, # Data source: training sample -#' sampsize = 20000, # Size of (random) sample for each tree --> dependant on data/feature size -#' replace = FALSE, # Is the sampling done with replacement? -#' nodesize = 250, # Minimum size of terminal cluster --> dependant on data/feature size -#' ntree = 40, # Number of random trees --> dependant on data/feature size -#' mtry = 30 # Number of predictive variables for each tree --> dependant on data/feature size -#' ) -#' predictions <- predict(fit_RF, testing_sample[1:5,]) # Prediction over the first 5 test instances -#' -#' return(predictions) -#' } diff --git a/R/returnPredictors.R b/R/returnPredictors.R deleted file mode 100644 index 85198ef..0000000 --- a/R/returnPredictors.R +++ /dev/null @@ -1,107 +0,0 @@ -#' ols function for return prediction -#' -#' @param train_data data frame with stock_id, date, return_label and features -#' @param test_data data frame with stock_id, date and features -#' @param config empty list, as ols does not need any configuration -#' @param fast logical, if TRUE, use fastLm from RcppArmadillo, else use lm from base R -#' -#' @return tibble with stock_id, date and pred_return matching the test_data -#' -#' @importFrom RcppArmadillo fastLm -#' @importFrom dplyr pull -#' @importFrom tibble tibble -#' @importFrom stats predict -#' -#' @export -#' -#' @examples -#' \dontrun{ -#' data(data_ml) -#' train_data_ex <- data_ml[1:100,c(1,2,96,5:10)] -#' test_data_ex <- data_ml[101:150,c(1,2,5:10)] -#' ols_pred(train_data_ex, test_data_ex, config=list(), fast=TRUE) -#' ols_pred(train_data_ex, test_data_ex, config=list(), fast=FALSE) -#' } -ols_pred <- function(train_data, test_data, config=list(), fast=TRUE) { - if (fast) { - # fast cpp implementation - mm <- cbind(1, as.matrix(train_data[,4:ncol(train_data)])) # model matrix - y <- train_data |> dplyr::pull(3) # response - plm <- RcppArmadillo::fastLm(mm, y) - predictions <- predict(plm, cbind(1, as.matrix(test_data[,3:ncol(test_data)]))) - } else { - # standard lm implementation - label <- colnames(train_data)[3] - formula_lm <- as.formula(paste0(colnames(train_data)[3], " ~ .")) - flm <- lm(formula_lm, data=train_data[,3:ncol(train_data)]) - predictions <- as.vector(predict(flm, test_data[,3:ncol(test_data)])) - } - # match preds back to stock_id and date - predictions <- tibble::tibble(stock_id=test_data$stock_id, date=test_data$date, pred_return=predictions) - return(predictions) -} - -#' xgb function for return prediction -#' -#' @param train_data data frame with stock_id, date, return_label and features -#' @param test_data data frame with stock_id, date and features -#' @param config empty list, as ols does not need any configuration -#' @param fast logical, if TRUE, use fastLm from RcppArmadillo, else use lm from base R -#' -#' @return tibble with stock_id, date and pred_return matching the test_data -#' -#' @import xgboost -#' @importFrom tibble tibble -#' @importFrom stats predict -#' -#' @export -#' -#' @examples -#' \dontrun{ -#' data(data_ml) -#' train_data_ex <- data_ml[1:100,c(1,2,96,5:10)] -#' test_data_ex <- data_ml[101:150,c(1,2,5:10)] -#' ols_pred(train_data_ex, test_data_ex, config=list(), fast=TRUE) -#' ols_pred(train_data_ex, test_data_ex, config=list(), fast=FALSE) -#' } -xgb_pred <- function(train_data, test_data, config) { - # Default parameters for xgb.train (simplified example) - default_params <- list( - eta = 0.3, - max_depth = 4, - #gamma = 0, - objective = "reg:squarederror", - nrounds = 80 - ) - - # Function to check and add missing arguments - ensure_config <- function(config, default_params) { - missing_args <- setdiff(names(default_params), names(config)) - if (length(missing_args) > 0) { - message("Adding default values for missing arguments: ", paste(missing_args, collapse=", ")) - for (arg in missing_args) { - config[[arg]] <- default_params[[arg]] - } - } - return(config) - } - ## check config - config <- ensure_config(config, default_params) - - - train_features <- as.matrix(train_data[,4:ncol(train_data)]) - train_label <- as.matrix(train_data[,3]) - train_matrix <- xgboost::xgb.DMatrix(data = train_features, label = train_label) # XGB format - # add data - config$data <- train_matrix - # do the training - fit <- do.call(xgboost::xgb.train, config) - # do the predictions - xgb_test <- as.matrix(test_data[,3:ncol(test_data)]) |> xgboost::xgb.DMatrix() - predictions <- as.vector(predict(fit, xgb_test)) - - # match preds back to stock_id and date - predictions <- tibble::tibble(stock_id=test_data$stock_id, date=test_data$date, pred_return=predictions) - return(predictions) -} - diff --git a/R/return_pred_functions.R b/R/return_pred_functions.R new file mode 100644 index 0000000..afcc533 --- /dev/null +++ b/R/return_pred_functions.R @@ -0,0 +1,680 @@ +#' Ordinary Least Squares (OLS) Prediction Function +#' +#' This function trains an Ordinary Least Squares (OLS) regression model to predict returns based on the provided training data +#' and predicts returns for the test data. +#' +#' @param train_data A data frame with `stock_id`, `date`, `return_label`, and features. +#' @param test_data A data frame with `stock_id`, `date`, and features. +#' @param config A list of OLS configuration parameters (e.g., `intercept`). +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the test data. +#' @importFrom stats lm predict +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows +#' @importFrom checkmate assert_data_frame assert_list +#' @importFrom cli cli_abort cli_alert_danger +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' ols_pred(train_data_ex, test_data_ex, config = list(intercept = TRUE)) +#' } +ols_pred <- function(train_data, test_data, config = list()) { + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Default parameters for OLS + default_params <- list( + intercept = TRUE # Whether to include an intercept in the model + ) + + # Ensure config using helper function + config <- ensure_config(config, default_params) + + # Handle missing values + check_missing_values(train_data) + check_missing_values(test_data) + + # Encode categorical data + full_data <- bind_rows(train_data, test_data) + full_data <- encode_categorical(full_data) + + # Split data back + train_data <- full_data$data[1:nrow(train_data), ] + return_label <- colnames(train_data)[3] + test_data <- full_data$data[(nrow(train_data) + 1):nrow(full_data$data), -which(names(full_data$data) == return_label)] + + # Prepare formula for lm + features <- names(train_data)[4:ncol(train_data)] + formula <- as.formula(paste("`", return_label, "` ~ ", paste(features, collapse = " + "), sep = "")) + + # Train the OLS model + model <- tryCatch( + stats::lm(formula = formula, data = train_data), + error = function(e) { + cli::cli_abort("Error training OLS model: {e$message}") + } + ) + + # Predict using the test data + predictions_matrix <- tryCatch( + stats::predict(model, newdata = test_data), + error = function(e) { + cli::cli_abort("Error during OLS prediction: {e$message}") + } + ) + + # Convert predictions to numeric vector + predictions <- as.vector(predictions_matrix) + + # Match predictions back to stock_id and date + predictions <- tibble::tibble( + stock_id = test_data$stock_id, + date = test_data$date, + pred_return = predictions + ) + + return(predictions) +} + +#' Elastic Net Prediction Function +#' +#' This function trains an Elastic Net model using the glmnet package and predicts outcomes +#' for the provided test data set based on the trained model. +#' +#' @param train_data A data frame with `stock_id`, `date`, `return_label`, and features. +#' @param test_data A data frame with `stock_id`, `date`, and features. +#' @param config A list of Elastic Net parameters (`alpha` and `lambda`). +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the test data. +#' @importFrom glmnet glmnet +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows +#' @importFrom checkmate assert_data_frame assert_list +#' @importFrom cli cli_abort cli_alert_danger +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' enet_pred(train_data_ex, test_data_ex, config = list(alpha = 0.5, lambda = 0.1)) +#' } +enet_pred <- function(train_data, test_data, config = list()) { + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Default Elastic Net parameters + default_params <- list( + alpha = 0.5, + lambda = 0.1 + ) + + # Ensure config using helper function + config <- ensure_config(config, default_params) + + # Handle missing values + check_missing_values(train_data) + check_missing_values(test_data) + + # Encode categorical data + full_data <- bind_rows(train_data, test_data) + full_data <- encode_categorical(full_data) + + # Split data back + train_data <- full_data$data[1:nrow(train_data), ] + return_label <- colnames(train_data)[3] + test_data <- full_data$data[(nrow(train_data) + 1):nrow(full_data$data), -which(names(full_data$data) == return_label)] + + # Prepare the model matrix and response vector + train_features <- as.matrix(train_data[, 4:ncol(train_data)]) + train_label <- train_data[[3]] # Assuming the 3rd column is the response variable + + # Train the Elastic Net model using glmnet + enet_fit <- tryCatch( + glmnet::glmnet(train_features, train_label, alpha = config$alpha, lambda = config$lambda), + error = function(e) { + cli::cli_abort("Error training Elastic Net model: {e$message}") + } + ) + + # Prepare test data and predict + test_features <- as.matrix(test_data[, 3:ncol(test_data)]) + predictions_matrix <- tryCatch( + predict(enet_fit, s = config$lambda, newx = test_features), + error = function(e) { + cli::cli_abort("Error during Elastic Net prediction: {e$message}") + } + ) + + # Convert predictions to a numeric vector + predictions <- as.vector(predictions_matrix) + + # Match predictions back to stock_id and date + predictions <- tibble::tibble( + stock_id = test_data$stock_id, + date = test_data$date, + pred_return = predictions + ) + + return(predictions) +} + +#' Extreme Gradient Boosting (XGBoost) Prediction Function +#' +#' This function trains an XGBoost model to predict returns based on the provided training data +#' and predicts returns for the test data. +#' +#' @param train_data A data frame with `stock_id`, `date`, `return_label`, and features. +#' @param test_data A data frame with `stock_id`, `date`, and features. +#' @param config A list of XGBoost configuration parameters (e.g., `nrounds`, `max_depth`, `eta`). +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the test data. +#' @import xgboost +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows +#' @importFrom checkmate assert_data_frame assert_list +#' @importFrom cli cli_abort cli_alert_danger +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' xgb_pred(train_data_ex, test_data_ex, config = list(nrounds = 100, max_depth = 3, eta = 0.3, objective = "reg:squarederror")) +#' } +xgb_pred <- function(train_data, test_data, config = list()) { + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Default parameters for XGBoost + default_params <- list( + nrounds = 100, # Number of boosting iterations + max_depth = 3, # Maximum tree depth for base learners + eta = 0.3, # Learning rate + objective = "reg:squarederror" # Learning objective + ) + + # Ensure config using helper function + config <- ensure_config(config, default_params) + + # Handle missing values + check_missing_values(train_data) + check_missing_values(test_data) + + # Encode categorical data + full_data <- bind_rows(train_data, test_data) + full_data <- encode_categorical(full_data) + + # Split data back + train_data <- full_data$data[1:nrow(train_data), ] + return_label <- colnames(train_data)[3] + test_data <- full_data$data[(nrow(train_data) + 1):nrow(full_data$data), -which(names(full_data$data) == return_label)] + + # Prepare training matrix and labels + train_matrix <- as.matrix(train_data[, 4:ncol(train_data)]) + train_label <- train_data[[3]] # Assuming the 3rd column is the response variable + + dtrain <- xgboost::xgb.DMatrix(data = train_matrix, label = train_label) + + # Train the XGBoost model + model <- tryCatch( + xgboost::xgboost( + data = dtrain, + nrounds = config$nrounds, + max_depth = config$max_depth, + eta = config$eta, + objective = config$objective, + verbose = 0 + ), + error = function(e) { + cli::cli_abort("Error training XGBoost model: {e$message}") + } + ) + + # Prepare test matrix and predict + test_matrix <- as.matrix(test_data[, 3:ncol(test_data)]) + predictions_matrix <- tryCatch( + predict(model, test_matrix), + error = function(e) { + cli::cli_abort("Error during XGBoost prediction: {e$message}") + } + ) + + # Convert predictions to numeric vector + predictions <- as.vector(predictions_matrix) + + # Match predictions back to stock_id and date + predictions <- tibble::tibble( + stock_id = test_data$stock_id, + date = test_data$date, + pred_return = predictions + ) + + return(predictions) +} + +#' Random Forest Function for Return Prediction +#' +#' This function trains a Random Forest model to predict returns based on the provided training data +#' and predicts returns for the test data. +#' +#' @param train_data A data frame with `stock_id`, `date`, `return_label`, and features. +#' @param test_data A data frame with `stock_id`, `date`, and features. +#' @param config A list of Random Forest configuration parameters (e.g., `num.trees`, `mtry`). +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the test data. +#' @import ranger +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows +#' @importFrom checkmate assert_data_frame assert_list +#' @importFrom cli cli_abort cli_alert_danger +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' rf_pred(train_data_ex, test_data_ex, config = list(num.trees = 200, mtry = 4)) +#' } +rf_pred <- function(train_data, test_data, config = list()) { + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Default parameters for ranger + default_params <- list( + num.trees = 100, # Number of random trees + mtry = 5 # Number of variables randomly sampled as candidates at each split + ) + + # Ensure config using helper function + config <- ensure_config(config, default_params) + + # Handle missing values + check_missing_values(train_data) + check_missing_values(test_data) + + # Encode categorical data + full_data <- bind_rows(train_data, test_data) + full_data <- encode_categorical(full_data) + + # Split data back + train_data <- full_data$data[1:nrow(train_data), ] + return_label <- colnames(train_data)[3] + test_data <- full_data$data[(nrow(train_data) + 1):nrow(full_data$data), -which(names(full_data$data) == return_label)] + + # Prepare training features and labels + train_features <- as.matrix(train_data[, 4:ncol(train_data)]) + train_label <- train_data[[3]] # Assuming the 3rd column is the response variable + + # Train the Random Forest model using ranger + fit <- tryCatch( + ranger::ranger( + x = train_features, + y = train_label, + num.trees = config$num.trees, + mtry = config$mtry + ), + error = function(e) { + cli::cli_abort("Error training Random Forest model: {e$message}") + } + ) + + # Prepare test data and predict + test_features <- as.matrix(test_data[, 3:ncol(test_data)]) + predictions <- tryCatch( + predict(fit, data = test_features)$predictions, + error = function(e) { + cli::cli_abort("Error during Random Forest prediction: {e$message}") + } + ) + + # Ensure predictions are numeric vectors + predictions <- as.vector(predictions) + + # Match predictions back to stock_id and date + predictions <- tibble::tibble( + stock_id = test_data$stock_id, + date = test_data$date, + pred_return = predictions + ) + + return(predictions) +} + +#' Neural Network (nnet) Prediction Function +#' +#' This function trains a neural network using the nnet package and makes predictions +#' for the test data set. +#' +#' @param train_data A data frame with `stock_id`, `date`, `return_label`, and features. +#' @param test_data A data frame with `stock_id`, `date`, and features. +#' @param config A list containing parameters for the neural network, such as `size` and `decay`. +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the test data. +#' @importFrom nnet nnet +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows +#' @importFrom checkmate assert_data_frame assert_list +#' @importFrom cli cli_abort cli_alert_danger +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' nnet_pred(train_data_ex, test_data_ex, config = list(size = 10, decay = 0.01)) +#' } +nnet_pred <- function(train_data, test_data, config = list()) { + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Default parameters for neural network + default_params <- list( + size = 5, # Number of units in the hidden layer + decay = 0.1 # Weight decay (regularization) + ) + + # Ensure config using helper function + config <- ensure_config(config, default_params) + + # Handle missing values + check_missing_values(train_data) + check_missing_values(test_data) + + # Encode categorical data + full_data <- bind_rows(train_data, test_data) + full_data <- encode_categorical(full_data) + + # Split data back + train_data <- full_data$data[1:nrow(train_data), ] + return_label <- colnames(train_data)[3] + test_data <- full_data$data[(nrow(train_data) + 1):nrow(full_data$data), -which(names(full_data$data) == return_label)] + + # Prepare training data + train_features <- as.matrix(train_data[, 4:ncol(train_data)]) + train_label <- train_data[[3]] # Assuming the 3rd column is the response variable + + # Train the neural network model using nnet + model <- tryCatch( + nnet::nnet( + x = train_features, + y = train_label, + size = config$size, + decay = config$decay, + linout = TRUE, + trace = FALSE + ), + error = function(e) { + cli::cli_abort("Error training Neural Network model: {e$message}") + } + ) + + # Predict using the test data + test_features <- as.matrix(test_data[, 3:ncol(test_data)]) + predictions <- tryCatch( + predict(model, test_features), + error = function(e) { + cli::cli_abort("Error during Neural Network prediction: {e$message}") + } + ) + + # Ensure predictions are numeric vectors + predictions <- as.vector(predictions) + + # Match predictions back to stock_id and date + predictions <- tibble::tibble( + stock_id = test_data$stock_id, + date = test_data$date, + pred_return = predictions + ) + + return(predictions) +} + +#' Support Vector Machine (SVM) Prediction Function +#' +#' This function trains a Support Vector Machine (SVM) model to predict returns based on the provided +#' training data and makes predictions for the test data. +#' +#' @param train_data A data frame with `stock_id`, `date`, `return_label`, and features. +#' @param test_data A data frame with `stock_id`, `date`, and features. +#' @param config A list of SVM configuration parameters (e.g., `kernel`, `type`, `gamma`, `cost`, `epsilon`). +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the test data. +#' @importFrom e1071 svm +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows +#' @importFrom checkmate assert_data_frame assert_list +#' @importFrom cli cli_abort cli_alert_danger +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' svm_pred(train_data_ex, test_data_ex, config = list(kernel = "linear", cost = 1)) +#' } +svm_pred <- function(train_data, test_data, config = list()) { + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Default parameters for SVM + default_params <- list( + type = "eps-regression", # SVM task type (regression in this case) + kernel = "radial", # Kernel type (radial, linear, etc.) + epsilon = 0.1, # Error tolerance width + gamma = 0.5, # Constant in the radial kernel + cost = 1 # Penalty for misclassification + ) + + # Ensure config using helper function + config <- ensure_config(config, default_params) + + # Handle missing values + check_missing_values(train_data) + check_missing_values(test_data) + + # Encode categorical data + full_data <- bind_rows(train_data, test_data) + full_data <- encode_categorical(full_data) + + # Split data back + train_data <- full_data$data[1:nrow(train_data), ] + return_label <- colnames(train_data)[3] + test_data <- full_data$data[(nrow(train_data) + 1):nrow(full_data$data), -which(names(full_data$data) == return_label)] + + # Prepare training data + train_features <- as.matrix(train_data[, 4:ncol(train_data)]) + train_label <- train_data[[3]] # Assuming the 3rd column is the response variable + + # Train the SVM model using e1071::svm + model <- tryCatch( + e1071::svm( + x = train_features, + y = train_label, + type = config$type, + kernel = config$kernel, + epsilon = config$epsilon, + gamma = config$gamma, + cost = config$cost + ), + error = function(e) { + cli::cli_abort("Error training SVM model: {e$message}") + } + ) + + # Predict using the test data + test_features <- as.matrix(test_data[, 3:ncol(test_data)]) + predictions <- tryCatch( + predict(model, test_features), + error = function(e) { + cli::cli_abort("Error during SVM prediction: {e$message}") + } + ) + + # Ensure predictions are numeric vectors + predictions <- as.numeric(predictions) + + # Match predictions back to stock_id and date + predictions <- tibble::tibble( + stock_id = test_data$stock_id, + date = test_data$date, + pred_return = predictions + ) + + return(predictions) +} + +#' Caret Wrapper Prediction Function +#' +#' This function trains machine learning models using the caret package to predict returns based on the provided +#' training data and makes predictions for the test data. It supports various models with optional hyperparameter tuning. +#' +#' @param train_data A data frame containing `stock_id`, `date`, `return_label`, and feature columns. +#' @param test_data A data frame containing `stock_id`, `date`, and feature columns. +#' @param config A list containing caret configuration parameters, including `method`, `tuneGrid`, `trControl`, etc. +#' +#' @return A tibble with `stock_id`, `date`, and `pred_return` matching the `test_data`. +#' @importFrom caret train trainControl +#' @importFrom tibble tibble +#' @importFrom dplyr bind_rows select pull mutate_if +#' @importFrom checkmate assert_data_frame assert_list assert_string assert_flag +#' @importFrom cli cli_abort cli_alert_info +#' @export +#' +#' @examples +#' \dontrun{ +#' data(data_ml) +#' train_data_ex <- data_ml[1:100, c("stock_id", "date", "R1M_Usd", "Div_Yld", "Eps", +#' "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' test_data_ex <- data_ml[101:150, c("stock_id", "date", "Div_Yld", "Eps", +#' "Mkt_Cap_12M_Usd", "Mom_11M_Usd", "Ocf", "Pb", "Vol1Y_Usd")] +#' config <- list( +#' method = "glmnet", +#' tuneGrid = expand.grid(alpha = 0.5, lambda = 0.1), +#' trControl = trainControl(method = "cv", number = 5), +#' metric = "RMSE" +#' ) +#' caret_predictions <- caret_wrapper(train_data_ex, test_data_ex, config) +#' } +caret_wrapper <- function(train_data, test_data, config = list()) { + + # Load necessary packages + if (!requireNamespace("caret", quietly = TRUE)) { + stop("Package 'caret' is required but not installed.") + } + + # Input Validation + checkmate::assert_data_frame(train_data, min.rows = 1, any.missing = FALSE, .var.name = "train_data") + checkmate::assert_data_frame(test_data, min.rows = 1, any.missing = FALSE, .var.name = "test_data") + checkmate::assert_list(config, .var.name = "config") + + # Ensure 'method' is provided in config + if (!"method" %in% names(config)) { + cli::cli_abort("Please provide a 'method' for caret model training in the config list.") + } + + # Set default trControl if not provided + if (!"trControl" %in% names(config)) { + config$trControl <- caret::trainControl(method = "none") + } + + # Handle missing values + # Replace with your actual check_missing_values function + if (any(is.na(train_data)) || any(is.na(test_data))) { + cli::cli_abort("Data contains missing values. Please handle them before training.") + } + + # Encode categorical data + # Replace with your actual encode_categorical function + full_data <- dplyr::bind_rows(train_data, test_data) + encoded_data <- encode_categorical(full_data) + + # Split data back into training and testing sets + train_encoded <- encoded_data$data[1:nrow(train_data), ] + + # Assuming the third column is return_label + return_label <- colnames(train_encoded)[3] + + test_encoded <- encoded_data$data[(nrow(train_data) + 1):nrow(encoded_data$data), ] + + # Remove the return_label from test data + test_encoded <- test_encoded %>% dplyr::select(-all_of(return_label)) + + # Prepare training features and labels + feature_cols <- setdiff(colnames(train_encoded), c("stock_id", "date", return_label)) + + if (length(feature_cols) == 0) { + cli::cli_abort("No feature columns found in train_data after encoding.") + } + + train_features <- train_encoded %>% dplyr::select(all_of(feature_cols)) + train_label_vector <- train_encoded %>% dplyr::pull(all_of(return_label)) + + # Prepare test data + test_features <- test_encoded %>% dplyr::select(all_of(feature_cols)) + + # Train the model using caret + cli::cli_alert_info("Training the model using method: {config$method}") + + model <- tryCatch( + caret::train( + x = as.matrix(train_features), + y = as.numeric(train_label_vector), + method = config$method, + tuneGrid = if (!is.null(config$tuneGrid)) config$tuneGrid else NULL, + trControl = config$trControl, + metric = if (!is.null(config$metric)) config$metric else "RMSE", + preProcess = if (!is.null(config$preProcess)) config$preProcess else NULL + ), + error = function(e) { + cli::cli_abort("Error during caret model training: {e$message}") + } + ) + + cli::cli_alert_info("Model training completed. Proceeding to predictions...") + + # Predict using the test data + predictions <- tryCatch( + predict(model, newdata = as.matrix(test_features)), + error = function(e) { + cli::cli_abort("Error during model prediction: {e$message}") + } + ) + + # Ensure predictions are numeric vectors + predictions <- as.numeric(predictions) + + # Match predictions back to stock_id and date + predictions_tibble <- tibble::tibble( + stock_id = test_encoded$stock_id, + date = test_encoded$date, + pred_return = predictions + ) + + cli::cli_alert_info("Predictions generated successfully.") + + return(predictions_tibble) +} +