added descriptive plots

MediaComem · Mar 28, 2024 · 747c190 · 747c190
1 parent 4d3258a
commit 747c190
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 7 deletions.
diff --git a/analysis/descriptive.R b/analysis/descriptive.R
@@ -4,10 +4,11 @@
 # Author: Giovanni Colavizza
 
 # Set your own working directory here
-setwd("~/das-public/analysis")
+setwd("~/Dropbox/db_projects/Odoma_projects/das-public/analysis")
 
 options(scipen=999) # prevents excessive use of scientific notation
 
+require(reshape2)
 require(ggplot2)
 require(GGally)
 require(dplyr)
@@ -18,14 +19,14 @@ DATASET <- read.csv("dataset/DATASET.csv")
 summary(DATASET)
 
 # correlations
-corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "h_index_mean")], method = "pearson", use="complete.obs"), 2)
+corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "p_month", "h_index_mean")], method = "pearson", use="complete.obs"), 2)
 upper <- corr
 upper[upper.tri(corr, diag = TRUE)] <- ""
 upper <- as.data.frame(upper)
 upper
-ggpairs(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "h_index_mean")])
+ggpairs(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "p_month", "h_index_mean")])
 
-corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "h_index_mean", "Data_Shared","Repositories_data_bool","Code_Generated","Code_Shared","Preprint_Match")], method = "pearson", use="complete.obs"), 2)
+corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "p_month", "h_index_mean", "Data_Shared","Repositories_data_bool","Code_Generated","Code_Shared","Preprint_Match")], method = "pearson", use="complete.obs"), 2)
 upper <- corr
 upper[upper.tri(corr, diag = TRUE)] <- ""
 upper <- as.data.frame(upper)
@@ -36,7 +37,7 @@ qqnorm(DATASET$n_cit_tot_log)
 qex <- function(x) qexp((rank(x)-.375)/(length(x)+.25))
 plot(qex(DATASET$n_cit_tot),DATASET$n_cit_tot_log)
 
-# check value count distributions
+# check value counts
 
 mat <- stack(table(DATASET$Country)) # USE, filtering low value counts
 mat <- mat[order(mat$values), ]
@@ -97,3 +98,90 @@ tail(mat,10)
 mat <- stack(table(DATASET$journal_subfield)) # Not useful
 mat <- mat[order(mat$values), ]
 tail(mat,10)
+
+# DESCRIPTIVE PLOTS
+
+# Load necessary libraries
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+
+# 1: % of OSI over time
+#
+# For each publication year, compute the share (in %) of publications whose
+# indicator equals 1, then draw one black line per indicator, told apart by
+# line type only (black-and-white friendly).
+
+# Indicator columns, in the order they should appear in the legend.
+osi_vars <- c("Code_Shared", "Repositories_data_bool", "Preprint_Match")
+
+# Mean of a 0/1 column times 100 = percentage of 1s.
+# na.rm = TRUE keeps a single missing value from turning a whole year into NA
+# (same convention as the per-division summary further down).
+DATASET_aggregated <- DATASET %>%
+  group_by(p_year) %>%
+  summarise(
+    across(all_of(osi_vars), ~ mean(.x, na.rm = TRUE) * 100),
+    .groups = "drop"
+  )
+
+# Reshape from wide (one column per indicator) to long (one row per
+# year/indicator pair) for plotting.
+DATASET_long <- DATASET_aggregated %>%
+  pivot_longer(cols = all_of(osi_vars), names_to = "variable", values_to = "percentage")
+
+# Fix the indicator order explicitly so the legend does not fall back to
+# alphabetical order.
+DATASET_long$variable <- factor(DATASET_long$variable, levels = osi_vars)
+
+# Plotting the data. Line types are bound to indicators by name, so the
+# mapping cannot silently shift if the level order ever changes.
+ggplot(DATASET_long, aes(x = p_year, y = percentage, linetype = variable)) +
+  geom_line(color = "black") + # All lines black; only the line type varies
+  scale_linetype_manual(values = c(Code_Shared = "solid",
+                                   Repositories_data_bool = "dotted",
+                                   Preprint_Match = "twodash")) +
+  theme_minimal(base_size = 16) + # Minimal theme
+  labs(x = "Year", y = "Percentage of publications", title = "Adoption of OSI over time") +
+  theme(legend.title = element_blank()) # Remove legend title
+
+# 2: OSI by DIVISION
+
+# Recode the division_* membership flags to integer 1/0.
+# The comparison is done on the upper-cased character form so that both
+# encodings are handled: the literal strings "True"/"False" and logical
+# TRUE/FALSE. A logical TRUE compared directly with the string "True" is
+# FALSE (it is coerced to "TRUE"), which would zero out every flag.
+# NA stays NA.
+# NOTE(review): confirm how DATASET.csv encodes these flags; read.csv() may
+# already parse True/False columns as logical.
+DATASET <- DATASET %>%
+  mutate(across(starts_with("division_"),
+                ~ as.integer(toupper(as.character(.x)) == "TRUE")))
+
+# Step 1: Keep only the publications that belong to division 1
+division_1_data <- DATASET %>% filter(division_1 == 1)
+
+# Step 2: Percentage of 1s for each indicator (mean of a 0/1 column * 100),
+# reshaped to one row per indicator for plotting
+osi_vars <- c("Code_Shared", "Repositories_data_bool", "Preprint_Match")
+percentages <- division_1_data %>%
+  summarise(across(all_of(osi_vars), ~ mean(.x, na.rm = TRUE) * 100)) %>%
+  pivot_longer(cols = all_of(osi_vars), names_to = "variable", values_to = "percentage")
+
+# Step 3: One bar per indicator
+ggplot(percentages, aes(x = variable, y = percentage, fill = variable)) +
+  geom_bar(stat = "identity", position = position_dodge()) +
+  theme_minimal(base_size = 14) +
+  labs(x = "Variable", y = "Percentage of 1s", title = "Percentage of 1s in Division 1") +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+# ALL divisions
+#
+# For each of the 18 division flags, compute the percentage of publications
+# in that division with each indicator set to 1, then draw a grouped
+# black-and-white bar chart (one group of three bars per division).
+
+# Indicator columns and division columns, in the order they should be shown.
+osi_vars <- c("Code_Shared", "Repositories_data_bool", "Preprint_Match")
+division_cols <- paste0("division_", 1:18)
+
+# Build one small long-format summary per division, then bind them once
+# (instead of growing a data frame inside a loop).
+division_summaries <- lapply(division_cols, function(division_col) {
+  DATASET %>%
+    filter(.data[[division_col]] == 1) %>% # publications in this division
+    summarise(across(all_of(osi_vars), ~ mean(.x, na.rm = TRUE) * 100)) %>%
+    pivot_longer(cols = all_of(osi_vars), names_to = "variable", values_to = "percentage") %>%
+    mutate(division = division_col)
+})
+aggregated_data <- bind_rows(division_summaries) %>%
+  select(division, variable, percentage)
+
+# Ensure division is a factor with levels in numeric order (1 to 18), not in
+# alphabetical order (division_1, division_10, ...).
+aggregated_data$division <- factor(aggregated_data$division, levels = division_cols)
+
+# Fix the indicator order as well. Left as a character column, ggplot orders
+# it alphabetically (Code_Shared, Preprint_Match, Repositories_data_bool),
+# which previously caused the manually supplied legend labels to be attached
+# to the wrong bars.
+aggregated_data$variable <- factor(aggregated_data$variable, levels = osi_vars)
+
+# Plotting, ensuring black and white output and correct ordering of divisions.
+# Fill colours are bound to indicators by name and the legend uses the level
+# names themselves, so colours and labels cannot drift apart.
+ggplot(aggregated_data, aes(x = division, y = percentage, fill = variable)) +
+  geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
+  scale_fill_manual(values = c(Code_Shared = "black",
+                               Repositories_data_bool = "grey50",
+                               Preprint_Match = "grey80")) +
+  theme_minimal(base_size = 14) +
+  labs(x = "Division", y = "Percentage of publications", title = "Adoption of OSI by Division") +
+  theme(axis.text.x = element_text(angle = 65, hjust = 1), # Adjust for readability
+        legend.title = element_blank()) + # Clean legend
+  scale_x_discrete(limits = division_cols) # Keep all 18 divisions, in order
+
diff --git a/analysis/plots/Rplot.png b/analysis/plots/Rplot.png
diff --git a/analysis/plots/Rplot04.pdf b/analysis/plots/Rplot04.pdf
diff --git a/analysis/plots/Rplot05.pdf b/analysis/plots/Rplot05.pdf
diff --git a/analysis/r_models.R b/analysis/r_models.R
@@ -4,7 +4,7 @@
 # Author: Giovanni Colavizza
 
 # Set your own working directory here
-setwd("~/das-public/analysis")
+setwd("~/Dropbox/db_projects/Odoma_projects/das-public/analysis")
 
 options(scipen=999) # prevents excessive use of scientific notation
 
@@ -205,10 +205,16 @@ summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_yea
 # Control for OSI: Code Generated
 summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
                       C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Generated) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match) , data = DATASET))
+summary(m_rols <- rlm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
+                      C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Generated) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match) , data = DATASET))
+
 
 # Control for interactions
 summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
                       C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool)*C(Preprint_Match) + C(Code_Generated)*C(Code_Shared) + C(Code_Location) , data = DATASET))
+summary(m_rols <- rlm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
+                      C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool)*C(Preprint_Match) + C(Code_Generated)*C(Code_Shared) + C(Code_Location) , data = DATASET))
+
 
 # Using jitter, different shapes, and alpha blending
 ggplot(DATASET, aes(x = Repositories_data_bool, y = n_cit_tot_log, 
@@ -306,7 +312,11 @@ DATASET_2020 <- DATASET[(DATASET$p_year<2021),]
 DATASET_2021 <- DATASET[(DATASET$p_year<2022),]
 DATASET_2022 <- DATASET[(DATASET$p_year<2023),]
 
-summary(m_ols <- lm(n_cit_2_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
+summary(m_ols <- lm(n_cit_1_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
+                      C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match) 
+                    + C(division_1) + C(division_2) + C(division_3) + C(division_4) + C(division_5) + C(division_6) + C(division_7) + C(division_8) + C(division_9) + C(division_10) + C(division_11) + C(division_12) + C(division_13) + C(division_14) + C(division_15) + C(division_16) + C(division_17) + C(division_18)
+                    , data = DATASET_2022)) # change DATASET here
+summary(m_rols <- rlm(n_cit_1_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
                       C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match) 
                     + C(division_1) + C(division_2) + C(division_3) + C(division_4) + C(division_5) + C(division_6) + C(division_7) + C(division_8) + C(division_9) + C(division_10) + C(division_11) + C(division_12) + C(division_13) + C(division_14) + C(division_15) + C(division_16) + C(division_17) + C(division_18)
                     , data = DATASET_2022)) # change DATASET here
@@ -343,6 +353,9 @@ DATASET_PS <- merge(x = DATASET, y = j_freq, by = "Preprint_Server")
 summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
                       C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match)
                     + C(Preprint_Server), data = DATASET_PS))
+summary(m_rols <- rlm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) + 
+                      C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match)
+                    + C(Preprint_Server), data = DATASET_PS))
 
 ######################
 # More models checks #