Skip to content

Commit

Permalink
added descriptive plots
Browse files Browse the repository at this point in the history
  • Loading branch information
Giovanni1085 committed Mar 28, 2024
1 parent 4d3258a commit 747c190
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 7 deletions.
98 changes: 93 additions & 5 deletions analysis/descriptive.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
# Author: Giovanni Colavizza

# Set your own working directory here
setwd("~/das-public/analysis")
setwd("~/Dropbox/db_projects/Odoma_projects/das-public/analysis")

options(scipen=999) # prevents excessive use of scientific notation

require(reshape2)
require(ggplot2)
require(GGally)
require(dplyr)
Expand All @@ -18,14 +19,14 @@ DATASET <- read.csv("dataset/DATASET.csv")
summary(DATASET)

# correlations
corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "h_index_mean")], method = "pearson", use="complete.obs"), 2)
corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "p_month", "h_index_mean")], method = "pearson", use="complete.obs"), 2)
upper <- corr
upper[upper.tri(corr, diag = TRUE)] <- ""
upper <- as.data.frame(upper)
upper
ggpairs(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "h_index_mean")])
ggpairs(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "p_month", "h_index_mean")])

corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "h_index_mean", "Data_Shared","Repositories_data_bool","Code_Generated","Code_Shared","Preprint_Match")], method = "pearson", use="complete.obs"), 2)
corr <- round(cor(DATASET[, c("n_cit_tot", "n_cit_2", "n_authors", "n_references_tot", "p_year", "p_month", "h_index_mean", "Data_Shared","Repositories_data_bool","Code_Generated","Code_Shared","Preprint_Match")], method = "pearson", use="complete.obs"), 2)
upper <- corr
upper[upper.tri(corr, diag = TRUE)] <- ""
upper <- as.data.frame(upper)
Expand All @@ -36,7 +37,7 @@ qqnorm(DATASET$n_cit_tot_log)
# Exponential scores for x: each value's rank is turned into a plotting
# position (rank - 0.375) / (n + 0.25), which is then passed through the
# standard exponential quantile function.
qex <- function(x) {
  n_obs <- length(x)
  plotting_position <- (rank(x) - .375) / (n_obs + .25)
  qexp(plotting_position)
}
plot(qex(DATASET$n_cit_tot),DATASET$n_cit_tot_log)

# check value count distributions
# check value counts

mat <- stack(table(DATASET$Country)) # USE, filtering low value counts
mat <- mat[order(mat$values), ]
Expand Down Expand Up @@ -97,3 +98,90 @@ tail(mat,10)
mat <- stack(table(DATASET$journal_subfield)) # Not useful
mat <- mat[order(mat$values), ]
tail(mat,10)

# DESCRIPTIVE PLOTS

# Load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)

# 1: % of OSI over time

# Per publication year, take the mean of each OSI column and scale it by 100
# (i.e. the percentage of 1s, given the columns are coded 0/1).
DATASET_aggregated <- summarise(
  group_by(DATASET, p_year),
  across(
    c(Code_Shared, Repositories_data_bool, Preprint_Match),
    function(flag) mean(flag) * 100
  )
)

# Wide -> long: one row per (p_year, variable) pair holding its percentage.
# melt() keeps the measured columns as factor levels in column order, which is
# the order the manual line types below are assigned in.
DATASET_long <- reshape2::melt(
  DATASET_aggregated,
  id.vars = "p_year",
  variable.name = "variable",
  value.name = "percentage"
)

# One black line per variable; the variables are told apart by line type only.
ggplot(
  DATASET_long,
  aes(x = p_year, y = percentage, linetype = variable, color = variable)
) +
  geom_line() +
  scale_color_manual(values = rep("black", 3)) +
  scale_linetype_manual(values = c("solid", "dotted", "twodash")) +
  theme_minimal(base_size = 16) +
  labs(
    x = "Year",
    y = "Percentage of publications",
    title = "Adoption of OSI over time"
  ) +
  theme(legend.title = element_blank()) # no legend title

# 2: OSI by DIVISION

# Recode one division_* indicator column to integer 0/1.
#
# The original recode compared every column against the string "True". That
# only works while the column is still character/factor: a logical column
# (TRUE is coerced to "TRUE", which never equals "True") or a column that was
# already recoded to 0/1 (e.g. when this section is re-run in the same
# session) would silently become all zeros. Logical and numeric input is
# therefore converted directly; anything else keeps the original comparison,
# so "True" -> 1, other strings -> 0 and NA stays NA.
to_division_flag <- function(x) {
  if (is.logical(x) || is.numeric(x)) {
    return(as.integer(x))
  }
  as.integer(x == "True")
}

# Apply the recode to every column whose name starts with "division_"
DATASET <- DATASET %>%
  mutate(across(starts_with("division_"), to_division_flag))

# Step 1: keep only the rows flagged as belonging to division_1
division_1_data <- DATASET %>%
  filter(division_1 == 1)

# Step 2: percentage of 1s per OSI column (NAs ignored), reshaped to long
# format: one row per variable with its percentage
percentages <- division_1_data %>%
  summarise(across(
    c(Code_Shared, Repositories_data_bool, Preprint_Match),
    ~ mean(.x, na.rm = TRUE) * 100
  )) %>%
  pivot_longer(
    cols = c(Code_Shared, Repositories_data_bool, Preprint_Match),
    names_to = "variable",
    values_to = "percentage"
  )

# Step 3: one bar per OSI variable
ggplot(percentages, aes(x = variable, y = percentage, fill = variable)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal(base_size = 14) +
  labs(
    x = "Variable",
    y = "Percentage of 1s",
    title = "Percentage of 1s in Division 1"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ALL divisions

# Names of the 18 division indicator columns, in plotting order
division_cols <- paste0("division_", 1:18)

# OSI variables in the order they should appear in the bars and the legend
osi_vars <- c("Code_Shared", "Repositories_data_bool", "Preprint_Match")

# For each division: keep the rows where that division's column equals 1,
# compute the percentage of 1s for every OSI variable, and reshape to long
# format (one row per division/variable pair). The per-division results are
# built as a list and bound once instead of growing a data frame in a loop.
aggregated_data <- bind_rows(lapply(division_cols, function(division_col) {
  DATASET %>%
    filter(.data[[division_col]] == 1) %>%
    summarise(
      Code_Shared = mean(Code_Shared) * 100,
      Repositories_data_bool = mean(Repositories_data_bool) * 100,
      Preprint_Match = mean(Preprint_Match) * 100
    ) %>%
    pivot_longer(
      cols = c(Code_Shared, Repositories_data_bool, Preprint_Match),
      names_to = "variable",
      values_to = "percentage"
    ) %>%
    mutate(division = division_col, .before = 1)
}))

# Fix the order of divisions (1 to 18) on the x axis
aggregated_data$division <- factor(aggregated_data$division,
                                   levels = division_cols)

# Fix the order of the OSI variables. As a plain character column the fill
# scale would sort them alphabetically (Code_Shared, Preprint_Match,
# Repositories_data_bool), so positional legend labels would be attached to
# the wrong series.
aggregated_data$variable <- factor(aggregated_data$variable, levels = osi_vars)

# Black-and-white grouped bar chart. Colours are matched to the variables by
# name and the legend labels follow `breaks`, so each label is guaranteed to
# describe its own series.
ggplot(aggregated_data, aes(x = division, y = percentage, fill = variable)) +
  geom_bar(stat = "identity", position = position_dodge(), width = 0.7) +
  scale_fill_manual(
    values = c(
      Code_Shared = "black",
      Repositories_data_bool = "grey50",
      Preprint_Match = "grey80"
    ),
    breaks = osi_vars,
    labels = c("Code_Shared", "Repository_data_bool", "Preprint_Match")
  ) +
  theme_minimal(base_size = 14) +
  labs(
    x = "Division",
    y = "Percentage of publications",
    title = "Adoption of OSI by Division"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, hjust = 1), # readable tick labels
    legend.title = element_blank()                     # no legend title
  ) +
  scale_x_discrete(limits = division_cols) # keep all 18 divisions, in order

Binary file modified analysis/plots/Rplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/plots/Rplot04.pdf
Binary file not shown.
Binary file added analysis/plots/Rplot05.pdf
Binary file not shown.
17 changes: 15 additions & 2 deletions analysis/r_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Author: Giovanni Colavizza

# Set your own working directory here
setwd("~/das-public/analysis")
setwd("~/Dropbox/db_projects/Odoma_projects/das-public/analysis")

options(scipen=999) # prevents excessive use of scientific notation

Expand Down Expand Up @@ -205,10 +205,16 @@ summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_yea
# Control for OSI: Code Generated
summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Generated) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match) , data = DATASET))
summary(m_rols <- rlm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Generated) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match) , data = DATASET))


# Control for interactions
summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool)*C(Preprint_Match) + C(Code_Generated)*C(Code_Shared) + C(Code_Location) , data = DATASET))
summary(m_rols <- rlm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool)*C(Preprint_Match) + C(Code_Generated)*C(Code_Shared) + C(Code_Location) , data = DATASET))


# Using jitter, different shapes, and alpha blending
ggplot(DATASET, aes(x = Repositories_data_bool, y = n_cit_tot_log,
Expand Down Expand Up @@ -306,7 +312,11 @@ DATASET_2020 <- DATASET[(DATASET$p_year<2021),]
DATASET_2021 <- DATASET[(DATASET$p_year<2022),]
DATASET_2022 <- DATASET[(DATASET$p_year<2023),]

summary(m_ols <- lm(n_cit_2_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
summary(m_ols <- lm(n_cit_1_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match)
+ C(division_1) + C(division_2) + C(division_3) + C(division_4) + C(division_5) + C(division_6) + C(division_7) + C(division_8) + C(division_9) + C(division_10) + C(division_11) + C(division_12) + C(division_13) + C(division_14) + C(division_15) + C(division_16) + C(division_17) + C(division_18)
, data = DATASET_2022)) # change DATASET here
summary(m_rols <- rlm(n_cit_1_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match)
+ C(division_1) + C(division_2) + C(division_3) + C(division_4) + C(division_5) + C(division_6) + C(division_7) + C(division_8) + C(division_9) + C(division_10) + C(division_11) + C(division_12) + C(division_13) + C(division_14) + C(division_15) + C(division_16) + C(division_17) + C(division_18)
, data = DATASET_2022)) # change DATASET here
Expand Down Expand Up @@ -343,6 +353,9 @@ DATASET_PS <- merge(x = DATASET, y = j_freq, by = "Preprint_Server")
summary(m_ols <- lm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match)
+ C(Preprint_Server), data = DATASET_PS))
summary(m_rols <- rlm(n_cit_tot_log ~ n_authors_log + n_references_tot_log + p_year + p_month + h_index_mean_log + C(is_plos) + C(is_plos_one) +
C(Data_Shared) + C(Data_Location) + C(Repositories_data_bool) + C(Code_Shared) + C(Code_Location) + C(Preprint_Match)
+ C(Preprint_Server), data = DATASET_PS))

######################
# More models checks #
Expand Down

0 comments on commit 747c190

Please sign in to comment.