modelling.rmd

---
title: "Analysis & Modelling"
output:
  html_document: default
  pdf_document: default
---

```{r setup, include=FALSE}
# Document-wide knitr default: echo each chunk's code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
# NOTE: attach order matters here. A package attached later masks same-named
# functions from packages attached earlier.
# dplyr is attached after MASS, so the bare `select()` calls further down
# resolve to dplyr::select rather than MASS::select.
library(MASS)
library(dplyr)
library(rsq)
library(here)
library(corrplot)
library(Hmisc)
library(rcompanion)
library(pscl)
library(olsrr)
# psych is attached last, so `describe(..., trim = )` resolves to
# psych::describe rather than Hmisc::describe.
library (psych)
# Loads the saved workspace from the project root. The chunks below expect it
# to provide a data frame named `data` -- TODO confirm against data.RData.
load(here('data.RData')) # adjust according to your naming scheme
```

# Descriptive Statistics

## Dependent variable `retweet_count`

```{r}
# Summary statistics of the retweet count in the full sample;
# `trim` is the fraction used for the trimmed mean.
describe(x = data[, "retweet_count"], trim = 0.2)
```

## Dependent variable `retweet_timelag` 

Applying a 24-hour restriction (86400 sec) to tweets that were retweeted at least once:

```{r}

# Restricted sample: tweets with at least one retweet and a retweet time lag
# below 24 hours (86400 seconds).
# `which()` converts the logical mask to row indices, so rows where the
# condition evaluates to NA are dropped. Plain logical indexing would return
# them as all-NA rows and inflate the sample size reported by dim().
within_24h <- data$retweet_timelag < 86400 & data$retweet_count > 0
data_subset_retweeted_24h <- data[which(within_24h), ]
dim(data_subset_retweeted_24h) # first element is N of the restricted sample
```
  
Thus, with `retweet_timelag < 86400`, we have N=3645. 

```{r}
# Summary statistics of the time lag (in seconds) for the restricted sample;
# `trim` is the fraction used for the trimmed mean.
describe(data_subset_retweeted_24h[, c("retweet_timelag")], trim = 0.2)
# `trim` belongs to describe() only. boxplot() has no such argument and would
# forward it as a graphical parameter, which only produces warnings.
boxplot(data_subset_retweeted_24h[, c("retweet_timelag")])
```

  
Convert `retweet_timelag`, originally measured in seconds, into minutes, as in S&D-X (2013).

```{r}
# Add the time lag expressed in minutes (seconds / 60) as a new column.
data_subset_retweeted_24h[["retweet_timelag_min"]] <-
  data_subset_retweeted_24h[["retweet_timelag"]] / 60
```
  
### Descriptive statistics 

```{r}
# Summary statistics of the time lag in minutes for the restricted sample;
# `trim` is the fraction used for the trimmed mean.
describe(data_subset_retweeted_24h[, c("retweet_timelag_min")], trim = 0.2)
# `trim` belongs to describe() only. boxplot() has no such argument and would
# forward it as a graphical parameter, which only produces warnings.
boxplot(data_subset_retweeted_24h[, c("retweet_timelag_min")])
```
  

```{r,  fig.height = 3, fig.width = 5, fig.align = "center"}
# Density-scaled histogram (freq = FALSE) of the time lag in minutes.
# A single number for `breaks` is a suggested bin count, not an exact one.
hist(
  data_subset_retweeted_24h$retweet_timelag_min,
  breaks = 5,
  freq = FALSE,
  col = "darkmagenta",
  main = "Time to 1st retweet",
  xlab = "Time lag in minutes"
)
```

The density histogram shows that most tweet dissemination happens within 500 min (8 hours 20 min).


## Independent variables (full sample, N=9894)

```{r}
# Columns summarised for the full sample, in the order they are reported.
described_columns <- c(
  "sentiment",
  "hashtag_count",
  "url_existence",
  "user_activity",
  "author_follower_count",
  "total_liwc_sentiment",
  "retweet_timelag"
)
# `trim` is the fraction used for the trimmed mean.
describe(data[, described_columns], trim = 0.2)
```

## Correlation Matrix of Independent Variables  (full sample, N=9894)
```{r}
# Independent variables of the full sample, in the order shown in the matrix.
used_subset <- data %>%
  select(
    sentiment,
    total_liwc_sentiment,
    hashtag_count,
    url_existence,
    user_activity,
    author_follower_count
  )
# Lower triangle of the correlation matrix, drawn as numbers.
corrplot(cor(used_subset), type = "lower", method = "number")
# Pearson correlations together with pair counts and p-values.
Pearson <- rcorr(as.matrix(used_subset), type = "pearson")
Pearson
```

The correlation matrix of the independent variables shows no high correlations; we therefore assume that multicollinearity is not an issue in our data (N=9894).


## Correlation Matrix of Independent Variables  (restricted sample, N=3645)

```{r}
# Independent variables of the 24-hour restricted sample, in the order shown
# in the matrix. This overwrites `used_subset` from the full-sample chunk.
used_subset <- data_subset_retweeted_24h %>%
  select(
    sentiment,
    total_liwc_sentiment,
    hashtag_count,
    url_existence,
    user_activity,
    author_follower_count
  )
# Lower triangle of the correlation matrix, drawn as numbers.
corrplot(cor(used_subset), type = "lower", method = "number")
# Pearson correlations together with pair counts and p-values.
Pearson <- rcorr(as.matrix(used_subset), type = "pearson")
Pearson
```

The correlation matrix of the independent variables shows no high correlations; we therefore assume that multicollinearity is not an issue in our data (N=3645).

# Methodological Replication With Sentiment Analysis in SentiStrength (3.1.1.)

## H1

```{r H1 SentiStrength}
# H1 (SentiStrength): negative-binomial regression of the retweet count on
# sentiment plus controls; user activity and follower count enter as logs.
h1 <- glm.nb(
  retweet_count ~
    sentiment +
    hashtag_count +
    url_existence +
    log(user_activity) +
    log(author_follower_count),
  data = data
)
summary(h1)
nagelkerke(h1) # McFadden, Cox and Snell, and Nagelkerke pseudo R-squared
exp(coef(h1)) # exponentiated coefficients as effect sizes
odTest(h1) # overdispersion test: negative binomial vs. Poisson
```

A negative-binomial model is justified over a Poisson model, i.e. the count data are overdispersed.

## H2

```{r H2 SentiStrength}
# H2 (SentiStrength): OLS regression of the log time lag (minutes) on
# sentiment plus controls, fitted on the 24-hour restricted sample.
h2 <- lm(
  log(retweet_timelag_min) ~
    sentiment +
    hashtag_count +
    url_existence +
    log(author_follower_count) +
    log(user_activity),
  data = data_subset_retweeted_24h
)
summary(h2)
```

## H3A

```{r H3A SentiStrength}
# H3A (SentiStrength): the H1 count model extended by `is_Negative` and
# `interaction_term` (presumably the sentiment-by-negativity interaction;
# confirm against the data preparation step).
h3a <- glm.nb(
  retweet_count ~
    sentiment +
    is_Negative +
    interaction_term +
    hashtag_count +
    url_existence +
    log(user_activity) +
    log(author_follower_count),
  data = data
)
summary(h3a)
nagelkerke(h3a) # McFadden, Cox and Snell, and Nagelkerke pseudo R-squared
exp(coef(h3a)) # exponentiated coefficients as effect sizes
```

## H3B

```{r H3B SentiStrength}
# H3B (SentiStrength): the H2 time-lag model extended by `is_Negative` and
# `interaction_term`, fitted on the 24-hour restricted sample.
h3b <- lm(
  log(retweet_timelag_min) ~
    sentiment +
    is_Negative +
    interaction_term +
    hashtag_count +
    url_existence +
    log(author_follower_count) +
    log(user_activity),
  data = data_subset_retweeted_24h
)
summary(h3b)
```

# Conceptual Replication With Sentiment Analysis in LIWC

## H1

```{r H1LIWC}
# H1 (LIWC): negative-binomial regression of the retweet count on the LIWC
# sentiment score plus controls; user activity and follower count enter as logs.
h1_liwc <- glm.nb(
  retweet_count ~
    total_liwc_sentiment +
    hashtag_count +
    url_existence +
    log(user_activity) +
    log(author_follower_count),
  data = data
)
summary(h1_liwc)
nagelkerke(h1_liwc) # McFadden, Cox and Snell, and Nagelkerke pseudo R-squared
exp(coef(h1_liwc)) # exponentiated coefficients as effect sizes
```

## H2

```{r H2LIWC}
# H2 (LIWC): OLS regression of the log time lag (minutes) on the LIWC
# sentiment score plus controls, fitted on the 24-hour restricted sample.
h2_liwc <- lm(
  log(retweet_timelag_min) ~
    total_liwc_sentiment +
    hashtag_count +
    url_existence +
    log(author_follower_count) +
    log(user_activity),
  data = data_subset_retweeted_24h
)
summary(h2_liwc)
```

## H3A

```{r H3ALIWC}
# H3A (LIWC): the LIWC count model extended by `liwc_isNegative` and
# `liwc_interaction_term` (presumably the sentiment-by-negativity interaction;
# confirm against the data preparation step).
h3a_liwc <- glm.nb(
  retweet_count ~
    total_liwc_sentiment +
    liwc_isNegative +
    liwc_interaction_term +
    hashtag_count +
    url_existence +
    log(user_activity) +
    log(author_follower_count),
  data = data
)
summary(h3a_liwc)
nagelkerke(h3a_liwc) # McFadden, Cox and Snell, and Nagelkerke pseudo R-squared
exp(coef(h3a_liwc)) # exponentiated coefficients as effect sizes
```

## H3B
```{r H3BLIWC}
# H3B (LIWC): the LIWC time-lag model extended by `liwc_isNegative` and
# `liwc_interaction_term`, fitted on the 24-hour restricted sample.
h3b_liwc <- lm(
  log(retweet_timelag_min) ~
    total_liwc_sentiment +
    liwc_isNegative +
    liwc_interaction_term +
    hashtag_count +
    url_existence +
    log(author_follower_count) +
    log(user_activity),
  data = data_subset_retweeted_24h
)
summary(h3b_liwc)
```