# SDM_BGE_Pipeline.R

# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into this run (dot-prefixed objects included).
rm(list = ls(all.names = TRUE))
# Relative paths used further down (e.g. "GISDATA/...", "Output/...") are
# resolved against this directory.
setwd("D:/R")
#save(list=ls(all=TRUE), file="D:/R/SDM.RData") # save RDATA for later use
#load("D:/R/SDM.RData")

#### Load required libraries ####
library(rgbif)
library(raster) 
library(sp)
library(mapr)
library(dismo)
library(rgeos)


#### Define extent and import study area shapefile ####
# Longitude/latitude WGS84 coordinate reference system that is assigned to the
# spatial objects built in this script.
P4S.latlon <- CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")

# Read the study-area polygons; the path is relative to the working directory.
countries <- rgdal::readOGR("GISDATA/Study Area SHP")
#dev.off()
# Label the layer with the WGS84 CRS defined above.
# NOTE(review): this assigns the CRS; it is not a re-projection -- confirm the
# shapefile coordinates really are longitude/latitude.
proj4string(countries) <- P4S.latlon

# Visual and structural checks of the imported layer
plot(countries)
class(countries)
str(countries)
countries@bbox

##BUFFER start
# Exploratory comparison of several ways to buffer / dissolve the study area.
# NOTE(review): the objects built here (countriesBuffer, countriesPolygons,
# countriesMerged, countries_buf) are not referenced again in the visible part
# of this script.

#2nd way
#Error: TopologyException: unable to assign free hole to a shell at 12.46467256644139 43.895551918479619
# The dissolving buffer is the call the error above was recorded for, so it is
# wrapped: a geometry failure is reported and the script carries on instead of
# halting here.
countriesBuffer <- tryCatch(
  buffer(countries, width = 5, dissolve = TRUE), # 5 Degree buffer
  error = function(e) {
    message("Dissolved buffer failed: ", conditionMessage(e))
    NULL
  }
)
if (!is.null(countriesBuffer)) {
  plot(countriesBuffer, add = TRUE, col = 'red')
}
plot(countries, add = TRUE)
class(countriesBuffer)

#1st way:
countriesBuffer <- buffer(countries, width = 3, dissolve = FALSE)
plot(countriesBuffer, add = TRUE, col = 'red')
#BUFFERED EXTENT IS THE SAME WITH THE ORIGINAL EXTENT??
# NOTE(review): check which unit `width` is interpreted in for this layer
# before comparing the two extents.
extent(countriesBuffer)
extent(countries)

plot(countries, add = TRUE)
plot(countriesBuffer, col = 'darkred', main = 'b) 30 m buffer', lty = 0)
plot(countries, col = 'darkred', main = 'b) 30 m buffer', lty = 0)

# Geometry-only copy of the study area (attribute table dropped)
countriesPolygons <- SpatialPolygons(countries@polygons)
class(countries@polygons)
class(countriesPolygons)
str(countriesPolygons)

# gUnaryUnion() needs a Spatial* object. The original passed the bare list in
# countries@polygons, which produced
# "Invalid geometry, may only be applied to polygons".
countriesMerged <- gUnaryUnion(countries)
plot(countriesPolygons)

#??
# NOTE(review): gBuffer() works in the layer's coordinate units; with a
# longitude/latitude layer a width of 1000 would be 1000 degrees -- confirm the
# intended distance (or project the layer to a metric CRS first).
countries_buf <- gBuffer(countries, width = 1000, quadsegs = 10)
plot(countries_buf)

extent(countries)
extent(countriesBuffer)
extent(countries_buf)
#Buffer end.

#Define extent
# Study-area bounding box given as (xmin, xmax, ymin, ymax).
# NOTE: the variable has the same name as raster::extent(); calls such as
# extent(pol) further down still reach the function because R skips
# non-function objects when it resolves a call.
extent <- extent(-43, 109, 25, 82) ##NEED TO BE REARRANGE FOR EAST SIDE

#### 1. GBIF DATA ####
# Resolve the species name to its GBIF backbone key
key <- name_backbone(name = "Oeneis jutta")$speciesKey
if (length(key) != 1L || is.na(key)) {
  stop("No unique GBIF species key found for 'Oeneis jutta'", call. = FALSE)
}

# Download occurrences for that key. **occ_data vs occ_Search
# Only georeferenced records are requested, so the record limit is not spent
# on rows that cannot be mapped.
dat <- occ_data(taxonKey = key, limit = 1000, hasCoordinate = TRUE) #limit??
names(dat)
names(dat$data)

# Occurrence table
dat.data <- dat$data
if (is.null(dat.data) || nrow(dat.data) == 0L) {
  stop("GBIF returned no occurrence records for key ", key, call. = FALSE)
}
dat.data

#plot occurences
map_plot(dat.data, lon = "decimalLongitude", lat = "decimalLatitude", size = 1, pch = 3)

# Keep only the records that fall inside the study-area bounding box.
# All four limits are taken from `extent`; the original compared latitude with
# the longitude maximum and applied no upper longitude / lower latitude limit.
# Missing coordinates are excluded explicitly so that no NA rows are created.
in.extent <- !is.na(dat.data$decimalLongitude) &
  !is.na(dat.data$decimalLatitude) &
  dat.data$decimalLongitude >= xmin(extent) &
  dat.data$decimalLongitude <= xmax(extent) &
  dat.data$decimalLatitude >= ymin(extent) &
  dat.data$decimalLatitude <= ymax(extent)
dat.extent <- dat.data[in.extent, ]

#plot occurrences data and the study area
plot(countries)
points(dat.extent$decimalLongitude, dat.extent$decimalLatitude, pch = 19, col = "red")

#Prepare the data
# Converted to a plain data.frame so that the sp assignment functions used
# below receive a base data.frame rather than the table class returned by rgbif.
fe.gbif <- as.data.frame(dat.extent[, c('species', 'decimalLongitude', 'decimalLatitude')])
head(fe.gbif); dim(fe.gbif)
# Drop exact duplicates (same species and same coordinates)
duplicates <- duplicated(fe.gbif)
fe.gbif <- fe.gbif[!duplicates, ]
head(fe.gbif); dim(fe.gbif)
#write.csv(fe.gbif, 'Oeneis_jutta.csv', row.names=F)

# Same records with short coordinate names (species, lon, lat)
fe.gbif.maxent <- fe.gbif[, c("species", "decimalLongitude", "decimalLatitude")]
names(fe.gbif.maxent) <- c("species", "lon", "lat")
head(fe.gbif.maxent); dim(fe.gbif.maxent)
plot(countries)
points(fe.gbif.maxent$lon, fe.gbif.maxent$lat, pch = 19, col = 'red')
#write.csv(fe.gbif.maxent, 'Output/fe.gbif.maxent.csv', row.names=F) # write species points file

# Plain-table copy kept for the biomod2 section
fe.gbif.bmod2 <- fe.gbif.maxent

#assign the fe.gbif as SpatialPointsDataFrame
coordinates(fe.gbif) <- ~decimalLongitude+decimalLatitude
str(fe.gbif)
proj4string(fe.gbif) <- P4S.latlon
plot(countries)
plot(fe.gbif, col = 'red', add = TRUE, pch = 16)


### 2. Compile climate data ####
# WORLDCLIM DATA #

# One root folder for all climate files, so that the layers downloaded by
# getData() are the same files that are listed and cropped below. (The original
# downloaded to "RDATA" under the working directory but read from this folder.)
clim.root <- "D:/Github/BGE-SDM/RDATA"
clim.raw.dir <- file.path(clim.root, "wc5")
clim.clip.dir <- file.path(clim.root, "clipped")
# writeRaster() needs the output folder to exist
dir.create(clim.clip.dir, recursive = TRUE, showWarnings = FALSE)

# Download precipitation/Bio/+++ data from WorldClim
global.clim <- getData("worldclim", var = "bio", res = 5, download = TRUE, path = clim.root)

files.present.bil <- list.files(clim.raw.dir, pattern = "[.]bil$", full.names = TRUE) # alternatives for pattern (c|C)(e|E)(l|L)$
files.present.bil
if (length(files.present.bil) == 0L) {
  stop("No .bil layers found in ", clim.raw.dir, call. = FALSE)
}

# Crop every layer to the study extent and write it as ascii, prefixing the
# original file name with "fe_buffer_".
for (bil.file in files.present.bil) {
  clim.layer <- crop(raster(bil.file), extent)
  writeRaster(clim.layer,
              filename  = file.path(clim.clip.dir, paste0("fe_buffer_", basename(bil.file))),
              format    = 'ascii',
              NAflag    = -9999,
              overwrite = TRUE)
}

# read worldclim
files.present <- list.files(clim.clip.dir, pattern = "[.]asc$", full.names = TRUE) # alternatives for pattern (c|C)(e|E)(l|L)$
files.present
if (length(files.present) == 0L) {
  stop("No cropped .asc layers found in ", clim.clip.dir, call. = FALSE)
}
present.stack <- stack(files.present)
head(present.stack)

# Same data as a gridded spatial data frame: one row per raster cell
present.df <- as.data.frame(present.stack, xy = TRUE)
coordinates(present.df) <- ~x+y
gridded(present.df) <- TRUE
proj4string(present.df) <- P4S.latlon
present.df$grid.index <- present.df@grid.index # Add grid.index value
head(present.df)

# Annual Mean Temperature layer. It is looked up by pattern because the layer
# name follows the downloaded file name, which may be "bio1" or "bio01".
bio01.layer <- grep("bio0?1$", names(present.df), value = TRUE)
if (length(bio01.layer) == 0L) {
  stop("No annual mean temperature layer (bio1) found in present.df", call. = FALSE)
}
image(present.df, bio01.layer[1]) #Annual Mean Temperature
str(present.df)
class(present.df)
# Only the first coordinates are printed; the full matrix has one row per cell
head(present.df@coords)

### 3. Get abiotic bioclim data ####
str(fe.gbif); str(present.df)
# Climate values (and grid.index) of the cell each occurrence point falls in
fe.gbif.abiotic <- over(fe.gbif, present.df) # Get climate variables + grid.index
str(fe.gbif.abiotic)
fe.gbif.abiotic
class(present.df)
head(fe.gbif.abiotic)
dim(fe.gbif.abiotic); dim(fe.gbif)
names(fe.gbif)
head(fe.gbif); str(fe.gbif)

# Link species col and climate data. The attribute table is extended directly,
# so the result does not depend on how cbind() dispatches for a Spatial object
# combined with a plain data.frame.
stopifnot(nrow(fe.gbif.abiotic) == nrow(fe.gbif@data))
fe.gbif@data <- cbind(fe.gbif@data, fe.gbif.abiotic)
names(fe.gbif)
head(fe.gbif)

# Keep one record per species and grid cell
duplicates <- duplicated(fe.gbif@data[, c("species", "grid.index")]) # Duplicates on grid.index
str(fe.gbif@data)

fe.gbif <- fe.gbif[!duplicates, ]
names(fe.gbif)
table(duplicates)
summary(fe.gbif)

# Back to a plain data.frame WITH the coordinate columns. The original kept
# only the attribute table, which no longer holds decimalLongitude /
# decimalLatitude, although both are used for plotting right below.
fe.gbif <- cbind(as.data.frame(coordinates(fe.gbif)), fe.gbif@data)
fe.gbif <- na.omit(fe.gbif)
head(fe.gbif); dim(fe.gbif)

# Layers are looked up by pattern because the layer name follows the downloaded
# file name, which may or may not be zero-padded ("bio1" vs "bio01").
bio01.layer <- grep("bio0?1$", names(fe.gbif), value = TRUE)[1]
bio12.layer <- grep("bio12$", names(fe.gbif), value = TRUE)[1]
if (is.na(bio01.layer) || is.na(bio12.layer)) {
  stop("bio1 and/or bio12 columns not found in fe.gbif", call. = FALSE)
}

plot(raster(present.df, bio01.layer)) #Mean annual Temperature
points(fe.gbif$decimalLongitude, fe.gbif$decimalLatitude)

boxplot(fe.gbif[[bio01.layer]], main = "Mean annual Temperature", ylab="Temperature x 10 (in °C)")
boxplot(fe.gbif[[bio12.layer]], main = "Annual precipitation", ylab="Precipitation (in mm)")


### 4. Run Maxent model with bioclim data in 500 km buffered area around presences to balance prevalence ####

# climate has priority over soil - hierarchical model, 1 climate, 2 soil
# Boucher-Lalonde, V., A. Morin and D. J. Currie (2012). "How are tree species distributed in climatic space? A simple and general pattern." Global Ecology and Biogeography 21(12): 1157-1166.

### Create empty mask layer
# The mask is derived from the first climate layer: cells that hold a value
# become 1, cells without a value become NA.

mask <- raster(files.present[1])
#dev.off()
plot(mask)
mask <- !is.na(mask) # all values to 1
mask[mask == 0] <- NA # zero values to NA
plot(mask)
summary(mask)
# writeRaster() needs the output folder to exist
dir.create("Output", showWarnings = FALSE)
writeRaster(mask, filename  = "Output/mask.asc", format = 'ascii', NAflag = -9999, overwrite = TRUE)
mask <- raster('Output/mask.asc')
plot(mask, col='red')

# add mask to present.df
# The cell values are added as an ordinary numeric column; the original
# assigned a whole data.frame, which nests a data.frame inside the column.
mask.values <- getValues(mask)
stopifnot(length(mask.values) == nrow(present.df@data))
present.df@data$mask <- mask.values # Add mask layer to spdf
head(present.df); dim(present.df)
head(fe.gbif)

# Turn the occurrence table into spatial points and build a 500 km buffer
coordinates(fe.gbif.maxent) <- ~lon+lat
proj4string(fe.gbif.maxent) <- P4S.latlon
plot(countries)
points(fe.gbif.maxent, pch = 19, cex = 0.5, col = 'red')

# Circles with d = 500000 around every occurrence point (500 km)
x <- circles(fe.gbif.maxent, d = 500000, lonlat = TRUE)
# Dissolve the circle polygons into one geometry
pol <- gUnaryUnion(x@polygons)
extent(pol)
plot(pol, col = 'blue', add = TRUE)
points(fe.gbif.maxent, pch = 19, cex = 0.5, col = 'red')


# Cell numbers (and mask values) of the mask cells covered by the circles;
# extract() returns a list
v <- extract(mask, x@polygons, cellnumbers = TRUE)
str(v)
# Stack the list elements into a single matrix
v <- do.call(rbind, v)
head(v)
dim(v)

# Drop rows with an NA mask value (ocean cells) and repeated cells
v <- unique(na.omit(v))
head(v)
dim(v)

# Buffer mask for display: start from an all-NA copy of the mask and set the
# selected cells (first column of v holds the cell numbers) to 1
m <- mask
m[] <- NA
m[as.vector(v[, 1])] <- 1
plot(m, col = 'purple')
extent(m)
str(m)
summary(m)
plot(m, ext = extent(x@polygons) + 1, col = 'blue')
plot(x@polygons, add = TRUE)
points(fe.gbif.maxent, pch = 19, cex = 0.5, col = 'red')
plot(countries, add = TRUE)
str(m) # rasterlayer

# Save the buffered-area mask as ascii
writeRaster(m, filename = "D:/Github/BGE-SDM/Output/mask.500km.asc", format = 'ascii', NAflag = -9999, overwrite = TRUE)

### Add mask.buffer to present.df
head(present.df); dim(present.df)
# present.df <- subset(present.df, select = -c(mask)) # remove mask column

present.species.df <- present.df # copy present.df
mask.buffer.df <- as.data.frame(stack('D:/Github/BGE-SDM/Output/mask.500km.asc'), xy = TRUE) # read mask layer

names(mask.buffer.df)
names(mask.buffer.df)[3]
names(mask.buffer.df)[3] <- "mask.500km"

head(mask.buffer.df); dim(mask.buffer.df); colSums(mask.buffer.df, na.rm = TRUE, dims = 1)
head(present.species.df); str(present.species.df)
dim(present.species.df@data)
dim(mask.buffer.df)

# The mask values are matched to the climate cells by row position, so both
# tables must have the same number of rows.
stopifnot(nrow(mask.buffer.df) == nrow(present.species.df@data))
present.species.df@data$mask <- mask.buffer.df[, 'mask.500km'] # replace mask with mask.500km
head(mask.buffer.df)

# Flatten to a plain data.frame: cell coordinates followed by the attributes
present.species.df <- cbind(present.species.df@coords, present.species.df@data)


head(present.species.df); dim(present.species.df)
str(present.species.df) # plain data.frame from here on
# Keep complete rows only; this drops every cell whose mask value is NA.
# The object no longer has slots at this point, so na.omit() is applied to the
# data.frame itself (the original accessed @data here, which fails).
present.species.df <- na.omit(present.species.df)
head(present.species.df); dim(present.species.df)


### 5. Select uncorrelated variables using VIF ####
### Variance Inflation Factor within buffered area
# A VIF for a single explanatory variable is obtained using the r-squared value of the regression of that variable against all other explanatory variables (http://www.r-bloggers.com/collinearity-and-stepwise-vif-selection/)

# Sample up to 10k background rows for the VIF. The sample size is capped at
# the number of available rows, because sampling without replacement fails
# when more rows are requested than exist.
n.background <- nrow(present.species.df)
x <- sample.int(n.background, min(10000, n.background), replace = FALSE)

sample.df <- present.species.df[x, ]
#sample.df <- present.species.df
head(sample.df); dim(sample.df)
plot(countries)
points(sample.df$x, sample.df$y, col = 'green')
plot(countries, add = TRUE)
head(sample.df)
sample.matrix <- as.matrix(sample.df)
head(sample.matrix); dim(sample.matrix)

### VIF ###

# To use all variables: take every climate column. They are selected by the
# "fe_buffer_" prefix given to the cropped layers; selecting columns 1:19 by
# position also picks up the x / y coordinate columns and misses the last two
# climate layers.
keep.dat <- grep("^fe_buffer_", colnames(present.species.df), value = TRUE)
if (length(keep.dat) == 0L) {
  stop("No 'fe_buffer_' climate columns found in present.species.df", call. = FALSE)
}

keep.dat <- c(keep.dat, 'mask')
str(keep.dat)
sample.matrix.keep <- sample.matrix[, colnames(sample.matrix) %in% keep.dat, drop = FALSE]
head(sample.matrix.keep); dim(sample.matrix.keep)
summary(sample.matrix.keep)

sample.df.keep <- data.frame(sample.matrix.keep)
dim(sample.df.keep)
names(sample.df.keep)

# Species dataframe for keep.dat
head(fe.gbif); dim(fe.gbif); str(fe.gbif)
fe.gbif.df <- fe.gbif[, colnames(fe.gbif) %in% keep.dat, drop = FALSE] # retrieve species records
head(fe.gbif.df); dim(fe.gbif.df)
fe.gbif.df$mask <- 1 # Add mask column
names(fe.gbif.df)


### 6. MAXENT bioclim ####

### CHECK FOLDER NAMES !!!
### Logistic

library(biomod2)
library(ggplot2)
library(gridExtra)
library(raster)
library(rasterVis)


# Presence-only response: every occurrence record is coded as 1
fe.gbif.bmod2["species"] <- 1
Oeneis_jutta_occ <- fe.gbif.bmod2
summary(Oeneis_jutta_occ)

## format the data ----
# The response is passed as a plain numeric vector and the coordinates as a
# base data.frame, so the call does not depend on how a one-column table is
# coerced inside biomod2.
Oeneis_jutta_data <-
  BIOMOD_FormatingData(
    resp.var = Oeneis_jutta_occ[["species"]],
    resp.xy = as.data.frame(Oeneis_jutta_occ[, c('lon', 'lat')]),
    expl.var = present.stack,
    resp.name = "Oeneis.jutta",
    PA.nb.rep = 2,
    PA.nb.absences = 500,
    PA.strategy = 'random'
  )

## formatted object summary
Oeneis_jutta_data
dim(Oeneis_jutta_data)
str(Oeneis_jutta_data)
## plot of selected pseudo-absences
plot(Oeneis_jutta_data)

## define individual models options ----
Oeneis_jutta_opt <-
  BIOMOD_ModelingOptions(
    GLM = list(type = 'quadratic', interaction.level = 1),
    GBM = list(n.trees = 1000),
    GAM = list(algo = 'GAM_mgcv')
  )

## run the individual models ----
# Two fixes compared with the original call:
#  * the options object created above (Oeneis_jutta_opt) is passed; the
#    original referenced `BiomodOptions`, which is never defined in this script.
#  * the formatted data is passed positionally. Naming it `data =` gave
#    "argument 1 matches multiple formal arguments", which is what happens when
#    `data` partially matches both data.split.perc and data.split.table.
# NOTE(review): the remaining argument names assume the biomod2 4.0 / 4.1
# interface (bm.options, nb.rep, data.split.perc, var.import); they replace
# models.options, NbRunEval, DataSplit and VarImport. Later releases move to
# CV.* / OPT.* arguments -- confirm against the installed biomod2 version.
Oeneis_jutta_models <-
  BIOMOD_Modeling(
    Oeneis_jutta_data,
    modeling.id = "demo1",
    models = c("GLM", "GBM", "RF", "GAM"),
    bm.options = Oeneis_jutta_opt,
    nb.rep = 2,
    data.split.perc = 80,
    var.import = 3
  )


### Null-model ###
## Run null-model from source

### 7. SDM on ISRIC Soil within climate niche ####

### 8. Get ISRIC soil data for fe gbif collections within bioclim model prediction ####
##??


### 9. Restrict ISRIC background to 500km buffer original dataset ####

### 10. MAXENT ISRIC ####

### 11. Project to future bioclim ####

### 12. Project to future ISRIC ####