From 7dd61144b88f4a1a9ec418ff56d53bbc89be339c Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 15 Aug 2023 08:51:07 +0000 Subject: [PATCH 01/83] Renaming file for better accuracy Adding WIP markdowns --- R/bias-correction-methods/ThreeCitiesQmap.RMD | 76 +++++ R/bias-correction-methods/WIP_EQM.RMD | 274 ++++++++++++++++++ ...rpd_df_fn.R => apply_qmap_to_crpd_df_fn.R} | 0 3 files changed, 350 insertions(+) create mode 100644 R/bias-correction-methods/ThreeCitiesQmap.RMD create mode 100644 R/bias-correction-methods/WIP_EQM.RMD rename R/bias-correction-methods/{apply_bias_correction_to_crpd_df_fn.R => apply_qmap_to_crpd_df_fn.R} (100%) diff --git a/R/bias-correction-methods/ThreeCitiesQmap.RMD b/R/bias-correction-methods/ThreeCitiesQmap.RMD new file mode 100644 index 00000000..f257fd7c --- /dev/null +++ b/R/bias-correction-methods/ThreeCitiesQmap.RMD @@ -0,0 +1,76 @@ +--- +title: "Quantile Mapping across three cities" +author: "Ruth C E Bowyer" +date: "`r format(Sys.Date())`" +output: + github_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +## 0. About + +Testing `Qmap` for the 3 cities + +```{r libraries dd} +rm(list=ls()) + +library(qmap) +library(terra) + +dd <- "/mnt/vmfileshare/ClimateData/" + +``` + +## 1. Convert data to dataframes + +Qmap uses df as inpt + +```{r} + +cities <- c("London", "Manchester", "Glasgow") + +cities.cpm.dfs <- lapply(cities, function(x){ + + fp <- paste0(dd, "/Interim/CPM/three.cities/",x,"/grouped/") + files <- list.files(fp) + files.paths <- paste0(fp, files) + + # Load and to df + dfL <- lapply(files.paths, function(i){ + r <- rast(i) + rdf <- as.data.frame(r, xy=T) + return(rdf) + }) + }) + +#HERE +names(cities.cpm.dfs) + + +cities.Hads.dfs <- lapply(cities, function(x){ + + fp <- paste0(dd, "/Interim/HadsUK/three.cities/",x,"/grouped/") + files <- list.files(fp) + files.paths <- paste0(fp, files) + + # Load and to df + dfL <- lapply(files.paths, function(i){ + r <- rast(i) + rdf <- as.data.frame(r, xy=T) + return(rdf) + }) + }) + + +``` + +## 2. Apply bias correction by variable/run + +```{r} + +``` + diff --git a/R/bias-correction-methods/WIP_EQM.RMD b/R/bias-correction-methods/WIP_EQM.RMD new file mode 100644 index 00000000..8fdb2241 --- /dev/null +++ b/R/bias-correction-methods/WIP_EQM.RMD @@ -0,0 +1,274 @@ +--- +title: "Linear.Scaling" +author: "Ruth C E Bowyer" +date: "`r format(Sys.Date())`" +output: + github_document + +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +## **0. About** + +Testing Bias Correction methods from the Qmap package in R + +Loading data as created in 'Data_Processing_todf.R' + +```{r libraries dd} +rm(list=ls()) + +library(MBC) +library(terra) +library(sf) +library(exactextractr) +library(reshape2) #melt +library(data.table) #for fread +library(tidyverse) + +#Loaded package versions +x <- c("MBC", "terra", "sf", "exactextractr") +lapply(x,packageVersion) + +#Path is "//vmfileshare/ClimateData +#dd <- "/Volumes/vmfileshare/ClimateData/" +dd <- "/mnt/vmfileshare/ClimateData/" +``` + + +## **1. 
Load data** + +As this method is univariate - applying seperately to each variable - starting with tasmax +Starting with smallest region - London - for testing + +```{r hads obs data} + +#HADs grid observational data +fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") +files <- list.files(fp) + +#Subset to London (UKI) +Ldn.obs <- files[grepl("UKI", files)] + +#start with tasmax +Ldn.obs.tasmax <- Ldn.obs[grepl("tasmax", Ldn.obs)] + +obs.df <- fread(paste0(fp, Ldn.obs.tasmax)) +obs.df <- as.data.frame(obs.df) + +row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) +obs.df$x <- NULL +obs.df$y <- NULL + +``` + +```{r cpm - calibration} + +#Using 1980 - 2010 as calibration period +fp <- paste0(dd, "Interim/CPM/Data_as_df/") +files <- list.files(fp) + +#Calibration years 1980 - 2010 +cpm.cal <- files[grepl("1980|2000", files)] + +#Subset to London (UKI) +Ldn.cpm.cal <- cpm.cal[grepl("UKI", cpm.cal)] + +#start with tasmax +Ldn.cpm.cal.tasmax <- Ldn.cpm.cal[grepl("tasmax", Ldn.cpm.cal)] + +#Load in all the runs +cal.dfs1 <- lapply(Ldn.cpm.cal.tasmax, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) +}) + +names(cal.dfs1) <- Ldn.cpm.cal.tasmax + +#Sub out beyond cal period (2010 - 2020) +years <- 2000:2009 +lyrs <- paste0("_day_", years, collapse = "|") + +cdfs2 <- lapply(cal.dfs1[5:8], function(x){ + x2 <- x[,grepl(lyrs, names(x))] +}) + +names(cdfs2) <- names(cal.dfs1[5:8]) + +cal.dfs <- lapply(c("Run05", "Run06", "Run07", "Run08"), function(x){ + i1 <- paste0("CPM_UKI1980_1999tasmax_", x, ".2023.07.16.csv") + i2 <- paste0("CPM_UKI2000_2020tasmax_", x, ".2023.07.16.csv") + #This does assume x and y are in same order but because converted from raster should be sanity checked + df.f <- list(cal.dfs1[[i1]], cdfs2[[i2]]) %>% reduce(cbind) + row.names(df.f)<- paste0(df.f$x, "_", df.f$y) + df.f$x <- NULL + df.f$y <- NULL + return(df.f) +}) + +names(cal.dfs) <- c("Run05", "Run06", "Run07", "Run08") +``` + + +```{r} +#Seeing if can load All the proj data - 2010 - 2020 is test, so treating with proj - 2020 - 2080 is the rest of the data + +proj.df1 <- lapply(cal.dfs1[5:8], function(x){ + x2 <- x[,!grepl(lyrs, names(x))] +}) + +cpm.proj <- files[grepl("UKI2020|UKI2040|UKI2060", files)] + +#Subset to London (UKI) +Ldn.cpm.proj <- cpm.proj[grepl("UKI", cpm.proj)] + +#start with tasmax +Ldn.cpm.proj.tasmax <- Ldn.cpm.proj[grepl("tasmax", Ldn.cpm.proj)] + +#Load in all the runs +proj.df2 <- lapply(Ldn.cpm.proj.tasmax, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) +}) + +names(proj.df2) <- Ldn.cpm.proj.tasmax +``` + +```{r} + +#reduce however you did above but adding it in first series as above + +proj.dfs <- lapply(c("Run05", "Run06", "Run07", "Run08"), function(x){ + + i1 <- paste0("CPM_UKI2000_2020tasmax_", x, ".2023.07.16.csv") + i2 <- paste0("CPM_UKI2020_2040tasmax_", x, ".2023.07.17.csv") + i3 <- paste0("CPM_UKI2040_2060tasmax_", x, ".2023.07.17.csv") + #This does assume x and y are in same order but because converted from raster should be sanity checked + #Remove x and y from proj df + df2 <- proj.df2[[i2]][c(3:ncol(proj.df2[[i2]]))] + df3 <- proj.df2[[i3]][c(3:ncol(proj.df2[[i3]]))] + df.f <- list(proj.df1[[i1]], df2, df3) %>% + reduce(cbind) + row.names(df.f) <- paste0(df.f$x, "_", df.f$y) + df.f$x <- NULL + df.f$y <- NULL + return(df.f) +}) + +names(proj.dfs) <- c("Run05", "Run06", "Run07", "Run08") + +``` + +## **2. 
Wrangle the data** + +### Ensure dfs have same grid references + +```{r} +#Note there are a few squares missing in the HADs grid, I'm not sure why (could be predom water which I think looses it?) +cal.Run05 <- cal.dfs$Run05 +proj.Run05 <- proj.dfs$Run05 +missing.in.hads.cpm.cal <- cal.Run05[-which(row.names(cal.Run05)%in%row.names(obs.df)),] +missing.in.hads.cpm.proj <- proj.Run05[-which(row.names(proj.Run05)%in%row.names(obs.df)),] + +cal.Run05 <- cal.Run05[which(row.names(cal.Run05)%in%row.names(obs.df)),] +proj.Run05 <- proj.Run05[which(row.names(proj.Run05)%in%row.names(obs.df)),] +``` + +### Update obs data to 360 days + +The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + +```{r} +#Convert obs to 360 day year - has 40 more vars so remove the ones not in cal +names(obs.df)[c(28:64)] +names(obs.df)[c(388:440)] + +had365day <- obs.df[,grepl("_1980", names(obs.df))] +had365dates <- gsub("tasmax_1980|1980", "",names(had365day)) + +had360day <- obs.df[,grepl("_1981", names(obs.df))] +had360dates <- gsub("tasmax_1981|1981", "",names(had360day)) + +rem <- had365dates[-which(had365dates%in%had360dates)] #Pulls out all the Feb dates - +#Have added as issue to repo - for now going to remove: "0201-0229_29" "0401-0430_30" "0701-0731_31" "0901-0930_30" "1101-1130_30" - but needs to be checked as should just be _31s removed? + +remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") +remove <- paste0(remove, collapse = "|") + +removed.dates <- obs.df[,grepl(remove, names(obs.df))] +obs.df2 <- obs.df[,!grepl(remove, names(obs.df))] + +``` + +### Transpose the data sets + +Obs grid should be cols, observations (time) should be rows for linear scaling + +```{r} +t.cal.Run05 <- t(cal.Run05) +t.proj.Run05 <- t(proj.Run05) +t.obs.df2 <- t(obs.df2) + +``` + + +## **3. Empirical Quantile Mapping** + +(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and +modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform +quantile mapping + +```{r} +library('qmap') + +qm1.fit <- fitQmapQUANT(t.obs.df2, t.cal.Run05, + wet.day = FALSE, + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + +qm1.hist.a <- doQmapQUANT(t.cal.Run05, qm1.fit, type="linear") +qm1.hist.b <- doQmapQUANT(t.cal.Run05, qm1.fit, type="tricub") + +qm1.proj.a <- doQmapQUANT(t.proj.Run05, qm1.fit, type="linear") +qm1.proj.b <- doQmapQUANT(t.proj.Run05, qm1.fit, type="tricub") +``` + + + +## **4. 
Save the data** + +```{r} +# Save data as .csv +results.L <- list(t.obs.df2, t.cal.Run05, t.proj.Run05, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + +names(results.L) <- c("t.obs.df2", "t.cal.Run05", "t.proj.Run05", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + +saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/results.L.london.250724.RDS")) +``` + +```{r} + +library(raster) +source("~/Desktop/clim-recal/clim-recal/R/misc/HAD.CRS.for.sourcing.R") + +# Save data as raster + +test <- t(results.L$qm1.hist.a) +x <- gsub("_.*", "", row.names(test)) +y <- gsub(".*_", "", row.names(test)) +xydf <- data.frame(x=x, y=y) +t2 <- cbind(xydf, test) + + dfr <- rasterFromXYZ(t2, crs=HAD.CRS) #Convert first two columns as lon-lat and third as value + fn <- paste0(dd, "Debiased/R/QuantileMapping/", x, ".tif") + dfr2 <- rast(dfr) #Converting from a raster brick to a terra::rast means when writing layer names are preserved + terra::writeRaster(dfr2, fn, overwrite=T) + +``` + + + diff --git a/R/bias-correction-methods/apply_bias_correction_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmap_to_crpd_df_fn.R similarity index 100% rename from R/bias-correction-methods/apply_bias_correction_to_crpd_df_fn.R rename to R/bias-correction-methods/apply_qmap_to_crpd_df_fn.R From 6d8875baa531f6c9406df0794043e0753d28adf2 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 15 Aug 2023 09:14:08 +0000 Subject: [PATCH 02/83] Renaming file for clarity --- ...f_fn.R => apply_qmapQuant_to_crpd_df_fn.R} | 337 ++++++++++++++++++ 1 file changed, 337 insertions(+) rename R/bias-correction-methods/{apply_qmap_to_crpd_df_fn.R => apply_qmapQuant_to_crpd_df_fn.R} (50%) diff --git a/R/bias-correction-methods/apply_qmap_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R similarity index 50% rename from R/bias-correction-methods/apply_qmap_to_crpd_df_fn.R rename to R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index d4b4d54f..7960db30 100644 --- a/R/bias-correction-methods/apply_qmap_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -342,3 +342,340 @@ for(r in Runs){ } } } + + + +apply_bias_correction_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset + var, #Meterological variables + Runs){ + + i <- region + +for(r in Runs){ + for(v in var){ + if(v!="pr"){ + dd <- "/mnt/vmfileshare/ClimateData/" + + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + obs.df <- fread(paste0(fp, obs.var)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.dfs1 <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + }) + + names(cal.dfs1) <- cpm.cal.var + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + years <- 2000:2009 + lyrs <- paste0("_day_", years, collapse = "|") + 
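+      # Editor note (illustrative, not part of the original patch): with the values
+      # above, `lyrs` becomes the regex "_day_2000|_day_2001|...|_day_2009", so the
+      # grepl() on column names just below keeps only the columns stamped with a
+      # 2000-2009 day. This assumes the Data_as_df CSV column names contain
+      # "_day_<year>"; worth spot-checking a few values of names(cal.dfs1[[2]]) before
+      # trusting the split into calibration columns (kept here) and the 2010-2020 test
+      # columns (taken as proj.df1 further down).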
+ cal.df2 <- cal.dfs1[[2]][,grepl(lyrs, names(cal.dfs1[[2]]))] + + #Create final cal.df for this run + cal.df <- list(cal.dfs1[[1]], cal.df2) %>% reduce(cbind) + row.names(cal.df)<- paste0(cal.df$x, "_", cal.df$y) + cal.df$x <- NULL + cal.df$y <- NULL + + #Clean up + remove("cal.df2") + + #Subset out the test years (2010-2020) + proj.df1 <- cal.dfs1[[2]][,!grepl(lyrs, names(cal.dfs1[[2]]))] + + #Clean up + remove("cal.dfs1") + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) + proj.df$x <- NULL + proj.df$y <- NULL + + remove("proj.df1") + remove("proj.df2") + +## **2. Wrangle the data** + + missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + #mnt/vmfileshare/ClimateData/Debiased/R/QuantileMapping + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + +### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + +## **3. Empirical Quantile Mapping** + +#(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and +#modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform +#quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + print(p) + + library(qmap) + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = FALSE, + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + +## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + + gc(reset=TRUE) + + + } else { + +#### Precipitation - the HADs variable has is called 'rainfall' + + dd <- "/mnt/vmfileshare/ClimateData/" + + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl("rainfall",obs)] + + obs.df <- fread(paste0(fp, obs.var)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.dfs1 <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + }) + + names(cal.dfs1) <- cpm.cal.var + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + years <- 2000:2009 + lyrs <- paste0("_day_", years, collapse = "|") + + cal.df2 <- cal.dfs1[[2]][,grepl(lyrs, names(cal.dfs1[[2]]))] + + #Create final cal.df for this run + cal.df <- list(cal.dfs1[[1]], cal.df2) %>% reduce(cbind) + row.names(cal.df)<- paste0(cal.df$x, "_", cal.df$y) + cal.df$x <- NULL + cal.df$y <- NULL + + #Clean up + remove("cal.df2") + + #Subset out the test years (2010-2020) + proj.df1 <- cal.dfs1[[2]][,!grepl(lyrs, names(cal.dfs1[[2]]))] + + #Clean up + remove("cal.dfs1") + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) + proj.df$x <- NULL + proj.df$y <- NULL + + remove("proj.df1") + remove("proj.df2") + + ## **2. 
Wrangle the data** + + missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + #mnt/vmfileshare/ClimateData/Debiased/R/QuantileMapping + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + + ## **3. Empirical Quantile Mapping** + + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and + #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform + #quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + print(p) + + + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + + ## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + + gc(reset=TRUE) + + + } + } + } From e9f7a85349d4ec42cca49621971432c4d46e4299 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 15 Aug 2023 18:23:30 +0000 Subject: [PATCH 03/83] Updating func and appling BC qmapQuant to three cities --- R/bias-correction-methods/ThreeCitiesQmap.RMD | 37 +- .../apply_qmapQuant_to_crpd_df_fn.R | 332 ++++++------------ 2 files changed, 145 insertions(+), 224 deletions(-) diff --git a/R/bias-correction-methods/ThreeCitiesQmap.RMD b/R/bias-correction-methods/ThreeCitiesQmap.RMD index f257fd7c..7ab21753 100644 --- a/R/bias-correction-methods/ThreeCitiesQmap.RMD +++ b/R/bias-correction-methods/ThreeCitiesQmap.RMD @@ -45,16 +45,22 @@ cities.cpm.dfs <- lapply(cities, function(x){ rdf <- as.data.frame(r, xy=T) return(rdf) }) + + names(dfL) <- files + return(dfL) + }) -#HERE -names(cities.cpm.dfs) - +names(cities.cpm.dfs) <- cities +``` + +```{r} cities.Hads.dfs <- lapply(cities, function(x){ - fp <- paste0(dd, "/Interim/HadsUK/three.cities/",x,"/grouped/") + fp <- paste0(dd, "Interim/HadsUK/three.cities/",x, "/") files <- list.files(fp) + files <- files[!grepl("aux.json", files)] files.paths <- paste0(fp, files) # Load and to df @@ -62,15 +68,34 @@ cities.Hads.dfs <- lapply(cities, function(x){ r <- rast(i) rdf <- as.data.frame(r, xy=T) return(rdf) - }) + }) + + names(dfL) <- files + return(dfL) }) - +names(cities.Hads.dfs) <- cities ``` ## 2. 
Apply bias correction by variable/run +The called function was written to apply the following models: + +For tasmax and tasmin: + + +For precip + ```{r} +source("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") + +var <- c("tasmax", "tasmin", "pr") +Runs <-c("05", "06", "07", "08") + +lapply(cities, function(x){ + apply_qmap_to_cropped_dfL(region=x, var=var, Runs = Runs)}) + + ``` diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index 7960db30..835f4e1d 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -27,8 +27,10 @@ for(r in Runs){ #subset file list to var obs.var <- obs[grepl(v,obs)] - - obs.df <- fread(paste0(fp, obs.var)) + + #subset to calibration years + obs.varc <- obs.var[grepl("1980_", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) @@ -342,12 +344,15 @@ for(r in Runs){ } } } +} + ########################## + #Function for applying the bias correction to a list of dfs (ie rather than reading in the csvs, as above) -apply_bias_correction_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset +apply_qmap_to_cropped_dfL <- function(region, #Region code - needs to relate to the file name in a unique way to subset var, #Meterological variables - Runs){ + Runs){ #Runs as 05 not Run05 i <- region @@ -356,115 +361,61 @@ for(r in Runs){ if(v!="pr"){ dd <- "/mnt/vmfileshare/ClimateData/" - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl(v,obs)] - - obs.df <- fread(paste0(fp, obs.var)) - obs.df <- as.data.frame(obs.df) - + obs.df <- cities.Hads.dfs[[i]] + n <- names(obs.df) + obs.df <- obs.df[[n[grepl(v, n)&grepl("1980",n)]]] #change to rainfall in this #1980 is the calibration period - pulls out of the list + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL + + ci <- min(grep("19801201",names(obs.df))) + obs.df <- obs.df[,ci:ncol(obs.df)] + + #Remove the extra dates -- can be removed in future + remove <- c(paste0("0229_", v,"_29"), paste0("0430_",v,"_30"), paste0("0731_",v,"_31"), paste0("0930_",v,"_30"), + paste0("1130_",v,"_30")) + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.dfs1 <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) - }) - - names(cal.dfs1) <- cpm.cal.var - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - years <- 2000:2009 - lyrs <- paste0("_day_", years, collapse = "|") - - cal.df2 <- cal.dfs1[[2]][,grepl(lyrs, names(cal.dfs1[[2]]))] - - #Create final cal.df for this run - cal.df <- list(cal.dfs1[[1]], cal.df2) %>% reduce(cbind) - row.names(cal.df)<- paste0(cal.df$x, "_", cal.df$y) + 
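+      # Editor note (descriptive comment, not in the original patch): the block below
+      # pulls the CPM calibration data frame for this city (i), variable (v) and run (r)
+      # out of the nested list built in ThreeCitiesQmap.RMD, i.e.
+      # cities.cpm.dfs[[city]][[file name]]. The grepl() filters assume each file name
+      # contains "_<var>_", "calibration" and "_<run>_", and that exactly one name
+      # matches; if several matched, `ii` would be a vector and the [[ii]] extraction
+      # would not return the single data frame intended.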
n <- names(cities.cpm.dfs) + cal.df <- cities.cpm.dfs[[i]] + n <- names(cal.df) + ii <- n[grepl(paste0("_",v,"_"),n)&grepl("calibration",n)&grepl(paste0("_",r,"_"), n)] + cal.df <- cal.df[[ii]] + row.names(cal.df) <- paste0(cal.df$x, "_", cal.df$y) cal.df$x <- NULL cal.df$y <- NULL - #Clean up - remove("cal.df2") - - #Subset out the test years (2010-2020) - proj.df1 <- cal.dfs1[[2]][,!grepl(lyrs, names(cal.dfs1[[2]]))] - - #Clean up - remove("cal.dfs1") - gc() - - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + #Accidently added in too many dates to the the calibration period -- and need to start the obs period from 1st Dec 1980 -- + #This should be removed for next re-running when the HADs 360 cal and regrouping is run + ci <- min(grep("19801201",names(obs.df))) + obs.df <- obs.df[,ci:ncol(obs.df)] + add.to.proj <- cal.df[,ncol(obs.df):ncol(cal.df)] + cal.df <- cal.df[,1:ncol(obs.df)] + n <- names(cities.cpm.dfs) + proj.df <- cities.cpm.dfs[[i]] + n <- names(proj.df) + ii <- n[grepl(paste0("_",v,"_"),n)&grepl("projection",n)&grepl(paste0("_",r,"_"), n)] + proj.df <- proj.df[[ii]] row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) proj.df$x <- NULL proj.df$y <- NULL - - remove("proj.df1") - remove("proj.df2") + proj.df <- cbind(add.to.proj, proj.df) + + #clean up + remove(add.to.proj) ## **2. Wrangle the data** - - missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - #mnt/vmfileshare/ClimateData/Debiased/R/QuantileMapping cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") - print(p) - write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - + #This all needs to be updated as currently the dates are not aligning very well at all + ### Transpose the data sets #Obs grid should be cols, observations (time) should be rows for linear scaling @@ -504,7 +455,7 @@ for(r in Runs){ names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_") print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/three.cities/resultsL", r,"_",i,"_",v, ".RDS")) p <- paste0("checkpoint5", v, "_", i, "_", r, "_") print(p) @@ -518,125 +469,69 @@ for(r in Runs){ #### Precipitation - the HADs variable has is called 
'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" - - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl("rainfall",obs)] - - obs.df <- fread(paste0(fp, obs.var)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.dfs1 <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) - }) - - names(cal.dfs1) <- cpm.cal.var - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - years <- 2000:2009 - lyrs <- paste0("_day_", years, collapse = "|") - - cal.df2 <- cal.dfs1[[2]][,grepl(lyrs, names(cal.dfs1[[2]]))] - - #Create final cal.df for this run - cal.df <- list(cal.dfs1[[1]], cal.df2) %>% reduce(cbind) - row.names(cal.df)<- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL - - #Clean up - remove("cal.df2") - - #Subset out the test years (2010-2020) - proj.df1 <- cal.dfs1[[2]][,!grepl(lyrs, names(cal.dfs1[[2]]))] - - #Clean up - remove("cal.dfs1") - gc() - - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL - - remove("proj.df1") - remove("proj.df2") - - ## **2. 
Wrangle the data** - - missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - #mnt/vmfileshare/ClimateData/Debiased/R/QuantileMapping - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") - print(p) - write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - + + obs.df <- cities.Hads.dfs[[i]] + n <- names(obs.df) + obs.df <- obs.df[[n[grepl("rainfall", n)&grepl("1980",n)]]] #change to rainfall in this #1980 is the calibration period - pulls out of the list + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + ci <- min(grep("19801201",names(obs.df))) + obs.df <- obs.df[,ci:ncol(obs.df)] + + #Remove the extra dates -- can be removed in future + remove <- c("0229_rainfall_29", "0430_rainfall_30", "0731_rainfall_31", "0930_rainfall_30", "1130_rainfall_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + + + n <- names(cities.cpm.dfs) + cal.df <- cities.cpm.dfs[[i]] + n <- names(cal.df) + ii <- n[grepl(paste0("_",v,"_"),n)&grepl("calibration",n)&grepl(paste0("_",r,"_"), n)] + cal.df <- cal.df[[ii]] + row.names(cal.df) <- paste0(cal.df$x, "_", cal.df$y) + cal.df$x <- NULL + cal.df$y <- NULL + + #Accidently added in too many dates to the the calibration period -- and need to start the obs period from 1st Dec 1980 -- + #This should be removed for next re-running when the HADs 360 cal and regrouping is run + ci <- min(grep("19801201",names(obs.df))) + obs.df <- obs.df[,ci:ncol(obs.df)] + add.to.proj <- cal.df[,ncol(obs.df):ncol(cal.df)] + cal.df <- cal.df[,1:ncol(obs.df)] + + n <- names(cities.cpm.dfs) + proj.df <- cities.cpm.dfs[[i]] + n <- names(proj.df) + ii <- n[grepl(paste0("_",v,"_"),n)&grepl("projection",n)&grepl(paste0("_",r,"_"), n)] + proj.df <- proj.df[[ii]] + row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) + proj.df$x <- NULL + proj.df$y <- NULL + proj.df <- cbind(add.to.proj, proj.df) + + #clean up + remove(add.to.proj) + + ## **2. Wrangle the data** + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #This all needs to be updated as currently the dates are not aligning very well at all + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + ## **3. 
Empirical Quantile Mapping** #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and @@ -667,7 +562,7 @@ for(r in Runs){ names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_") print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/three.cities/resultsL", r,"_",i,"_",v, ".RDS")) p <- paste0("checkpoint5", v, "_", i, "_", r, "_") print(p) @@ -678,4 +573,5 @@ for(r in Runs){ } } - } +} +} From 687eb12f6a5188e1b26dfc4a75f12fbddd4aa724 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 24 Aug 2023 11:23:36 +0000 Subject: [PATCH 04/83] Moving some files to new folder --- R/LCAT/Processing.data.for.LCAT.R | 14 ++++++++++++++ R/LCAT/Region.Refs.csv | 13 +++++++++++++ Region.Refs.csv | 13 +++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 R/LCAT/Processing.data.for.LCAT.R create mode 100644 R/LCAT/Region.Refs.csv create mode 100644 Region.Refs.csv diff --git a/R/LCAT/Processing.data.for.LCAT.R b/R/LCAT/Processing.data.for.LCAT.R new file mode 100644 index 00000000..8a811468 --- /dev/null +++ b/R/LCAT/Processing.data.for.LCAT.R @@ -0,0 +1,14 @@ + +source("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") + +library(terra) +library(tidyverse) +library(data.table) +library(qmap) + +Region.Refs.csv + +apply_bias_correction_to_cropped_df(region="UKM", + var=c("tasmin", "tasmax", "pr"), + Runs=c("Run07", "Run08"), + crop_further = TRUE) diff --git a/R/LCAT/Region.Refs.csv b/R/LCAT/Region.Refs.csv new file mode 100644 index 00000000..024b9405 --- /dev/null +++ b/R/LCAT/Region.Refs.csv @@ -0,0 +1,13 @@ +"RegionName","Regioncd" +"North East (England)","UKC" +"North West (England)","UKD" +"Yorkshire and The Humber","UKE" +"East Midlands (England)","UKF" +"West Midlands (England)","UKG" +"East of England","UKH" +"London","UKI" +"South East (England)","UKJ" +"South West (England)","UKK" +"Wales","UKL" +"Scotland","UKM" +"Northern Ireland","UKN" diff --git a/Region.Refs.csv b/Region.Refs.csv new file mode 100644 index 00000000..dfc5be12 --- /dev/null +++ b/Region.Refs.csv @@ -0,0 +1,13 @@ +"","n","RegionName","Regioncd" +"1",1,"North East (England)","UKC" +"2",2,"North West (England)","UKD" +"3",3,"Yorkshire and The Humber","UKE" +"4",4,"East Midlands (England)","UKF" +"5",5,"West Midlands (England)","UKG" +"6",6,"East of England","UKH" +"7",7,"London","UKI" +"8",8,"South East (England)","UKJ" +"9",9,"South West (England)","UKK" +"10",10,"Wales","UKL" +"11",11,"Scotland","UKM" +"12",12,"Northern Ireland","UKN" From 05ad4c99f5626367259d29ab1eaba006fef8ab2a Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 24 Aug 2023 14:53:14 +0000 Subject: [PATCH 05/83] Script updates for grouping etc --- R/LCAT/Processing.data.for.LCAT.R | 12 +- .../apply_qmapQuant_to_crpd_df_fn.R | 297 +++++++++--------- Region.Refs.csv | 13 - 3 files changed, 153 insertions(+), 169 deletions(-) delete mode 100644 Region.Refs.csv diff --git a/R/LCAT/Processing.data.for.LCAT.R b/R/LCAT/Processing.data.for.LCAT.R index 8a811468..40298bb6 100644 --- a/R/LCAT/Processing.data.for.LCAT.R +++ b/R/LCAT/Processing.data.for.LCAT.R @@ -6,9 +6,13 @@ library(tidyverse) library(data.table) library(qmap) -Region.Refs.csv +Region.Refs <- read.csv("R/LCAT/Region.Refs.csv") +Regioncds <- 
Region.Refs$Regioncd -apply_bias_correction_to_cropped_df(region="UKM", +#Scotland (UKM) needs to be broked down, so running on everyone else +Regioncds.2 <- Regioncds[c(1:10, 12)] + +lapply(Regioncds.2, function(i){ + apply_bias_correction_to_cropped_df(region=i, var=c("tasmin", "tasmax", "pr"), - Runs=c("Run07", "Run08"), - crop_further = TRUE) + Runs=c("Run05", "Run06", "Run07", "Run08"))}) diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index 835f4e1d..586f49ae 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -28,8 +28,8 @@ for(r in Runs){ #subset file list to var obs.var <- obs[grepl(v,obs)] - #subset to calibration years - obs.varc <- obs.var[grepl("1980_", obs.var)] + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) @@ -37,6 +37,12 @@ for(r in Runs){ obs.df$x <- NULL obs.df$y <- NULL + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -51,34 +57,28 @@ for(r in Runs){ cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] #Load in - cal.dfs1 <- lapply(cpm.cal.var, function(x){ + cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) }) - - names(cal.dfs1) <- cpm.cal.var - + + cal.df <- cal.df %>% reduce(cbind) #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - years <- 2000:2009 - lyrs <- paste0("_day_", years, collapse = "|") - - cal.df2 <- cal.dfs1[[2]][,grepl(lyrs, names(cal.dfs1[[2]]))] - - #Create final cal.df for this run - cal.df <- list(cal.dfs1[[1]], cal.df2) %>% reduce(cbind) - row.names(cal.df)<- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] - #Clean up - remove("cal.df2") - - #Subset out the test years (2010-2020) - proj.df1 <- cal.dfs1[[2]][,!grepl(lyrs, names(cal.dfs1[[2]]))] - - #Clean up - remove("cal.dfs1") gc() yi <- paste0(i,c(2020,2040,2060), collapse="|") @@ -97,20 +97,15 @@ for(r in Runs){ names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL remove("proj.df1") remove("proj.df2") ## **2. 
Wrangle the data** - missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - #mnt/vmfileshare/ClimateData/Debiased/R/QuantileMapping + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] @@ -118,7 +113,7 @@ for(r in Runs){ #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_") print(p) - write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) ### Update obs data to 360 days @@ -129,6 +124,8 @@ for(r in Runs){ remove <- paste0(remove, collapse = "|") obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] ### Transpose the data sets @@ -181,126 +178,122 @@ for(r in Runs){ } else { #### Precipitation - the HADs variable has is called 'rainfall' - dd <- "/mnt/vmfileshare/ClimateData/" - - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl("rainfall",obs)] - - obs.df <- fread(paste0(fp, obs.var)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.dfs1 <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) - }) - - names(cal.dfs1) <- cpm.cal.var - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - years <- 2000:2009 - lyrs <- paste0("_day_", years, collapse = "|") - - cal.df2 <- cal.dfs1[[2]][,grepl(lyrs, names(cal.dfs1[[2]]))] - - #Create final cal.df for this run - cal.df <- list(cal.dfs1[[1]], cal.df2) %>% reduce(cbind) - row.names(cal.df)<- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL - - #Clean up - remove("cal.df2") - - #Subset out the test years (2010-2020) - proj.df1 <- cal.dfs1[[2]][,!grepl(lyrs, names(cal.dfs1[[2]]))] - - #Clean up - remove("cal.dfs1") - gc() - - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL - - remove("proj.df1") - remove("proj.df2") - - ## **2. 
Wrangle the data** - - missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - #mnt/vmfileshare/ClimateData/Debiased/R/QuantileMapping - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") - print(p) - write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + 
remove("proj.df1") + remove("proj.df2") + + ## **2. Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) ## **3. Empirical Quantile Mapping** diff --git a/Region.Refs.csv b/Region.Refs.csv deleted file mode 100644 index dfc5be12..00000000 --- a/Region.Refs.csv +++ /dev/null @@ -1,13 +0,0 @@ -"","n","RegionName","Regioncd" -"1",1,"North East (England)","UKC" -"2",2,"North West (England)","UKD" -"3",3,"Yorkshire and The Humber","UKE" -"4",4,"East Midlands (England)","UKF" -"5",5,"West Midlands (England)","UKG" -"6",6,"East of England","UKH" -"7",7,"London","UKI" -"8",8,"South East (England)","UKJ" -"9",9,"South West (England)","UKK" -"10",10,"Wales","UKL" -"11",11,"Scotland","UKM" -"12",12,"Northern Ireland","UKN" From 1695c31f2ee05edf169b17974e350ae8ae5d89b8 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Fri, 25 Aug 2023 10:09:43 +0000 Subject: [PATCH 06/83] Correcting precip error in Hads read in --- R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index 586f49ae..f1c7c19e 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -28,7 +28,7 @@ for(r in Runs){ #subset file list to var obs.var <- obs[grepl(v,obs)] - #subset to calibration years + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) @@ -186,7 +186,7 @@ for(r in Runs){ obs <- files[grepl(i, files)] #subset file list to var - obs.var <- obs[grepl(v,obs)] + obs.var <- obs[grepl("rainfall",obs)] #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] From ad2845bfcf4bfe5c892cfed2c1d61308c8e8f91f Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Fri, 25 Aug 2023 10:09:43 +0000 Subject: [PATCH 07/83] Correcting precip error in Hads read in --- R/LCAT/Processing.data.for.LCAT.R | 13 +- .../apply_qmapQuant_to_crpd_df_fn.R | 343 +++++++++++++++++- 2 files changed, 350 insertions(+), 6 deletions(-) diff --git a/R/LCAT/Processing.data.for.LCAT.R b/R/LCAT/Processing.data.for.LCAT.R index 40298bb6..6ce08b11 100644 --- 
a/R/LCAT/Processing.data.for.LCAT.R +++ b/R/LCAT/Processing.data.for.LCAT.R @@ -1,3 +1,4 @@ +rm(list=ls()) source("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") @@ -6,13 +7,19 @@ library(tidyverse) library(data.table) library(qmap) -Region.Refs <- read.csv("R/LCAT/Region.Refs.csv") +Region.Refs <- read.csv("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/R/LCAT/Region.Refs.csv") Regioncds <- Region.Refs$Regioncd -#Scotland (UKM) needs to be broked down, so running on everyone else -Regioncds.2 <- Regioncds[c(1:10, 12)] +#Scotland (UKM) needs to be broken down, so running on everyone else +#Regioncds.2 <- Regioncds[c(1:10, 12)] - this was killed at UKK - so running the remaining as: + +Regioncds.2 <- c("UKK", "UKL", "UKN", "UKM") lapply(Regioncds.2, function(i){ apply_bias_correction_to_cropped_df(region=i, var=c("tasmin", "tasmax", "pr"), Runs=c("Run05", "Run06", "Run07", "Run08"))}) + +## Scotland + + diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index 586f49ae..f17fa6de 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -28,7 +28,7 @@ for(r in Runs){ #subset file list to var obs.var <- obs[grepl(v,obs)] - #subset to calibration years + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) @@ -186,7 +186,7 @@ for(r in Runs){ obs <- files[grepl(i, files)] #subset file list to var - obs.var <- obs[grepl(v,obs)] + obs.var <- obs[grepl("rainfall",obs)] #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] @@ -339,7 +339,344 @@ for(r in Runs){ } } - ########################## + +###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! 
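+# Editor note (hedged): the function below appears to mirror
+# apply_bias_correction_to_cropped_df but takes an extra `Lines` argument
+# ("number of lines to read in"), presumably so that regions too large to process in
+# one go (the banner above mentions Scotland/UKM) can be handled in smaller pieces.
+# `Lines` does not appear to be used in the portion of the function shown in this
+# patch. An illustrative call (argument values are assumptions, not taken from the
+# source) might look like:
+#   cropdf_further_apply_bc_to_cropped_df(region = "UKM", var = c("tasmax", "pr"),
+#                                         Runs = c("Run05"), Lines = 1000)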
+ +cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset + var, #Meterological variables + Runs,#Runs in form 'Run08'- matched input + Lines){ #Number of lines to read in + + i <- region + + for(r in Runs){ + for(v in var){ + if(v!="pr"){ + dd <- "/mnt/vmfileshare/ClimateData/" + + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + remove("proj.df1") + remove("proj.df2") + + ## **2. 
Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + + ## **3. Empirical Quantile Mapping** + + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and + #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform + #quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + print(p) + + library(qmap) + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = FALSE, + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + + ## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + + gc(reset=TRUE) + + + } else { + + #### Precipitation - the HADs variable has is called 'rainfall' + dd <- "/mnt/vmfileshare/ClimateData/" + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl("rainfall",obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + remove("proj.df1") + remove("proj.df2") + + ## **2. 
Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + ## **3. Empirical Quantile Mapping** + + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and + #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform + #quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + print(p) + + + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + + ## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + + gc(reset=TRUE) + + + } + } + } +} + + + +########################## #Function for applying the bias correction to a list of dfs (ie rather than reading in the csvs, as above) From b77383ff21414e803c3fb806eb66fdd30958a10d Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 29 Aug 2023 19:30:13 +0000 Subject: [PATCH 08/83] Updating fnc for scotland --- .../apply_qmapQuant_to_crpd_df_fn.R | 407 +++++------------- 1 file changed, 105 insertions(+), 302 deletions(-) diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index f17fa6de..dfe5a4a9 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -344,40 +344,22 @@ for(r in Runs){ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset var, #Meterological variables - Runs,#Runs in form 'Run08'- matched input - Lines){ #Number of lines to read in + Runs, #Runs in form 'Run08'- matched input + N.new.segments,...){ #Numeric, Number of dfs to break down to, eg 4 i <- region + N.new.segments<- N.new.segments + Runs <- Runs + var <- var for(r in Runs){ for(v in var){ + for(y in 1:N.new.segments){ if(v!="pr"){ dd <- "/mnt/vmfileshare/ClimateData/" #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl(v,obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - - + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -404,6 +386,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t cal.df <- cal.df %>% reduce(cbind) + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- @@ -414,8 +397,14 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - gc() + #Subset the dataframe iteratively depending on y + nrows.seg <- nrow(cal.df)/N.new.segments + y_1 <- y-1 + nr1 <- round(nrows.seg*y_1) 
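# Note on the segment indices at this step: with nr1 <- round(nrows.seg*y_1) the segment for y starts on the
# same row that segment y-1 ends on, so one boundary row is shared between consecutive segments
# (for y = 1 the 0 index is simply dropped by R, so the first segment still starts at row 1).
# For example, if nrow(cal.df) = 1000 and N.new.segments = 4, then y = 2 gives nr1 = 250 and nr2 = 500,
# and row 250 falls in both the first and second segment; the revised version of this function in the
# following patch uses round(nrows.seg*y_1) + 1 to make the segments non-overlapping.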
+ nr2 <- round(nrows.seg*y) + cal.df <- cal.df[nr1:nr2,] + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] @@ -429,13 +418,43 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t df[c(3:ncol(df))] }) - names(proj.df2) <- cpm.proj + names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] remove("proj.df1") remove("proj.df2") + + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Subset to the rows which are in above (some will be missing) + obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + gc() + + ## **2. Wrangle the data** #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] @@ -446,7 +465,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) @@ -476,7 +495,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) library(qmap) @@ -493,19 +512,19 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y"))) gc(reset=TRUE) @@ -514,30 +533,9 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t #### Precipitation - the HADs variable has is called 'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - + #Subset to Area + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -564,6 +562,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t cal.df <- cal.df %>% reduce(cbind) + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- @@ -574,8 +573,15 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - gc() + #Subset the dataframe iteratively depending on y + nrows.seg <- nrow(cal.df)/N.new.segments + y_1 <- y-1 + nr1 <- round(nrows.seg*y_1) + nr2 <- round(nrows.seg*y) + cal.df <- cal.df[nr1:nr2,] + + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] @@ -592,10 +598,40 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] remove("proj.df1") remove("proj.df2") + + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl("rainfall",obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- 
fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Subset to the rows which are in above (some will be missing) + obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + gc() + + ## **2. Wrangle the data** #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] @@ -606,7 +642,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) @@ -630,278 +666,45 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df <- t(proj.df) obs.df <- t(obs.df) + ## **3. Empirical Quantile Mapping** #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) - qm1.fit <- fitQmapQUANT(obs.df, cal.df, wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. qstep = 0.01, nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y"))) gc(reset=TRUE) + } } } -} - - - -########################## - - #Function for applying the bias correction to a list of dfs (ie rather than reading in the csvs, as above) - -apply_qmap_to_cropped_dfL <- function(region, #Region code - needs to relate to the file name in a unique way to subset - var, #Meterological variables - Runs){ #Runs as 05 not Run05 - - i <- region - -for(r in Runs){ - for(v in var){ - if(v!="pr"){ - dd <- "/mnt/vmfileshare/ClimateData/" - - obs.df <- cities.Hads.dfs[[i]] - n <- names(obs.df) - obs.df <- obs.df[[n[grepl(v, n)&grepl("1980",n)]]] #change to rainfall in this #1980 is the calibration period - pulls out of the list - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - - #Remove the extra dates -- can be removed in future - remove <- c(paste0("0229_", v,"_29"), paste0("0430_",v,"_30"), paste0("0731_",v,"_31"), paste0("0930_",v,"_30"), - paste0("1130_",v,"_30")) - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - - - n <- names(cities.cpm.dfs) - cal.df <- cities.cpm.dfs[[i]] - n <- names(cal.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("calibration",n)&grepl(paste0("_",r,"_"), n)] - cal.df <- cal.df[[ii]] - row.names(cal.df) <- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL - - #Accidently added in too many dates to the the calibration period -- and need to start the obs period from 1st Dec 1980 -- - #This should be removed for next re-running when the HADs 360 cal and regrouping is run - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - add.to.proj <- cal.df[,ncol(obs.df):ncol(cal.df)] - cal.df <- cal.df[,1:ncol(obs.df)] - - n <- names(cities.cpm.dfs) - proj.df <- cities.cpm.dfs[[i]] - n <- names(proj.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("projection",n)&grepl(paste0("_",r,"_"), n)] - proj.df <- proj.df[[ii]] - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL - proj.df <- cbind(add.to.proj, proj.df) - - #clean up - remove(add.to.proj) - -## **2. 
Wrangle the data** - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #This all needs to be updated as currently the dates are not aligning very well at all - -### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - -## **3. Empirical Quantile Mapping** - -#(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and -#modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform -#quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") - print(p) - - library(qmap) - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - -## **4. Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/three.cities/resultsL", r,"_",i,"_",v, ".RDS")) + } + } - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - - gc(reset=TRUE) - - } else { - -#### Precipitation - the HADs variable has is called 'rainfall' - - dd <- "/mnt/vmfileshare/ClimateData/" - - obs.df <- cities.Hads.dfs[[i]] - n <- names(obs.df) - obs.df <- obs.df[[n[grepl("rainfall", n)&grepl("1980",n)]]] #change to rainfall in this #1980 is the calibration period - pulls out of the list - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - - #Remove the extra dates -- can be removed in future - remove <- c("0229_rainfall_29", "0430_rainfall_30", "0731_rainfall_31", "0930_rainfall_30", "1130_rainfall_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - - - n <- names(cities.cpm.dfs) - cal.df <- cities.cpm.dfs[[i]] - n <- names(cal.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("calibration",n)&grepl(paste0("_",r,"_"), n)] - cal.df <- cal.df[[ii]] - row.names(cal.df) <- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL - - #Accidently added in too many dates to the the calibration period -- and need to start the obs period from 1st Dec 1980 -- - #This should be removed for next re-running when the HADs 360 cal and regrouping is run - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - add.to.proj <- cal.df[,ncol(obs.df):ncol(cal.df)] - cal.df <- cal.df[,1:ncol(obs.df)] - - n <- names(cities.cpm.dfs) - proj.df <- cities.cpm.dfs[[i]] - n <- names(proj.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("projection",n)&grepl(paste0("_",r,"_"), n)] - proj.df <- 
proj.df[[ii]] - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL - proj.df <- cbind(add.to.proj, proj.df) - - #clean up - remove(add.to.proj) - - ## **2. Wrangle the data** - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #This all needs to be updated as currently the dates are not aligning very well at all - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - ## **3. Empirical Quantile Mapping** - - #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and - #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform - #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") - print(p) - - - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - ## **4. Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/three.cities/resultsL", r,"_",i,"_",v, ".RDS")) - - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - - gc(reset=TRUE) - - - } - } -} -} From 0c828f70aa5a15274679316655bc4b8e042f153d Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 29 Aug 2023 19:30:13 +0000 Subject: [PATCH 09/83] Updating fnc for scotland --- .../apply_qmapQuant_to_crpd_df_fn.R | 407 +++++------------- 1 file changed, 105 insertions(+), 302 deletions(-) diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index f17fa6de..283f7767 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -344,40 +344,22 @@ for(r in Runs){ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset var, #Meterological variables - Runs,#Runs in form 'Run08'- matched input - Lines){ #Number of lines to read in + Runs, #Runs in form 'Run08'- matched input + N.new.segments,...){ #Numeric, Number of dfs to break down to, eg 4 i <- region + N.new.segments<- N.new.segments + Runs <- Runs + var <- var for(r in Runs){ for(v in var){ + for(y in 1:N.new.segments){ if(v!="pr"){ dd <- 
"/mnt/vmfileshare/ClimateData/" #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl(v,obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - - + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -404,6 +386,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t cal.df <- cal.df %>% reduce(cbind) + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- @@ -414,8 +397,14 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - gc() + #Subset the dataframe iteratively depending on y + nrows.seg <- nrow(cal.df)/N.new.segments + y_1 <- y-1 + nr1 <- round(nrows.seg*y_1) + 1 + nr2 <- round(nrows.seg*y) + cal.df <- cal.df[nr1:nr2,] + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] @@ -429,13 +418,43 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t df[c(3:ncol(df))] }) - names(proj.df2) <- cpm.proj + names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] remove("proj.df1") remove("proj.df2") + + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Subset to the rows which are in above (some will be missing) + obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + gc() + + ## **2. 
Wrangle the data** #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] @@ -446,7 +465,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) @@ -476,7 +495,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) library(qmap) @@ -493,19 +512,19 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") ## **4. Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y"))) gc(reset=TRUE) @@ -514,30 +533,9 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t #### Precipitation - the HADs variable has is called 'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - + #Subset to Area + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -564,6 +562,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t cal.df <- cal.df %>% reduce(cbind) + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here #Keep all of the files with these years - because the naming convention runs #from Nov to 
following year we need to just take the first 30 days of the one starting with 20091201- @@ -574,8 +573,15 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - gc() + #Subset the dataframe iteratively depending on y + nrows.seg <- nrow(cal.df)/N.new.segments + y_1 <- y-1 + nr1 <- round(nrows.seg*y_1) + 1 + nr2 <- round(nrows.seg*y) + cal.df <- cal.df[nr1:nr2,] + + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] @@ -592,10 +598,40 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] remove("proj.df1") remove("proj.df2") + + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl("rainfall",obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Subset to the rows which are in above (some will be missing) + obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + gc() + + ## **2. Wrangle the data** #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] @@ -606,7 +642,7 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) @@ -630,278 +666,45 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t proj.df <- t(proj.df) obs.df <- t(obs.df) + ## **3. Empirical Quantile Mapping** #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) - qm1.fit <- fitQmapQUANT(obs.df, cal.df, wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. qstep = 0.01, nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y"))) gc(reset=TRUE) + } } } -} - - - -########################## - - #Function for applying the bias correction to a list of dfs (ie rather than reading in the csvs, as above) - -apply_qmap_to_cropped_dfL <- function(region, #Region code - needs to relate to the file name in a unique way to subset - var, #Meterological variables - Runs){ #Runs as 05 not Run05 - - i <- region - -for(r in Runs){ - for(v in var){ - if(v!="pr"){ - dd <- "/mnt/vmfileshare/ClimateData/" - - obs.df <- cities.Hads.dfs[[i]] - n <- names(obs.df) - obs.df <- obs.df[[n[grepl(v, n)&grepl("1980",n)]]] #change to rainfall in this #1980 is the calibration period - pulls out of the list - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - - #Remove the extra dates -- can be removed in future - remove <- c(paste0("0229_", v,"_29"), paste0("0430_",v,"_30"), paste0("0731_",v,"_31"), paste0("0930_",v,"_30"), - paste0("1130_",v,"_30")) - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - - - n <- names(cities.cpm.dfs) - cal.df <- cities.cpm.dfs[[i]] - n <- names(cal.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("calibration",n)&grepl(paste0("_",r,"_"), n)] - cal.df <- cal.df[[ii]] - row.names(cal.df) <- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL - - #Accidently added in too many dates to the the calibration period -- and need to start the obs period from 1st Dec 1980 -- - #This should be removed for next re-running when the HADs 360 cal and regrouping is run - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - add.to.proj <- cal.df[,ncol(obs.df):ncol(cal.df)] - cal.df <- cal.df[,1:ncol(obs.df)] - - n <- names(cities.cpm.dfs) - proj.df <- cities.cpm.dfs[[i]] - n <- names(proj.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("projection",n)&grepl(paste0("_",r,"_"), n)] - proj.df <- proj.df[[ii]] - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL - proj.df <- cbind(add.to.proj, proj.df) - - #clean up - remove(add.to.proj) - -## **2. 
Wrangle the data** - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #This all needs to be updated as currently the dates are not aligning very well at all - -### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - -## **3. Empirical Quantile Mapping** - -#(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and -#modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform -#quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") - print(p) - - library(qmap) - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - -## **4. Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/three.cities/resultsL", r,"_",i,"_",v, ".RDS")) + } + } - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - - gc(reset=TRUE) - - } else { - -#### Precipitation - the HADs variable has is called 'rainfall' - - dd <- "/mnt/vmfileshare/ClimateData/" - - obs.df <- cities.Hads.dfs[[i]] - n <- names(obs.df) - obs.df <- obs.df[[n[grepl("rainfall", n)&grepl("1980",n)]]] #change to rainfall in this #1980 is the calibration period - pulls out of the list - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - - #Remove the extra dates -- can be removed in future - remove <- c("0229_rainfall_29", "0430_rainfall_30", "0731_rainfall_31", "0930_rainfall_30", "1130_rainfall_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - - - n <- names(cities.cpm.dfs) - cal.df <- cities.cpm.dfs[[i]] - n <- names(cal.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("calibration",n)&grepl(paste0("_",r,"_"), n)] - cal.df <- cal.df[[ii]] - row.names(cal.df) <- paste0(cal.df$x, "_", cal.df$y) - cal.df$x <- NULL - cal.df$y <- NULL - - #Accidently added in too many dates to the the calibration period -- and need to start the obs period from 1st Dec 1980 -- - #This should be removed for next re-running when the HADs 360 cal and regrouping is run - ci <- min(grep("19801201",names(obs.df))) - obs.df <- obs.df[,ci:ncol(obs.df)] - add.to.proj <- cal.df[,ncol(obs.df):ncol(cal.df)] - cal.df <- cal.df[,1:ncol(obs.df)] - - n <- names(cities.cpm.dfs) - proj.df <- cities.cpm.dfs[[i]] - n <- names(proj.df) - ii <- n[grepl(paste0("_",v,"_"),n)&grepl("projection",n)&grepl(paste0("_",r,"_"), n)] - proj.df <- 
proj.df[[ii]] - row.names(proj.df) <- paste0(proj.df$x, "_", proj.df$y) - proj.df$x <- NULL - proj.df$y <- NULL - proj.df <- cbind(add.to.proj, proj.df) - - #clean up - remove(add.to.proj) - - ## **2. Wrangle the data** - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #This all needs to be updated as currently the dates are not aligning very well at all - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - ## **3. Empirical Quantile Mapping** - - #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and - #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform - #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") - print(p) - - - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - ## **4. Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/three.cities/resultsL", r,"_",i,"_",v, ".RDS")) - - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - - gc(reset=TRUE) - - - } - } -} -} From 470fb08bd62b471afddb5a55cf2d6e4a2240cbe6 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 5 Sep 2023 12:29:03 +0000 Subject: [PATCH 10/83] Including fnc for Scotland --- R/LCAT/Processing.data.for.LCAT.R | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/R/LCAT/Processing.data.for.LCAT.R b/R/LCAT/Processing.data.for.LCAT.R index 6ce08b11..f57384de 100644 --- a/R/LCAT/Processing.data.for.LCAT.R +++ b/R/LCAT/Processing.data.for.LCAT.R @@ -1,25 +1,28 @@ rm(list=ls()) -source("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") +#setwd("~/Desktop/clim-recal/clim-recal/") +setwd("/home/dyme/Desktop/clim-recal/clim-recal") +source("R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") library(terra) library(tidyverse) library(data.table) library(qmap) -Region.Refs <- read.csv("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/R/LCAT/Region.Refs.csv") +Region.Refs <- read.csv("R/bias-correction-methods/R/LCAT/Region.Refs.csv") Regioncds <- Region.Refs$Regioncd #Scotland (UKM) needs to be broken down, so 
running on everyone else -#Regioncds.2 <- Regioncds[c(1:10, 12)] - this was killed at UKK - so running the remaining as: - -Regioncds.2 <- c("UKK", "UKL", "UKN", "UKM") - -lapply(Regioncds.2, function(i){ - apply_bias_correction_to_cropped_df(region=i, - var=c("tasmin", "tasmax", "pr"), - Runs=c("Run05", "Run06", "Run07", "Run08"))}) - -## Scotland - +Regioncds.2 <- Regioncds[c(1:10, 12)] + + apply_bias_correction_to_cropped_df(region="UKK", + var=c("tasmin", "tasmax", "pr"), + Runs=c("Run05", "Run06", "Run07", "Run08")) + +## Scotland -- further cropping so as to proccess + cropdf_further_apply_bc_to_cropped_df(region = "UKM", #Region code - needs to relate to the file name in a unique way to subset + var=c("tasmax"), + Runs=c("Run06"), + N.new.segments=4) + From dff994f61d780486fb5ada56bd525044c3141700 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 5 Sep 2023 16:20:12 +0000 Subject: [PATCH 11/83] Deriving seasonal means --- R/LCAT/Assessing.BC.data.RMD | 99 ++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 27 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index cf9c80d6..cda53546 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -46,7 +46,7 @@ The objects within this R list are as follows: - 'qm1.proj.a' - bias corrected values for the validation/projection period, values fitted with linear interpolation - 'qm1.proj.b' - bias corrected values for the validation/projection period, values fitted with tricubic interpolation -## **1. Bias Correction Assessment** +## **1. Bias Correction Assessment: trends** ### **London - tasmax = Run 08** @@ -60,6 +60,8 @@ London <- readRDS(paste0(dd,"/Debiased/R/QuantileMapping/resultsLRun08_UKI_tasma ### **1b. Check trends** +The next set of chunks visualise the data by converting back to raster, and by looking at the trends of data across all time periods + ```{r convert to df and raster} ## Load a source raster to extract the crs @@ -70,7 +72,7 @@ rast <- rast(rp) crs <- crs(rast) -## Convert from matix to df, transpose, create x and y cols +## Convert from matix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise London.df <- lapply(London, function(x){ df <- as.data.frame(t(x)) rn <- row.names(df) #The x_y coords were saves as rownames @@ -100,7 +102,7 @@ tm_shape(London.rasts$t.cal[[1]]) + tm_raster(title="Calibration, 1980-12-01") tm_shape(London.rasts$qm1.hist.a[[1]]) + tm_raster(title="Calibration, bias corrected, linear 1980-12-01") tm_shape(London.rasts$qm1.hist.b[[1]]) + tm_raster(title="Calibration, bias corrected, tricubic 1980-12-01") ``` -#### *Annual trends - Calibration period* +#### *Annual trends - Calibration period - daily mean* ```{r} @@ -123,14 +125,18 @@ London.dfg <- lapply(names(London.df), function(i){ dfx_g <- dfx %>% purrr::reduce(rbind) }) -names(London.dfg) <- names(London.df) +names(London.dfg) <- c("obs.daymeans", "raw.cal.daymeans", + "raw.proj.daymeans", "bc.a.cal.daymeans", + "bc.b.cal.daymeans", "bc.a.proj.daymeans", + "bc.b.proj.daymeans") ``` ```{r} #Add a day index to align the cal and obs -London.dfg.calp <- London.dfg[c("t.obs", "t.cal", "qm1.hist.a", "qm1.hist.b")] +London.dfg.calp <- London.dfg[c("obs.daymeans", "raw.cal.daymeans", + "bc.b.cal.daymeans", "bc.a.cal.daymeans")] London.dfg.calp <- lapply(London.dfg.calp, function(x){ x$dayi <- 1:nrow(x) @@ -162,38 +168,67 @@ ggplot(London.dfg.calp_mm, aes(dayi, value, group=variable, 
colour=variable)) + ``` -#### *Annual trends - Calibration period* - +#### *Annual trends - Calibration period - seasonal mean* -#Annotate season based on month index +Annotate season based on month index - the dates have different formats depending on the input data (ie hads vs cpm) so am pulling out the necessary to adjust sep ```{r} -proj.raw.df.g$season <- ifelse(grepl("-12-|-01-|-02-", proj.raw.df.g$dmy), "Winter", - ifelse(grepl("-03-|-04-|-05-", proj.raw.df.g$dmy), "Spring", - ifelse(grepl("-06-|-07-|-08-", proj.raw.df.g$dmy), "Summer", "Autumn"))) +#Hads/obs df +obs.daymeans.df <- London.dfg$obs.daymeans + +x <- obs.daymeans.df$day +obs.daymeans.df$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#A note here - the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run -proj.raw.df.g$year <- as.numeric(sub("-.*", "", proj.raw.df.g$dmy)) +# Mutate to a seasonal mean +obs.seasonal.mean.df <- obs.daymeans.df %>% + group_by(season_year) %>% + mutate(mean.seasonal = mean(t.obs.mean), + sd.high.seasonal = mean.seasonal + sd(t.obs.mean), + sd.low.seasonal = mean.seasonal - sd(t.obs.mean)) + +obs.seasonal.mean.df <- obs.seasonal.mean.df %>% + dplyr::select(season_year:sd.low.seasonal) %>% distinct() + +#Grouping variable for later vars +obs.seasonal.mean.df$model <- "obs" +``` + +```{r} +#lapply needs to needed -#Create a season_year var than considers the same Winter season across 2 years -## i.e. - Jan 2021 is considered as Winter 2020 -proj.raw.df.g$season_year <- ifelse(proj.raw.df.g$season != "Winter"| grepl("-12-", proj.raw.df.g$dmy), - paste0(proj.raw.df.g$season, "_", proj.raw.df.g$year), paste0(proj.raw.df.g$season,"_", proj.raw.df.g$year-1)) +London.dfg[c("raw.cal.daymeans", "bc.b.cal.daymeans", "bc.a.cal.daymeans")] +#lapply for remaining dfs + x <- df$day + #The CPM days are consecutive 1 - 360 by year + winter <- paste0(30, "_", 1:90, collapse="|") + spring <- paste0(30, "_", 91:180, collapse="|") + summer <- paste0(30, "_", 181:270, collapse="|") +``` +#HERE - sorting out below season +```{r} + df$season <- ifelse(grepl(winter, x), "Winter", + ifelse(grepl(spring, x), "Spring", + ifelse(grepl(summer, x), "Summer", "Autumn"))) -#Calculate seasonal mean and SD -seasonal.mean <- proj.raw.df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) +# Mutate to a seasonal mean +obs.seasonal.mean.df <- obs.daymeans.df %>% + group_by(season_year) %>% + mutate(mean.seasonal = mean(t.obs.mean), + sd.high.seasonal = mean.seasonal + sd(t.obs.mean), + sd.low.seasonal = mean.seasonal - sd(t.obs.mean)) -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL +obs.seasonal.mean.df <- obs.seasonal.mean.df %>% + dplyr::select(season_year:sd.low.seasonal) %>% distinct() -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons +obs.seasonal.mean.df$model <- "obs" ``` @@ -270,7 +305,17 @@ ggplot(dfg_sm_s) + ``` -### Metrics +#### *Annual trends - seasonal max* + +I think visualising the daily data is not mega helpful, but now grouping to season and calculating the seasonal maxima vals (i.e. 
rather than means above) + +#### *Validation period - annual trends - seasonal mean* + +#### *Validation period - annual trends - seasonal max* + + +## **2. Bias Correction Assessment: Metrics** -Add in HADs data +Add in HADs data from the cal period +### mean by cell From c15beabc9e28a1314f283e57647da4dd86d176da Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 7 Sep 2023 11:25:13 +0000 Subject: [PATCH 12/83] Loading obs data --- R/LCAT/Assessing.BC.data.RMD | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index cda53546..9cbeaeee 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -58,6 +58,26 @@ London <- readRDS(paste0(dd,"/Debiased/R/QuantileMapping/resultsLRun08_UKI_tasma ``` +Load in Hads validation data +(So this can be run for all of the LCAT data, I'm going to read in the whole HADs files for the calibration years) + +**The calibration period is 2009-12-01 to 2019-11-30 to relate to the CPM month grouping** + +```{r} + +fp <- "/mnt/vmfileshare/ClimateData/Processed/HadsUKgrid/resampled_2.2km/tasmax/day/" +hads.tasmax <- list.files(fp) +i <- grep("20091201-", hads.tasmax) +ii <- grep("20191130", hads.tasmax) +hads.tasmax.i <- hads.tasmax[i:ii] + +source("/home/dyme/Desktop/clim-recal/clim-recal/R/misc/read_crop.fn.R") + + + +``` + + ### **1b. Check trends** The next set of chunks visualise the data by converting back to raster, and by looking at the trends of data across all time periods @@ -73,7 +93,8 @@ rast <- rast(rp) crs <- crs(rast) ## Convert from matix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise -London.df <- lapply(London, function(x){ +London.df <- lapply(London, + function(x){ df <- as.data.frame(t(x)) rn <- row.names(df) #The x_y coords were saves as rownames x <- gsub("_.*", "", rn) From 89df88eacd571fcfb423b74a2ef5b11aec35f93f Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 7 Sep 2023 11:27:12 +0000 Subject: [PATCH 13/83] Simplifying code - this might be a conflict later though --- R/misc/Cropping_Rasters_to_three_cities.R | 144 +++------------------- 1 file changed, 18 insertions(+), 126 deletions(-) diff --git a/R/misc/Cropping_Rasters_to_three_cities.R b/R/misc/Cropping_Rasters_to_three_cities.R index 7e0812b5..7cf0188e 100644 --- a/R/misc/Cropping_Rasters_to_three_cities.R +++ b/R/misc/Cropping_Rasters_to_three_cities.R @@ -1,12 +1,14 @@ ## Crop CPM and HADs rm(list=ls()) - -source("~/Desktop/clim-recal/clim-recal/R/misc/read_crop.fn.R") +#setwd("~/Desktop/clim-recal/clim-recal/") +#setwd("/home/dyme/Desktop/clim-recal/clim-recal") +source("R/misc/read_crop.fn.R") library(tidyverse) library(data.table) library(qmap) +library(terra) dd <- "/mnt/vmfileshare/ClimateData/" @@ -54,151 +56,41 @@ cities <- c("London", "Glasgow", "Manchester") ext.L <- list(London.ext, Glasgow.ext, Manchester.ext) names(ext.L) <- cities -lapply(cities, function(x){ - - cpm_read_crop(runs=runs, var = var, - fp = paste0(dd, "Reprojected/UKCP2.2/"), - year1=1980, - year2=2000, - crop.area=ext.L[[x]], - cropname=x) }) - - -# Splitting up next time slice for calib and val - lapply(cities, function(x){ cpm_read_crop(runs=runs, var = var, - fp = paste0(dd, "Reprojected_infill/UKCP2.2/"), - year1=2000, - year2=2010, + fp = paste0(dd, "Reprojected_infill/UKCP2.2/"), + rd = paste0(dd, "Cropped/three.cities/CPM/"), crop.area=ext.L[[x]], cropname=x) }) -lapply(cities, 
function(x){ - - cpm_read_crop(runs=runs, var = var, - fp = paste0(dd, "Reprojected_infill/UKCP2.2/"), - year1=2010, - year2=2020, - crop.area=ext.L[[x]], - cropname=x) }) -## Next time slice 2020-2040 -lapply(cities, function(x){ - - cpm_read_crop(runs=runs, var = var, - fp = paste0(dd, "Reprojected/UKCP2.2/"), - year1=2020, - year2=2040, - crop.area=ext.L[[x]], - cropname=x) }) +#### HADS - original 360 -## Next time slice 2040-2060 -lapply(cities, function(x){ - - cpm_read_crop(runs=runs, var = var, - fp = paste0(dd, "Reprojected_infill/UKCP2.2/"), - year1=2040, - year2=2060, - crop.area=ext.L[[x]], - cropname=x) }) +var <- c("tasmax", "tasmin", "rainfall") -## Next time slice 2060-2080 lapply(cities, function(x){ - cpm_read_crop(runs=runs, var = var, - fp = paste0(dd, "Reprojected/UKCP2.2/"), - year1=2060, - year2=2080, - crop.area=ext.L[[x]], - cropname=x) }) - - + hads_read_crop(var = var, + fp= paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), + rd= paste0(dd, "Cropped/three.cities/Hads.original360/"), + file.date="19801201", #Start from the same date as the CPM + crop.area=ext.L[[x]], + cropname=x) }) -#### HADS -#Calibration years files 1 - 360 (first 30 years) +#### HADs - updated 360 calendar (to be run pending updated files) var <- c("tasmax", "tasmin", "rainfall") -lapply(cities, function(x){ - - hads_read_crop(var = var, - fp= paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), - i1 = 1, i2 = 360, - crop.area=ext.L[[x]], - cropname=x) }) - -#Validation years files 361 - 480 -- years 2010 - 2020 - lapply(cities, function(x){ hads_read_crop(var = var, - fp= paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), - i1 = 361, i2 = 480, + fp= paste0(dd, "Processed/HadsUKgrid/resampled_calendarfix/"), + rd= paste0(dd, "Cropped/three.cities/Hads.updated360/"), + file.date="19801201", #Start from the same date as the CPM crop.area=ext.L[[x]], cropname=x) }) -### Group the CPM to cal, val and projection -runs <- c("05", "07", "08", "06") -var <- c("tasmax", "tasmin","pr") - -for(x in cities){ - for(r in runs){ - for(v in var){ - p <- paste0(dd, "Interim/CPM/three.cities/", x, "/") - files <- list.files(p) - - files.y.v <- files[grepl("day_1980|day_2000", files)&grepl(v, files)&grepl(paste0(r, "_day"), files)] - - dfL <- lapply(files.y.v, function(n){ - f <- paste0(p, n) - r <- rast(f) - }) - - R <- dfL %>% reduce(c) - - #Write directory - rp <- paste0(dd, "Interim/CPM/three.cities/", x, "/grouped/",x, "_") #adding in cropname to write, I think will make easier to track - - fn <- paste0(rp, v, "_", r,"_calibration_1980-2010.tif") - writeRaster(R, fn, overwrite=TRUE) - - gc() - } -} -} - -#For validation I just copied over and renamed the files as they were already split that way - -## Projection years - -for(x in cities){ - for(r in runs){ - for(v in var){ - p <- paste0(dd, "Interim/CPM/three.cities/", x, "/") - files <- list.files(p) - - files.y.v <- files[grepl("day_2020|day_2040|day_2060", files)&grepl(v, files)&grepl(paste0(r, "_day"), files)] - - dfL <- lapply(files.y.v, function(n){ - f <- paste0(p, n) - r <- rast(f) - }) - - - R <- dfL %>% reduce(c) - - #Write directory - rp <- paste0(dd, "Interim/CPM/three.cities/", x, "/grouped/",x, "_") #adding in cropname to write, I think will make easier to track - - fn <- paste0(rp, v, "_", r,"_projection_2020-2080.tif") - writeRaster(R, fn, overwrite=TRUE) - - gc() - } - } -} From b1958554b27163b21d9273ceffea45b5070c5d1f Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 12 Sep 2023 13:15:54 +0000 Subject: [PATCH 14/83] Adding 
metrics, sorting out fig generation --- R/LCAT/Assessing.BC.data.RMD | 116 +++++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 12 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index 9cbeaeee..d5be3960 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -63,17 +63,16 @@ Load in Hads validation data **The calibration period is 2009-12-01 to 2019-11-30 to relate to the CPM month grouping** -```{r} - -fp <- "/mnt/vmfileshare/ClimateData/Processed/HadsUKgrid/resampled_2.2km/tasmax/day/" -hads.tasmax <- list.files(fp) -i <- grep("20091201-", hads.tasmax) -ii <- grep("20191130", hads.tasmax) -hads.tasmax.i <- hads.tasmax[i:ii] - -source("/home/dyme/Desktop/clim-recal/clim-recal/R/misc/read_crop.fn.R") +Hads data were also cropped to the regional files for the calibration years - some of the dates might need to be added from the observation (or just be ignored for ease) +```{r} +fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") +f <- list.files(fp) +v <- "tasmax" +reg <- "UKI" +f <- f[grepl("2010_2020", f)&grepl(v,f)&grepl(reg, f)] +obs.val.df <- read.csv(paste0(fp,f)) # Starts here from 2010 - 01 -01 -- because for the BC I removed these vals to align with the cpm years we're missing the month of Dec - so need to update the cpm data to reflect this in the assessment -- wont be a problem for other BC data ``` @@ -94,8 +93,8 @@ crs <- crs(rast) ## Convert from matix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise London.df <- lapply(London, - function(x){ - df <- as.data.frame(t(x)) + function(d){ + df <- as.data.frame(t(d)) rn <- row.names(df) #The x_y coords were saves as rownames x <- gsub("_.*", "", rn) y <- gsub(".*_", "", rn) @@ -330,13 +329,106 @@ ggplot(dfg_sm_s) + I think visualising the daily data is not mega helpful, but now grouping to season and calculating the seasonal maxima vals (i.e. rather than means above) +#### Create validaton df list + +Adding in the observational HADs data and aligning based on dates + +*Note* So as not to re-run the UK wide LCAT data processing, a workaround was added to the bias correction function used to group the obs data - this means that to align the validation cpm data we have to remove a month in the beginning - ie the LCAT specific + +20191201-20201130_30 - runs to the end of the month of December of 2019 -- so should map the obvs.val + +```{r} + +#Extract validation period of raw and bias corrected CPM data +cpm.val.dfs <- lapply(London.df[c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ + i <- grep("20191201-20201130_30", names(x))[1] + df <- x[,1:i] +}) + +#Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year +remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") +remove <- paste0(remove, collapse = "|") + +obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] +row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) + +val.dfs <- c(list(obs.val.df), cpm.val.dfs) +names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") +``` + + #### *Validation period - annual trends - seasonal mean* + + #### *Validation period - annual trends - seasonal max* ## **2. 
Bias Correction Assessment: Metrics** -Add in HADs data from the cal period +Using the validation data set for this + +Most metrics will just require vectors of values at this point, although it would be nice to have the georeferenced incorporated incase values spatially vary depending on eg topography + +```{r} +val.dfs.v <- lapply(val.dfs, function(x){ + #Remove x and y + x$x <- NULL + x$y <- NULL + #Convert to single vector + unlist(as.vector(x)) +}) + +val.dfs.v.df <- val.dfs.v %>% reduce(cbind) +val.dfs.v.df <- as.data.frame(val.dfs.v.df) +``` + +### **2a. Descriptive statistics** + +```{r descriptives validation} + +descriptives <- lapply(val.dfs.v, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), per10th=per$X10.,per90th=per$X90.) +}) + +descriptives <- descriptives %>% reduce(rbind) +row.names(descriptives) <- c("obs", "raw.cpm", "bc1.cpm", "bc2.cpm") +descriptives +``` + + + +#### **Distribution** + +```{r} + +cal.val.dfg <- reshape2::melt(as.matrix(val.dfs.v)) + + +``` + + +### **2b. RMSE** + +sqrt(mean((data$actual - data$predicted)^2)) + +```{r} + +actual <- val.dfs.v$obs.val.df + +rmse <- sapply(val.dfs.v[c(2:4)], function(x){ + sqrt(mean((actual - x)^2)) +}) + +data.frame(as.list(rmse), row.names = "RMSE") +``` + +In this example, the first bias correction has a lower RMSE (just!) and therefore is better fitting than the raw + +# Taylor diagram + + +## **3. Bias Correction Assessment: Metric specific - tasmax** ### mean by cell From 4e8be762eb7ea4c6e3f965960d134314ad5ae894 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 12 Sep 2023 16:38:46 +0000 Subject: [PATCH 15/83] Adding in all runs rather than one and extracting rasters for general assesment --- R/LCAT/Assessing.BC.data.RMD | 171 +++++++++++++++++++++++------------ 1 file changed, 114 insertions(+), 57 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index d5be3960..7eb7009c 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -17,6 +17,7 @@ library(ggplot2) library(terra) library(tmap) #pretty maps library(RColorBrewer) +library(plotrix) #For taylor diagrams dd <- "/mnt/vmfileshare/ClimateData/" @@ -54,8 +55,13 @@ Using the London region (UKI) as this is the smallest -- not this is the same re ```{r} -London <- readRDS(paste0(dd,"/Debiased/R/QuantileMapping/resultsLRun08_UKI_tasmax.RDS")) +runs <- c("Run05", "Run06", "Run07", "Run08") +London.allruns <- lapply(runs, function(i){ + rds <- paste0(dd,"/Debiased/R/QuantileMapping/resultsL",i,"_UKI_tasmax.RDS") + readRDS(rds)}) + +names(London.allruns) <- runs ``` Load in Hads validation data @@ -79,8 +85,9 @@ obs.val.df <- read.csv(paste0(fp,f)) # Starts here from 2010 - 01 -01 -- because ### **1b. 
Check trends** -The next set of chunks visualise the data by converting back to raster, and by looking at the trends of data across all time periods +The next set of chunks visualise the data +This next chunk converts the dfs back to raster with the correct CRS ```{r convert to df and raster} ## Load a source raster to extract the crs @@ -91,39 +98,94 @@ rast <- rast(rp) crs <- crs(rast) -## Convert from matix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise -London.df <- lapply(London, - function(d){ - df <- as.data.frame(t(d)) - rn <- row.names(df) #The x_y coords were saves as rownames - x <- gsub("_.*", "", rn) - y <- gsub(".*_", "", rn) - xy <- data.frame(x=x,y=y) - df <- cbind(xy,df) +## Convert from matrix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise +London.df.rL <- lapply(runs, function(i){ + L <- London.allruns[[i]] + lapply(L, function(x){ + df <- t(x) + df <- as.data.frame(df) + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df)}) }) +names(London.df.rL) <- runs + ## Convert to rasters -London.rasts <- lapply(London.df, function(x){ - r <- rast(x, type="xyz") - crs(r) <- crs - return(r) +London.rasts <- lapply(runs, function(i){ + L <- London.df.rL[[i]] + lapply(L, function(x){ + r <- rast(x, type="xyz") + crs(r) <- crs + return(r)}) }) +names(London.rasts) <- runs + ``` -#### *Raster vis comparison* +#### **Raster vis comparison** Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days +(Note: I'm just plotting the bias corrected with linear interpolation so as to overwhelm with plots) + +##### *Day 1 - 1980-12-01* + +```{r, figures-side, fig.show="hold", out.width="33%"} +tm_shape(London.rasts$Run05$t.obs[[1]]) + tm_raster(title="Observation, 1980-12-01") #Obviously just one call of the observation +tm_shape(London.rasts$Run05$t.cal[[1]]) + tm_raster(title="Calibration, Run 05, Raw 1980-12-01") +tm_shape(London.rasts$Run06$t.cal[[1]]) + tm_raster(title="Calibration, Run 06, Raw 1980-12-01") +tm_shape(London.rasts$Run07$t.cal[[1]]) + tm_raster(title="Calibration, Run 07, Raw 1980-12-01") +tm_shape(London.rasts$Run08$t.cal[[1]]) + tm_raster(title="Calibration, Run 08, Raw 1980-12-01") +tm_shape(London.rasts$Run05$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 05, BC 1980-12-01") +tm_shape(London.rasts$Run06$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 06, BC 1980-12-01") +tm_shape(London.rasts$Run07$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 07, BC 1980-12-01") +tm_shape(London.rasts$Run08$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 08, BC 1980-12-01") + + +``` + +##### *Day 2 - 1991-06-01* + +Just to note I was so suprised by how much lower the observation data was for this raster I loaded the raw HADs to check (in resampled_2.2km/tasmax and the original 1km grid it does reflect it - it just seems very low) + +```{r, figures-side, fig.show="hold", out.width="33%"} +tm_shape(London.rasts$Run05$t.obs[[3781]]) + tm_raster(title="Observation, 1991-06-01") #Obviously just one call of the observation +tm_shape(London.rasts$Run05$t.cal[[3781]]) + tm_raster(title="Calibration, Run 05, Raw 1991-06-01") +tm_shape(London.rasts$Run06$t.cal[[3781]]) + tm_raster(title="Calibration, Run 06, Raw 1991-06-01") 
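+# NB: this chunk reuses the label "figures-side" from the Day 1 chunk above (and
+# the Day 3 chunk below repeats it again); knitr stops on duplicate chunk labels
+# unless options(knitr.duplicate.label = "allow") is set, so unique labels per
+# day are probably needed before this document will knit.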
+tm_shape(London.rasts$Run07$t.cal[[3781]]) + tm_raster(title="Calibration, Run 07, Raw 1991-06-01") +tm_shape(London.rasts$Run08$t.cal[[3781]]) + tm_raster(title="Calibration, Run 08, Raw 1991-06-01") +tm_shape(London.rasts$Run05$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 05, BC 1991-06-01") +tm_shape(London.rasts$Run06$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 06, BC 1991-06-01") +tm_shape(London.rasts$Run07$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 07, BC 1991-06-01") +tm_shape(London.rasts$Run08$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 08, BC 1991-06-01") + +``` + + + +##### *Day 3 - 2000-08-01* + + +```{r, figures-side, fig.show="hold", out.width="33%"} +tm_shape(London.rasts$Run05$t.obs[[7081]]) + tm_raster(title="Observation, 2000-08-01") #Obviously just one call of the observation +tm_shape(London.rasts$Run05$t.cal[[7081]]) + tm_raster(title="Calibration, Run 05, Raw 2000-08-01") +tm_shape(London.rasts$Run06$t.cal[[7081]]) + tm_raster(title="Calibration, Run 06, Raw 2000-08-01") +tm_shape(London.rasts$Run07$t.cal[[7081]]) + tm_raster(title="Calibration, Run 07, Raw 2000-08-01") +tm_shape(London.rasts$Run08$t.cal[[7081]]) + tm_raster(title="Calibration, Run 08, Raw 2000-08-01") +tm_shape(London.rasts$Run05$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 05, BC 2000-08-01") +tm_shape(London.rasts$Run06$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 06, BC 2000-08-01") +tm_shape(London.rasts$Run07$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 07, BC 2000-08-01") +tm_shape(London.rasts$Run08$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 08, BC 2000-08-01") -```{r} -tm_shape(London.rasts$t.obs[[1]]) + tm_raster(title="Observation, 1980-12-01") -tm_shape(London.rasts$t.cal[[1]]) + tm_raster(title="Calibration, 1980-12-01") -tm_shape(London.rasts$qm1.hist.a[[1]]) + tm_raster(title="Calibration, bias corrected, linear 1980-12-01") -tm_shape(London.rasts$qm1.hist.b[[1]]) + tm_raster(title="Calibration, bias corrected, tricubic 1980-12-01") ``` + #### *Annual trends - Calibration period - daily mean* +#here convert to runs aspect ```{r} London.dfg <- lapply(names(London.df), function(i){ @@ -202,10 +264,15 @@ obs.daymeans.df$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), "Winter", ifelse(grepl("0331_|0430_|0531_", x), "Spring", ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) -#A note here - the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run - +#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run -# Mutate to a seasonal mean +#Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 +year <- gsub("^[^_]*_", "", x) +year <- as.numeric(substr(year, 1,4)) +obs.daymeans.df$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(obs.daymeans.df$season, year-1), + paste0(obs.daymeans.df$season, year)) +# Mutate to a seasonal mean df obs.seasonal.mean.df <- obs.daymeans.df %>% group_by(season_year) %>% mutate(mean.seasonal = mean(t.obs.mean), @@ -222,55 +289,45 @@ obs.seasonal.mean.df$model <- "obs" ```{r} #lapply needs to needed -London.dfg[c("raw.cal.daymeans", 
"bc.b.cal.daymeans", "bc.a.cal.daymeans")] - -#lapply for remaining dfs +London.dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans", "bc.a.cal.daymeans"), function(i){ + df <- London.dfg[[i]] x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) #The CPM days are consecutive 1 - 360 by year - winter <- paste0(30, "_", 1:90, collapse="|") - spring <- paste0(30, "_", 91:180, collapse="|") - summer <- paste0(30, "_", 181:270, collapse="|") -``` -#HERE - sorting out below season -```{r} - df$season <- ifelse(grepl(winter, x), "Winter", - ifelse(grepl(spring, x), "Spring", - ifelse(grepl(summer, x), "Summer", "Autumn"))) + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + +#Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 +year <- gsub(".*day_", "", df$day) +year <- as.numeric(substr(year, 1,4)) +df$season_year <- ifelse(x>29&x<91, + paste0(df$season, year-1), + paste0(df$season, year)) # Mutate to a seasonal mean -obs.seasonal.mean.df <- obs.daymeans.df %>% +df <- obs.daymeans.df %>% group_by(season_year) %>% mutate(mean.seasonal = mean(t.obs.mean), sd.high.seasonal = mean.seasonal + sd(t.obs.mean), sd.low.seasonal = mean.seasonal - sd(t.obs.mean)) -obs.seasonal.mean.df <- obs.seasonal.mean.df %>% +df <- df %>% dplyr::select(season_year:sd.low.seasonal) %>% distinct() +ii <- gsub(".daymeans", "",i) -obs.seasonal.mean.df$model <- "obs" -``` - +df$model <- ii +as.data.frame(df)}) -```{r Raw trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean - -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 19) -year.miss <- rep(2041:2059, each=4) +names(London.dfg.seasonal.mean) +``` -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) +```{r} -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) -dfg_sm <- dfg_sm[order(dfg_sm$year),] ``` From 049e74933c6a6e6c8d5fb98ac877cfaadbb028b5 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 12 Sep 2023 22:18:29 +0000 Subject: [PATCH 16/83] Adding loops to deal with the four runs simultaneously --- R/LCAT/Assessing.BC.data.RMD | 179 +++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 80 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index 7eb7009c..47dc7c19 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -185,55 +185,68 @@ tm_shape(London.rasts$Run08$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, #### *Annual trends - Calibration period - daily mean* -#here convert to runs aspect ```{r} - -London.dfg <- lapply(names(London.df), function(i){ - dfi <- London.df[[i]] - x <- 3:ncol(dfi) - - dfx <- lapply(x, function(x){ - y <- dfi[,x] - mean <- mean(y, na.rm=T) - sd <- sd(y, na.rm=T) - dfr <- data.frame(mean=mean, +#Returns a list of dfs in handy format for graphing +London.dfg.rL <- lapply(runs, function(i){ + L <- London.df.rL[[i]] + dfg <- lapply(names(L), function(ii){ + dfi <- L[[ii]] + x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y + #Calc mean and sd + dfx <- lapply(x, function(x){ + y <- dfi[,x] + mean <- mean(y, na.rm=T) + sd <- sd(y, na.rm=T) + dfr <- data.frame(mean=mean, sd.high=mean+sd, sd.low=mean-sd) - names(dfr) <- paste0(i,".",names(dfr)) - dfr$day <- names(dfi)[x] - return(dfr) - }) + dfr$day <- 
names(dfi)[x] + return(dfr) + }) - dfx_g <- dfx %>% purrr::reduce(rbind) -}) + dfx_g <- dfx %>% purrr::reduce(rbind) + }) -names(London.dfg) <- c("obs.daymeans", "raw.cal.daymeans", + names(dfg) <- c("obs.daymeans", "raw.cal.daymeans", "raw.proj.daymeans", "bc.a.cal.daymeans", "bc.b.cal.daymeans", "bc.a.proj.daymeans", "bc.b.proj.daymeans") + + return(dfg) +}) +names(London.dfg.rL) <- runs ``` + ```{r} -#Add a day index to align the cal and obs +#Create a df for all of the runs to plot +##Add a day index to align the cal and obs -London.dfg.calp <- London.dfg[c("obs.daymeans", "raw.cal.daymeans", +London.dfg.calp.L <- lapply(runs, function(i){ + dfg <- London.dfg.rL[[i]] + dfg.calp <- dfg[c("obs.daymeans", "raw.cal.daymeans", "bc.b.cal.daymeans", "bc.a.cal.daymeans")] -London.dfg.calp <- lapply(London.dfg.calp, function(x){ - x$dayi <- 1:nrow(x) - x$day<- NULL - return(x) -}) + dfg.calp <- lapply(dfg.calp, function(x){ + x$dayi <- 1:nrow(x) + x$day<- NULL + return(x) + }) + + dfg.calp <- dfg.calp %>% reduce(merge, "dayi") + dfg.calp$Run <- i + return(dfg.calp)}) -London.dfg.calp <- London.dfg.calp %>% reduce(merge, "dayi") +names(London.dfg.calp.L) <- runs + +London.dfg.calp <- London.dfg.calp.L %>% reduce(rbind) -head(London.dfg.calp) ``` ```{r} -London.dfg.calp_m <- reshape2::melt(London.dfg.calp, id="dayi") #create long df for plotting multiple lines +London.dfg.calp_m <- reshape2::melt(London.dfg.calp, id=c("dayi", "Run")) #create long df for plotting multiple lines London.dfg.calp_mm <- London.dfg.calp_m[grepl(".mean", London.dfg.calp_m$variable),] #For easy vis, only keep mean vals ``` @@ -242,7 +255,8 @@ London.dfg.calp_mm <- London.dfg.calp_m[grepl(".mean", London.dfg.calp_m$variabl ```{r Historic trend 1} ggplot(London.dfg.calp_mm, aes(dayi, value, group=variable, colour=variable)) + - geom_line() + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + theme_bw() + ylab("Av daily max temp oC") + ggtitle("Tasmax Hisotric trends") + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day, 1980.12.01 - 2009.12.01") + @@ -256,75 +270,80 @@ Annotate season based on month index - the dates have different formats dependin ```{r} -#Hads/obs df -obs.daymeans.df <- London.dfg$obs.daymeans +seasonal.means <- lapply(runs, function(r){ + dfg <- London.dfg.rL[[r]] + #Hads/obs df + obs.daymeans.df <- dfg$obs.daymeans -x <- obs.daymeans.df$day -obs.daymeans.df$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), "Winter", + x <- obs.daymeans.df$day + obs.daymeans.df$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", ifelse(grepl("0331_|0430_|0531_", x), "Spring", ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) #Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run -#Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 -year <- gsub("^[^_]*_", "", x) -year <- as.numeric(substr(year, 1,4)) -obs.daymeans.df$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub("^[^_]*_", "", x) + year <- as.numeric(substr(year, 1,4)) + obs.daymeans.df$season_year <- ifelse(grepl("0131_|0228_|0229_", x), paste0(obs.daymeans.df$season, year-1), paste0(obs.daymeans.df$season, 
year)) -# Mutate to a seasonal mean df -obs.seasonal.mean.df <- obs.daymeans.df %>% - group_by(season_year) %>% - mutate(mean.seasonal = mean(t.obs.mean), - sd.high.seasonal = mean.seasonal + sd(t.obs.mean), - sd.low.seasonal = mean.seasonal - sd(t.obs.mean)) - -obs.seasonal.mean.df <- obs.seasonal.mean.df %>% - dplyr::select(season_year:sd.low.seasonal) %>% distinct() - -#Grouping variable for later vars -obs.seasonal.mean.df$model <- "obs" -``` - -```{r} -#lapply needs to needed - -London.dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans", "bc.a.cal.daymeans"), function(i){ - df <- London.dfg[[i]] - x <- df$day - x <- gsub(".*_", "", x) - x <- as.numeric(x) - #The CPM days are consecutive 1 - 360 by year - df$season <- ifelse(x<91, "Winter", + # Mutate to a seasonal mean df + obs.seasonal.mean.df <- obs.daymeans.df %>% + group_by(season_year) %>% + mutate(mean.seasonal = mean(mean), + sd.high.seasonal = mean.seasonal + sd(mean), + sd.low.seasonal = mean.seasonal - sd(mean)) + + obs.seasonal.mean.df <- obs.seasonal.mean.df %>% + dplyr::select(season_year:sd.low.seasonal) %>% distinct() + + #Grouping variable for later vars + obs.seasonal.mean.df$model <- "obs" + + + dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans", + "bc.a.cal.daymeans"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", ifelse(x<181, "Spring", ifelse(x<271, "Summer", "Autumn"))) - -#Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 -year <- gsub(".*day_", "", df$day) -year <- as.numeric(substr(year, 1,4)) -df$season_year <- ifelse(x>29&x<91, + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, paste0(df$season, year-1), paste0(df$season, year)) + + # Mutate to a seasonal mean + df <- df %>% + group_by(season_year) %>% + mutate(mean.seasonal = mean(mean), + sd.high.seasonal = mean.seasonal + sd(mean), + sd.low.seasonal = mean.seasonal - sd(mean)) -# Mutate to a seasonal mean -df <- obs.daymeans.df %>% - group_by(season_year) %>% - mutate(mean.seasonal = mean(t.obs.mean), - sd.high.seasonal = mean.seasonal + sd(t.obs.mean), - sd.low.seasonal = mean.seasonal - sd(t.obs.mean)) - -df <- df %>% - dplyr::select(season_year:sd.low.seasonal) %>% distinct() + df <- df %>% + dplyr::select(season_year:sd.low.seasonal) %>% distinct() -ii <- gsub(".daymeans", "",i) + ii <- gsub(".daymeans", "",i) -df$model <- ii -as.data.frame(df)}) + df$model <- ii + as.data.frame(df)}) -names(London.dfg.seasonal.mean) + dff <- c(list(obs.seasonal.mean.df), dfg.seasonal.mean) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + ``` - +#Here +check above chunk works and graph as in the annual trend ```{r} From 5791e977f682db89ad47f626d3896d7b0b4dedd0 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Fri, 15 Sep 2023 12:54:31 +0000 Subject: [PATCH 17/83] Updating with more figs --- R/LCAT/Assessing.BC.data.RMD | 260 +++++++++++++++++++++++++---------- 1 file changed, 187 insertions(+), 73 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index 47dc7c19..d4757c38 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -17,6 
+17,7 @@ library(ggplot2) library(terra) library(tmap) #pretty maps library(RColorBrewer) +library(tidyverse) library(plotrix) #For taylor diagrams dd <- "/mnt/vmfileshare/ClimateData/" @@ -132,7 +133,7 @@ names(London.rasts) <- runs Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days (Note: I'm just plotting the bias corrected with linear interpolation so as to overwhelm with plots) -##### *Day 1 - 1980-12-01* +##### Fig. *Day 1 - 1980-12-01* ```{r, figures-side, fig.show="hold", out.width="33%"} tm_shape(London.rasts$Run05$t.obs[[1]]) + tm_raster(title="Observation, 1980-12-01") #Obviously just one call of the observation @@ -148,7 +149,7 @@ tm_shape(London.rasts$Run08$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run ``` -##### *Day 2 - 1991-06-01* +##### Fig. *Day 2 - 1991-06-01* Just to note I was so suprised by how much lower the observation data was for this raster I loaded the raw HADs to check (in resampled_2.2km/tasmax and the original 1km grid it does reflect it - it just seems very low) @@ -167,7 +168,7 @@ tm_shape(London.rasts$Run08$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, -##### *Day 3 - 2000-08-01* +##### Fig. *Day 3 - 2000-08-01* ```{r, figures-side, fig.show="hold", out.width="33%"} @@ -183,12 +184,14 @@ tm_shape(London.rasts$Run08$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, ``` -#### *Annual trends - Calibration period - daily mean* +#### **Calibration period - annual trends** + ```{r} #Returns a list of dfs in handy format for graphing London.dfg.rL <- lapply(runs, function(i){ L <- London.df.rL[[i]] + names(L)[1:3] <- c("obs", "cal", "proj") dfg <- lapply(names(L), function(ii){ dfi <- L[[ii]] x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y @@ -200,6 +203,7 @@ London.dfg.rL <- lapply(runs, function(i){ dfr <- data.frame(mean=mean, sd.high=mean+sd, sd.low=mean-sd) + names(dfr) <- paste0(ii, ".", names(dfr)) dfr$day <- names(dfi)[x] return(dfr) }) @@ -233,7 +237,8 @@ London.dfg.calp.L <- lapply(runs, function(i){ x$day<- NULL return(x) }) - + + dfg.calp <- dfg.calp %>% reduce(merge, "dayi") dfg.calp$Run <- i return(dfg.calp)}) @@ -248,9 +253,10 @@ London.dfg.calp <- London.dfg.calp.L %>% reduce(rbind) London.dfg.calp_m <- reshape2::melt(London.dfg.calp, id=c("dayi", "Run")) #create long df for plotting multiple lines -London.dfg.calp_mm <- London.dfg.calp_m[grepl(".mean", London.dfg.calp_m$variable),] #For easy vis, only keep mean vals +London.dfg.calp_mm <- London.dfg.calp_m[grepl("mean", London.dfg.calp_m$variable),] #For easy vis, only keep mean vals ``` +#### Fig. 
Calibration period - annual mean ```{r Historic trend 1} @@ -264,7 +270,7 @@ ggplot(London.dfg.calp_mm, aes(dayi, value, group=variable, colour=variable)) + ``` -#### *Annual trends - Calibration period - seasonal mean* +#### **Seasonal trends - Calibration period ** Annotate season based on month index - the dates have different formats depending on the input data (ie hads vs cpm) so am pulling out the necessary to adjust sep @@ -287,17 +293,15 @@ seasonal.means <- lapply(runs, function(r){ year <- gsub("^[^_]*_", "", x) year <- as.numeric(substr(year, 1,4)) obs.daymeans.df$season_year <- ifelse(grepl("0131_|0228_|0229_", x), - paste0(obs.daymeans.df$season, year-1), - paste0(obs.daymeans.df$season, year)) - # Mutate to a seasonal mean df - obs.seasonal.mean.df <- obs.daymeans.df %>% - group_by(season_year) %>% - mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - - obs.seasonal.mean.df <- obs.seasonal.mean.df %>% - dplyr::select(season_year:sd.low.seasonal) %>% distinct() + paste0(year-1, obs.daymeans.df$season), + paste0(year, obs.daymeans.df$season)) + # Mutate to a seasonal mean df + obs.seasonal.mean.df <- aggregate(obs.daymeans.df[[1]], list(obs.daymeans.df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x))) + obs.seasonal.mean.df<- data.frame(season_year=obs.seasonal.mean.df$Group.1, + seasonal.mean=obs.seasonal.mean.df$x[,"seasonal.mean"], + sd.high.seasonal = obs.seasonal.mean.df$x[,"sd.high.seasonal"], + sd.low.seasonal = obs.seasonal.mean.df$x[,"sd.low.seasonal"]) + #Grouping variable for later vars obs.seasonal.mean.df$model <- "obs" @@ -318,92 +322,196 @@ seasonal.means <- lapply(runs, function(r){ year <- gsub(".*day_", "", df$day) year <- as.numeric(substr(year, 1,4)) df$season_year <- ifelse(x>29&x<91, - paste0(df$season, year-1), - paste0(df$season, year)) + paste0(year-1, df$season), + paste0(year, df$season)) - # Mutate to a seasonal mean - df <- df %>% - group_by(season_year) %>% - mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - - df <- df %>% - dplyr::select(season_year:sd.low.seasonal) %>% distinct() + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x))) + + df2 <- data.frame(season_year=df2$Group.1, + seasonal.mean=df2$x[,"seasonal.mean"], + sd.high.seasonal = df2$x[,"sd.high.seasonal"], + sd.low.seasonal = df2$x[,"sd.low.seasonal"]) - ii <- gsub(".daymeans", "",i) + df2$model <- gsub(".daymeans","",i) - df$model <- ii - as.data.frame(df)}) + return(df2)}) dff <- c(list(obs.seasonal.mean.df), dfg.seasonal.mean) %>% reduce(rbind) dff$Run <- r return(dff) }) - -``` -#Here -check above chunk works and graph as in the annual trend -```{r} +names(seasonal.means) <- runs -``` +seasonal.means.df <- seasonal.means %>% reduce(rbind) +``` -**'Raw' - seasonal** +#### Fig. 
Calibration period - seasonal mean -```{r Raw seasonal} +```{r} -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + +ggplot(seasonal.means.df, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") ``` -**'Raw' - Winter only** +##### *Summer only* ```{r Raw seasonal winter} -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) +dfg_sm<- subset(seasonal.means.df, grepl("Summer", season_year)) -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") ``` +It looks purple because the two bc methods aren't revealing much difference so subsetting to just one instead -**'Raw' - Summer only** - -```{r Raw seasonal summer} +```{r} -dfg_sm_s <- subset(dfg_sm, grepl("Summer", season_year)) +dfg_sm<- subset(seasonal.means.df, !grepl("bc.b.cal", model)&grepl("Summer", season_year)) -ggplot(dfg_sm_s) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="darkgoldenrod", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="darkred", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax seasonal - Summer only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") ``` #### *Annual trends - seasonal max* -I think visualising the daily data is not mega helpful, but now grouping to season and calculating the seasonal maxima vals (i.e. rather than means above) +For tasmax - grouping to season and calculating the seasonal maxima vals (i.e. 
rather than means above) + +```{r} + +#Convert to max, out put a df in easy fig format +London.dfg.max <- lapply(runs, function(r){ + L <- London.df.rL[[r]] + names(L)[1:3] <- c("obs", "cal", "proj") + dfg <- lapply(names(L), function(ii){ + dfi <- L[[ii]] + x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y + #Calc maxima of the + dfx <- lapply(x, function(x){ + xx <- dfi[,x] + data.frame(max=max(xx, na.rm=T), day= names(dfi)[x]) + }) + + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + names(dfg) <- paste0(names(L), ".max") + return(dfg) +}) + +names(London.dfg.max) <- runs + +seasonal.max.cal <- lapply(runs, function(r){ + dfg <- London.dfg.max[[r]] + #Hads/obs df + df1 <- dfg$obs.max + x <- df1$day + df1$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub("^[^_]*_", "", x) + year <- as.numeric(substr(year, 1,4)) + df1$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, df1$season), + paste0(year, df1$season)) + # Mutate to a seasonal mean df + obs.seasonal.max.df <- aggregate(df1[[1]], list(df1[["season_year"]]), max) + #Grouping variable for later vars + obs.seasonal.max.df$model <- "obs" + + dfg.seasonal.max <- lapply(c("cal.max", "qm1.hist.a.max", + "qm1.hist.b.max"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, + paste0(year-1, df$season), + paste0(year, df$season)) + + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), max) + + df2$model <- gsub(".max","",i) + + return(df2)}) + + dff <- c(list(obs.seasonal.max.df), dfg.seasonal.max) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + +names(seasonal.max.cal) <- runs + +seasonal.maxima.df <- seasonal.max.cal %>% reduce(rbind) +names(seasonal.maxima.df) <- c("season_year", "max", "model", "Run") +``` + +#### Fig. Calibration period - seasonal max + +```{r} + +ggplot(seasonal.maxima.df, aes(season_year, max, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Max daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + +#### Fig. 
Calibration period - *Summer only* + +```{r} + +dfg_sm<- subset(seasonal.maxima.df, !grepl("qm1.hist.b", model)&grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` +# here #### Create validaton df list @@ -415,8 +523,8 @@ Adding in the observational HADs data and aligning based on dates ```{r} -#Extract validation period of raw and bias corrected CPM data -cpm.val.dfs <- lapply(London.df[c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ +#Extract validation period of raw and bias corrected CPM data -- need to update the 1 +cpm.val.dfs <- lapply(London.df[[1]][c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ i <- grep("20191201-20201130_30", names(x))[1] df <- x[,1:i] }) @@ -446,6 +554,8 @@ Using the validation data set for this Most metrics will just require vectors of values at this point, although it would be nice to have the georeferenced incorporated incase values spatially vary depending on eg topography +**to do** I think also extracting and aligning a similar cal.dfs would really help here + ```{r} val.dfs.v <- lapply(val.dfs, function(x){ #Remove x and y @@ -487,7 +597,6 @@ cal.val.dfg <- reshape2::melt(as.matrix(val.dfs.v)) ### **2b. RMSE** -sqrt(mean((data$actual - data$predicted)^2)) ```{r} @@ -507,4 +616,9 @@ In this example, the first bias correction has a lower RMSE (just!) and therefor ## **3. Bias Correction Assessment: Metric specific - tasmax** + +#### **For future work** + +I used 100 quantiles but obviously more within qmap would help! 
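
A minimal sketch of how the quantile resolution could be increased, assuming the empirical quantile mapping is fitted with `qmap::fitQmapQUANT()`/`doQmapQUANT()` (as the linear/tricubic outputs and the `qmap` scripts in this repo suggest). The `obs.cal`/`cpm.cal` vectors below are synthetic stand-ins for the observed and raw CPM calibration series, and the `qstep` value is illustrative:

```r
# Illustrative only: synthetic stand-ins for the observed (HADs) and raw CPM
# calibration series; swap in the real vectors/matrices used elsewhere.
library(qmap)

set.seed(1)
obs.cal <- rnorm(10000, mean = 12, sd = 4)   # placeholder observed tasmax
cpm.cal <- rnorm(10000, mean = 14, sd = 5)   # placeholder modelled tasmax

# qstep = 0.001 fits ~1000 empirical quantiles rather than ~100 (qstep = 0.01)
qm.fit <- fitQmapQUANT(obs.cal, cpm.cal,
                       qstep = 0.001, nboot = 1,
                       wet.day = FALSE)       # tasmax, so no wet-day correction

qm1.hist.a <- doQmapQUANT(cpm.cal, qm.fit, type = "linear")  # linear interpolation
qm1.hist.b <- doQmapQUANT(cpm.cal, qm.fit, type = "tricub")  # tricubic interpolation
```

Whether the finer step is worth the extra fitting time would need checking at the regional scale.
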
+ ### mean by cell From bd9d793527442089c944132594cf457b0987041b Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 19 Sep 2023 15:21:36 +0000 Subject: [PATCH 18/83] conversion of val dfs to run over all runs --- R/LCAT/Assessing.BC.data.RMD | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index d4757c38..83fb8b65 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -506,12 +506,11 @@ ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + geom_line(alpha=0.7) + facet_wrap(.~Run) + theme_bw() + ylab("Av daily max temp oC -Summer average") + - ggtitle("Tasmax Hisotric trends") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + ggtitle("Tasmax Historic trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal Summer averages, 1980.12.01 - 2009.12.01") + scale_color_brewer(palette="Set1", name="Model") ``` -# here #### Create validaton df list @@ -521,23 +520,27 @@ Adding in the observational HADs data and aligning based on dates 20191201-20201130_30 - runs to the end of the month of December of 2019 -- so should map the obvs.val +#HERE + ```{r} #Extract validation period of raw and bias corrected CPM data -- need to update the 1 -cpm.val.dfs <- lapply(London.df[[1]][c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ - i <- grep("20191201-20201130_30", names(x))[1] - df <- x[,1:i] -}) +val.dfs <- lapply(London.dfg.rL, function(x)) + London.df <- London.dfg.rL + cpm.val.dfs <- lapply(London.df[[1]][c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ + i <- grep("20191201-20201130_30", names(x))[1] + df <- x[,1:i] + }) -#Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year -remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") -remove <- paste0(remove, collapse = "|") + #Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") -obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] -row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) + obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] + row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) -val.dfs <- c(list(obs.val.df), cpm.val.dfs) -names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") + val.dfs <- c(list(obs.val.df), cpm.val.dfs) + names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") ``` From 894efea704bc1b224b6c08a22c9c115da6d2f97a Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 19 Sep 2023 15:23:06 +0000 Subject: [PATCH 19/83] update read_crop.fn.R - this could have been pushed already? 
--- R/misc/read_crop.fn.R | 218 ++++++++++++++++++++++++++++-------------- 1 file changed, 144 insertions(+), 74 deletions(-) diff --git a/R/misc/read_crop.fn.R b/R/misc/read_crop.fn.R index fd0335a9..48c235fb 100644 --- a/R/misc/read_crop.fn.R +++ b/R/misc/read_crop.fn.R @@ -4,111 +4,181 @@ # A function to read in specific runs, vars and years, crop them to an area (optionally) and write vals to a georef'd df cpm_read_crop <- function(runs, #Character vector of selected runs as number only eg Run08 is "08" - var, #Character vector of selected variables - this might need changing - fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") - year1, #Numeric, first year of segment - year2, #Numeric, lastyear of segment - crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work - cropname){ #Character - name of crop to be assigned to the returned vect - + var, #Character vector of selected variables - this might need changing + fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") + rd, #path to results directory eg paste0(dd, "Cropped/three.cities/CPM/") + crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work + cropname){ #Character - name of crop to be assigned to the returned vect + runs <- runs var <- var fp <- fp - years <- paste0(year1:year2, "1201", collapse="|") - - bbox <- crop.area - - for(i in runs){ - for(v in var){ - p <- paste0(fp, v, "/", i, "/latest/") - files <- list.files(p) - files <- files[!grepl("aux.xml", files)] - - files.y <- files[grepl(years, files)]# Historical timeslice 2 for calibration - files.y.p <- paste0(p, files.y) - - # Load and convert remaining to single col dfs - dfL <- lapply(1:length(files.y.p), function(n){ - f <- files.y.p[[n]] - r <- rast(f) - r_c <- crop(r, bbox, snap="out") - }) - - R <- dfL %>% reduce(c) + rd <- rd + + bbox <- crop.area + cropname <- cropname + + for(i in runs){ + for(v in var){ + p <- paste0(fp, v, "/", i, "/latest/") + files <- list.files(p) + files <- files[!grepl("aux.xml", files)] + + files.p <- paste0(p, files) + + # Load and convert remaining to single col dfs + dfL <- lapply(1:length(files.p), function(n){ + f <- files.p[[n]] + r <- rast(f) + r_c <- crop(r, bbox, snap="out") - #Write directory - rp <- paste0(dd, "Interim/CPM/three.cities/", cropname, "/" , cropname,"_") #adding in cropname to write, I think will make easier to track + #Write + f <- files[[n]]#filename as it was read in + fn <- paste0(rd, cropname, "/" , f) - fn <- paste0(rp, v, "_rcp85_land-cpm_uk_2.2km_", i, "_day_", year1, "_", year2, ".tif") - writeRaster(R, fn, overwrite=TRUE) + writeRaster(r_c, fn, overwrite=TRUE) - gc() - } + }) + + gc() } + } } # HADs function hads_read_crop <- function(var, #Character vector of selected variables - this might need changing - fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") - i1, ## First file n index, eg for 1980-2010 this is files [1:360] i1=1 (I appreciate this is a lazy code) - i2, ## First file n index, eg for 1980-2010 this is files [1:360] i2=360 (I appreciate this is a lazy code) - crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work - cropname){ #Character - name of crop to be assigned to the returned df - usually the crop area - + fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") + rd, #path to results directory eg 
paste0(dd, "Cropped/three.cities/CPM/") + file.date, #Character, Date of HADs file to crop from in YYYYMMDD + crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work + cropname){ #Character - name of crop to be assigned to the returned vect + var <- var fp <- fp bbox <- crop.area cropname <- cropname - + file.date <- file.date + for(v in var){ HADs.files <- list.files(paste0(fp, v,"/day/")) files <- HADs.files[grepl(v, HADs.files)] - Runpaths <- paste0(fp,v,"/day/",files[i1:i2]) + file.i <- grep(file.date,files) + files <- files[file.i:length(files)] + files.p <- paste0(fp, v,"/day/",files) - # Load and convert remaining to single col dfs - i <- 1:length(Runpaths) - dfL <-lapply(i, function(i){ - p <- Runpaths[[i]] - r <- rast(p) - r_c <- crop(r, bbox, snap="out")}) + # Load and convert remaining to single col dfs + dfL <- lapply(1:length(files.p), function(n){ + f <- files.p[[n]] + r <- rast(f) + r_c <- crop(r, bbox, snap="out") - R <- dfL %>% reduce(c) + #Write + f <- files[[n]]#filename as it was read in + fn <- paste0(rd, cropname, "/" , f) + writeCDF(r_c, fn, overwrite=TRUE) + }) + gc() + } +} - #To ensure each layer has a useful naming convention +#Function to read and crop and convert to df, but not write it. - at some point should be merged with other R folder - lyr.n <-unlist(lapply(i, function(i){ - p <- Runpaths[[i]] - rast.names <- names(rast(p)) - +hads_read_crop_df <- function(var, #Character vector of selected variables - this might need changing + fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/"). Include up until var folder + file.date1, #Character, Date of HADs file to crop from in YYYYMMDD + file.date2, #Character, Date of HADs file to crop to in YYYYMMDD + crop.area){ #Polygon of area to crop to - any Spat obj accepted by terra::crop will work + + + var <- var + fp <- fp + bbox <- crop.area + file.date1 <- file.date1 + file.date2 <- file.date2 + + for(v in var){ + + HADs.files <- list.files(paste0(fp, v,"/day/")) + files <- HADs.files[grepl(v, HADs.files)] + file.i <- grep(file.date1,files) + file.ii <- grep(file.date2,files) + files <- files[file.i:file.ii] + files.p <- paste0(fp, v,"/day/",files) + + # Read in 1st runpath as df with xy coords to ensure overlay with CPM data + p <- files.p[[1]] + r <- rast(p) + rdf1 <- as.data.frame(r, xy=T) + + #To ensure subset dataframe has useful naming convention - this does not pull it through as such + n <- substr(p, nchar(p)-20, nchar(p)) + n <- gsub(".nc","", n) + names(rdf1) <- gsub("_", paste0(n, "_"), names(rdf1)) + + # Load and convert remaining to single col dfs + i <- 2:length(files.p) + + dfL <-lapply(i, function(i){ + p <- files.p[[i]] + r <- rast(p) + rdf <- as.data.frame(r) + #To ensure subset dataframe has useful naming convention - this does not pull it through as such n <- substr(p, nchar(p)-20, nchar(p)) n <- gsub(".nc","", n) - nn <- paste0("hadukgrid_2.2km_resampled", n, "_", rast.names)})) - - names(R) <- lyr.n - - #Write directory - rp <- paste0(dd, "Interim/HadsUK/three.cities/", cropname, "/" , cropname,"_") #adding in cropname to write, I think will make easier to track - - fn1 <- Runpaths[[1]] - fn1 <- gsub(".*resampled_", "",fn1) - fn1 <- gsub("-.*", "", fn1) - - ii <- length(Runpaths) - fn2 <- Runpaths[[ii]] - fn2 <- gsub(".*resampled_", "",fn2) - fn2 <- gsub(".*-", "", fn2) - fn2 <- gsub(".nc", "", fn2) + names(rdf) <- gsub("_", paste0(n, "_"), names(rdf)) + return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- 
cbind(rdf1, df) + + gc() + } +} + + + +## This function for the different file structure of the updated 360 calendar - to be updated when have confirmation about the files +hads_read_crop2 <- function(var, #Character vector of selected variables - this might need changing + fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") + rd, #path to results directory eg paste0(dd, "Cropped/three.cities/CPM/") + file.date, #Character, Date of HADs file to crop from in YYYYMMDD + crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work + cropname){ #Character - name of crop to be assigned to the returned vect + + var <- var + fp <- fp + bbox <- crop.area + cropname <- cropname + file.date <- file.date + + for(v in var){ + + HADs.files <- list.files(paste0(fp)) + files <- HADs.files[grepl(v, HADs.files)] + file.i <- grep(file.date,files) + files <- files[file.i:length(files)] + files.p <- paste0(fp, files) + + + # Load and convert remaining to single col dfs + dfL <- lapply(1:length(files.p), function(n){ + f <- files.p[[n]] + r <- rast(f) + r_c <- crop(r, bbox, snap="out") - fn <- paste0(rp, v, "_hadukgrid_2.2km_resampled_",fn1, "_", fn2, ".tif") - writeRaster(R, fn, overwrite=TRUE) + #Write + f <- files[[n]]#filename as it was read in + fn <- paste0(rd, cropname, "/" , f) - gc() - + writeCDF(r_c, fn, overwrite=TRUE) + }) + gc() } } From 68d7aca99eba1c81fbc3f19e7a0561f6ad7912c1 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 21 Sep 2023 11:03:38 +0000 Subject: [PATCH 20/83] Added density plot, rmse for each run --- R/LCAT/Assessing.BC.data.RMD | 106 ++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.RMD index 83fb8b65..b759e08f 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.RMD @@ -516,31 +516,31 @@ ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + Adding in the observational HADs data and aligning based on dates -*Note* So as not to re-run the UK wide LCAT data processing, a workaround was added to the bias correction function used to group the obs data - this means that to align the validation cpm data we have to remove a month in the beginning - ie the LCAT specific - -20191201-20201130_30 - runs to the end of the month of December of 2019 -- so should map the obvs.val - -#HERE +*Note* So as not to re-run the UK wide LCAT data processing, a workaround was added to the bias correction function used to group the obs data - this means that to align the validation cpm data we have to remove a month in the beginning ```{r} -#Extract validation period of raw and bias corrected CPM data -- need to update the 1 -val.dfs <- lapply(London.dfg.rL, function(x)) - London.df <- London.dfg.rL - cpm.val.dfs <- lapply(London.df[[1]][c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ +#Extract validation period of raw and bias corrected CPM data +val.dfs <- lapply(runs, function(r){ + London.df <- London.df.rL[[r]] + cpm.val.dfs <- lapply(London.df[c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ i <- grep("20191201-20201130_30", names(x))[1] df <- x[,1:i] }) - #Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") + #Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year 
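+  # (the CPM runs on a 360-day calendar, so these extra HadUK calendar dates are
+  #  dropped to keep the observed and CPM day counts aligned)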
+ remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") - obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] - row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) + obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] + row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) - val.dfs <- c(list(obs.val.df), cpm.val.dfs) - names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") + val.dfs <- c(list(obs.val.df), cpm.val.dfs) + names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") + return(val.dfs) + }) + +names(val.dfs) <- runs ``` @@ -555,35 +555,52 @@ val.dfs <- lapply(London.dfg.rL, function(x)) Using the validation data set for this -Most metrics will just require vectors of values at this point, although it would be nice to have the georeferenced incorporated incase values spatially vary depending on eg topography - -**to do** I think also extracting and aligning a similar cal.dfs would really help here ```{r} -val.dfs.v <- lapply(val.dfs, function(x){ +#Convert dfs to a vector +val.dfs.v <- lapply(runs, function(r){ + dfs <- val.dfs[[r]] + dfs2 <- lapply(dfs, function(d){ #Remove x and y - x$x <- NULL - x$y <- NULL + d$x <- NULL + d$y <- NULL #Convert to single vector - unlist(as.vector(x)) + unlist(as.vector(d))}) + names(dfs2) <- names(dfs) + +val.dfs.v.df <- dfs2 %>% reduce(cbind) +val.dfs.v.df <- as.data.frame(val.dfs.v.df)}) + +names(val.dfs.v) <- runs +``` + +```{r} +val.dfs.v <- lapply(runs, function(r){ + df <- val.dfs.v[[r]] + names(df) <-paste0(r, ".", c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val")) + return(df) }) -val.dfs.v.df <- val.dfs.v %>% reduce(cbind) -val.dfs.v.df <- as.data.frame(val.dfs.v.df) +#Convert to a single df +val.dfs.v.allruns <- val.dfs.v %>% reduce(cbind) + +#Remove duplicate obs (pulled through across each run) +val.dfs.v.allruns[c("Run06.obs.val.df", "Run07.obs.val.df", "Run08.obs.val.df")] <- NULL +names(val.dfs.v.allruns)[1] <- "obs.val" ``` ### **2a. Descriptive statistics** ```{r descriptives validation} -descriptives <- lapply(val.dfs.v, function(x){ +descriptives <- apply(val.dfs.v.allruns,2, function(x){ per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) data.frame(mean=mean(x), sd=sd(x), per10th=per$X10.,per90th=per$X90.) 
}) descriptives <- descriptives %>% reduce(rbind) -row.names(descriptives) <- c("obs", "raw.cpm", "bc1.cpm", "bc2.cpm") -descriptives +row.names(descriptives) <- names(val.dfs.v.allruns) +t(descriptives) ``` @@ -592,8 +609,28 @@ descriptives ```{r} -cal.val.dfg <- reshape2::melt(as.matrix(val.dfs.v)) +names(val.dfs.v) <- runs +val.dfs.v_fordist <- lapply(runs, function(r){ + df <- val.dfs.v[[r]] + names(df) <- c("obs", "raw.cpm", "bc1.cpm", "bc2.cpm") + df$run <- paste0(r) + return(df) +}) + +#Convert to a single df +val.dfs.v.allruns_fordist <- val.dfs.v_fordist %>% reduce(rbind) +val.dfg <- reshape2::melt(val.dfs.v.allruns_fordist, id="run") +``` +#### Fig.Density plot of validation period + +```{r} +ggplot(subset(val.dfg, variable!="bc2.cpm"), aes(value, fill=variable, colour=variable)) + + geom_density(alpha = 0.3, position="identity") + + facet_wrap(.~ run) + + theme_minimal() + + scale_fill_brewer(palette = "Set1") + + scale_color_brewer(palette = "Set1") ``` @@ -603,16 +640,19 @@ cal.val.dfg <- reshape2::melt(as.matrix(val.dfs.v)) ```{r} -actual <- val.dfs.v$obs.val.df +actual <- val.dfs.v.allruns$obs.val -rmse <- sapply(val.dfs.v[c(2:4)], function(x){ +rmse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ sqrt(mean((actual - x)^2)) }) -data.frame(as.list(rmse), row.names = "RMSE") + t(data.frame(as.list(rmse), row.names = "RMSE")) ``` -In this example, the first bias correction has a lower RMSE (just!) and therefore is better fitting than the raw +Across all runs, the bias corrected datasets have a lower RMSE (just!) and therefore are better fitting than the raw. Depending on the run, the raw could fit the data better. + +### + # Taylor diagram From 282a75eb782f40a05c03a3eceb5731fad9ed3c07 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Fri, 22 Sep 2023 10:57:12 +0000 Subject: [PATCH 21/83] Cleaning up files for LCAT --- ...BC.data.RMD => Assessing.BC.data.LCAT.RMD} | 96 ++- R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R | 717 ++++++++++++++++++ R/LCAT/Processing.data.for.LCAT.R | 4 +- R/LCAT/regions.UK.png | Bin 0 -> 19555 bytes 4 files changed, 797 insertions(+), 20 deletions(-) rename R/LCAT/{Assessing.BC.data.RMD => Assessing.BC.data.LCAT.RMD} (91%) create mode 100644 R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R create mode 100644 R/LCAT/regions.UK.png diff --git a/R/LCAT/Assessing.BC.data.RMD b/R/LCAT/Assessing.BC.data.LCAT.RMD similarity index 91% rename from R/LCAT/Assessing.BC.data.RMD rename to R/LCAT/Assessing.BC.data.LCAT.RMD index b759e08f..fb39507b 100644 --- a/R/LCAT/Assessing.BC.data.RMD +++ b/R/LCAT/Assessing.BC.data.LCAT.RMD @@ -1,13 +1,18 @@ --- -title: "Assessing and Processing LCAT data" -author: "Ruth C E Bowyer" +title: "Bias correction assessment of LCAT data" +author: "Ruth Bowyer" date: "`r format(Sys.Date())`" output: - github_document + html_document: + theme: cosmo + toc: TRUE + toc_float: TRUE + toc_depth: 4 + code_folding: hide + df_print: paged --- - ```{r libs and setup, message=FALSE, warning=F} rm(list=ls()) @@ -18,6 +23,7 @@ library(terra) library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) +library(kableExtra) library(plotrix) #For taylor diagrams dd <- "/mnt/vmfileshare/ClimateData/" @@ -135,7 +141,7 @@ Random selection of 3 days of the observation, calibration and two adjusted cals ##### Fig. 
*Day 1 - 1980-12-01* -```{r, figures-side, fig.show="hold", out.width="33%"} +```{r, fig.show="hold", out.width="33%"} tm_shape(London.rasts$Run05$t.obs[[1]]) + tm_raster(title="Observation, 1980-12-01") #Obviously just one call of the observation tm_shape(London.rasts$Run05$t.cal[[1]]) + tm_raster(title="Calibration, Run 05, Raw 1980-12-01") tm_shape(London.rasts$Run06$t.cal[[1]]) + tm_raster(title="Calibration, Run 06, Raw 1980-12-01") @@ -153,7 +159,7 @@ tm_shape(London.rasts$Run08$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run Just to note I was so suprised by how much lower the observation data was for this raster I loaded the raw HADs to check (in resampled_2.2km/tasmax and the original 1km grid it does reflect it - it just seems very low) -```{r, figures-side, fig.show="hold", out.width="33%"} +```{r, fig.show="hold", out.width="33%"} tm_shape(London.rasts$Run05$t.obs[[3781]]) + tm_raster(title="Observation, 1991-06-01") #Obviously just one call of the observation tm_shape(London.rasts$Run05$t.cal[[3781]]) + tm_raster(title="Calibration, Run 05, Raw 1991-06-01") tm_shape(London.rasts$Run06$t.cal[[3781]]) + tm_raster(title="Calibration, Run 06, Raw 1991-06-01") @@ -171,7 +177,7 @@ tm_shape(London.rasts$Run08$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, ##### Fig. *Day 3 - 2000-08-01* -```{r, figures-side, fig.show="hold", out.width="33%"} +```{r, fig.show="hold", out.width="33%"} tm_shape(London.rasts$Run05$t.obs[[7081]]) + tm_raster(title="Observation, 2000-08-01") #Obviously just one call of the observation tm_shape(London.rasts$Run05$t.cal[[7081]]) + tm_raster(title="Calibration, Run 05, Raw 2000-08-01") tm_shape(London.rasts$Run06$t.cal[[7081]]) + tm_raster(title="Calibration, Run 06, Raw 2000-08-01") @@ -546,10 +552,11 @@ names(val.dfs) <- runs #### *Validation period - annual trends - seasonal mean* - +(To be added) #### *Validation period - annual trends - seasonal max* +(To be added) ## **2. Bias Correction Assessment: Metrics** @@ -595,7 +602,7 @@ names(val.dfs.v.allruns)[1] <- "obs.val" descriptives <- apply(val.dfs.v.allruns,2, function(x){ per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) - data.frame(mean=mean(x), sd=sd(x), per10th=per$X10.,per90th=per$X90.) + data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) }) descriptives <- descriptives %>% reduce(rbind) @@ -633,35 +640,88 @@ ggplot(subset(val.dfg, variable!="bc2.cpm"), aes(value, fill=variable, colour=va scale_color_brewer(palette = "Set1") ``` +### **2b. Model fit statistics** +Using the following to assess overall fit: -### **2b. RMSE** +- **R-squared (rsq)** +- **Root Square Mean Error (RMSE)** +- **Nash-Sutcliffe Efficiency (NSE):** Magnitude of residual variance compared to measured data variance, ranges -∞ to 1, 1 = perfect match to observations +- **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias. 
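+
+A minimal sketch of the formulae behind these metrics, assuming the standard definitions (the chunks below use `cor()` directly and the `hydroGOF` package for PBIAS and NSE); `obs` and `sim` are placeholder vectors of observed and modelled values:
+
+```{r fit metric definitions, eval=FALSE}
+# obs: observed values; sim: raw or bias-corrected model values (placeholders)
+rsq   <- cor(obs, sim)^2                                   # R-squared
+rmse  <- sqrt(mean((obs - sim)^2))                         # Root Mean Square Error
+nse   <- 1 - sum((sim - obs)^2) / sum((obs - mean(obs))^2) # Nash-Sutcliffe Efficiency
+pbias <- 100 * sum(sim - obs) / sum(obs)                   # Percent bias (%)
+```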
+```{r rsq} +actual <- val.dfs.v.allruns$obs.val -```{r} +rsq <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ + cor(actual, x)^2 +}) -actual <- val.dfs.v.allruns$obs.val + t(data.frame(as.list(rsq), row.names = "RSQ")) +``` + +```{r rmse} rmse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ sqrt(mean((actual - x)^2)) }) - t(data.frame(as.list(rmse), row.names = "RMSE")) ``` -Across all runs, the bias corrected datasets have a lower RMSE (just!) and therefore are better fitting than the raw. Depending on the run, the raw could fit the data better. +```{r pbias} + +pbias <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ + hydroGOF::pbias(x, actual) +}) + +``` + +```{r nse} +nse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ + hydroGOF::NSE(x, actual) +}) + +``` + +Highlighting the bias corrected statistics -### +```{r pretty kable} +k <- cbind(rsq, rmse, pbias, nse) +k %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(k)), background = "lightgrey") + +``` -# Taylor diagram ## **3. Bias Correction Assessment: Metric specific - tasmax** +### **3b Days above 30 degrees** + +(Not considered consecutively here) + +```{r} +val.dfs.v.allruns$year <- substr(row.names(val.dfs.v.allruns), 8,11) + +over30 <- lapply(names(val.dfs.v.allruns), function(i){ + x <- val.dfs.v.allruns[,i] + df <- aggregate(x, list(val.dfs.v.allruns$year), function(x){sum(x>=30)}) + names(df) <- c("year", paste0("Days.over.30.", i)) + return(df) +}) + +over30 %>% reduce(left_join, "year") +``` + + +### **Number of heatwaves per annum** + +(to be added) #### **For future work** -I used 100 quantiles but obviously more within qmap would help! +The number of quantiles selected will effect the efficacy of the bias correction: lots of options therefore with this specific method + -### mean by cell diff --git a/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R b/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R new file mode 100644 index 00000000..7ab0cac2 --- /dev/null +++ b/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R @@ -0,0 +1,717 @@ +#This version of this file was used to process LCAT files. 
Later version of this files will be renamed and split +##Loading data as created in 'Data_Processing_todf.R' + +#Requires +library(tidyverse) +library(data.table) +library(qmap) + + +apply_bias_correction_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset + var, #Meterological variables + Runs){ + + i <- region + +for(r in Runs){ + for(v in var){ + if(v!="pr"){ + dd <- "/mnt/vmfileshare/ClimateData/" + + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + remove("proj.df1") + remove("proj.df2") + +## **2. 
Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + +### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + +## **3. Empirical Quantile Mapping** + +#(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and +#modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform +#quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + print(p) + + library(qmap) + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = FALSE, + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + +## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + + gc(reset=TRUE) + + + } else { + +#### Precipitation - the HADs variable has is called 'rainfall' + dd <- "/mnt/vmfileshare/ClimateData/" + #Subset to Area + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl("rainfall",obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + gc() + + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + + remove("proj.df1") + remove("proj.df2") + + ## **2. 
Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_") + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + ## **3. Empirical Quantile Mapping** + + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and + #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform + #quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_") + print(p) + + + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + + ## **4. Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_") + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_") + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) + + gc(reset=TRUE) + + + } + } + } +} + + +###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! 
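+# Worked example of the row-segmentation arithmetic used below (illustrative numbers only):
+# with nrow(cal.df) = 1000 and N.new.segments = 4, nrows.seg = 250, so segment y = 2 keeps
+# nr1 = round(250 * 1) + 1 = 251 through nr2 = round(250 * 2) = 500, i.e. cal.df[251:500, ]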
+ +cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset + var, #Meterological variables + Runs, #Runs in form 'Run08'- matched input + N.new.segments,...){ #Numeric, Number of dfs to break down to, eg 4 + + i <- region + N.new.segments<- N.new.segments + Runs <- Runs + var <- var + + for(r in Runs){ + for(v in var){ + for(y in 1:N.new.segments){ + if(v!="pr"){ + dd <- "/mnt/vmfileshare/ClimateData/" + + #Subset to Area + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + #Subset the dataframe iteratively depending on y + nrows.seg <- nrow(cal.df)/N.new.segments + y_1 <- y-1 + + nr1 <- round(nrows.seg*y_1) + 1 + nr2 <- round(nrows.seg*y) + cal.df <- cal.df[nr1:nr2,] + + #proj data + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] + + remove("proj.df1") + remove("proj.df2") + + + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl(v,obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Subset to the rows which are in above (some will be missing) + obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] + + #Remove the dates not in the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + gc() + + + ## **2. 
Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + + ## **3. Empirical Quantile Mapping** + + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and + #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform + #quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) + print(p) + + library(qmap) + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = FALSE, + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + + ## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) + + gc(reset=TRUE) + + + } else { + + #### Precipitation - the HADs variable has is called 'rainfall' + dd <- "/mnt/vmfileshare/ClimateData/" + + #Subset to Area + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads + #Using 1980 - 2010 as calibration period + fp <- paste0(dd, "Interim/CPM/Data_as_df/") + cpm.files <- list.files(fp) + + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] + + #Subset file list to area + cpm.cal <- cpm.cal[grepl(i, cpm.cal)] + + #subset to var and run + cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] + + #Load in + cal.df <- lapply(cpm.cal.var, function(x){ + df <- fread(paste0(fp, x)) + df <- as.data.frame(df) + + row.names(df)<- paste0(df$x, "_", df$y) + df$x <- NULL + df$y <- NULL + return(df) + }) + + cal.df <- cal.df %>% reduce(cbind) + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs + #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- + n2 <- min(grep("20091201-",names(cal.df))) + 29 + + #This is the first part of the validation dataset, but all the val will be added to the projection df for + #the sake of bias correction and assessed separately + proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] + cal.df <- cal.df[c(1:n2)] + + #Subset the dataframe iteratively depending on y + nrows.seg <- nrow(cal.df)/N.new.segments + y_1 <- y-1 + + nr1 <- round(nrows.seg*y_1) + 1 + nr2 <- round(nrows.seg*y) + cal.df <- cal.df[nr1:nr2,] + + + #proj data + yi <- paste0(i,c(2020,2040,2060), collapse="|") + cpm.proj <- cpm.files[grepl(yi, cpm.files)] + + #Subset to Area, var and run + cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] + + #Load in + proj.df2 <- lapply(cpm.proj, function(x){ + df <- as.data.frame(fread(paste0(fp, x))) + #Remove x and y cols + df[c(3:ncol(df))] + }) + + names(proj.df2) <- cpm.proj + + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] + + remove("proj.df1") + remove("proj.df2") + + + #HADs grid observational data + fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") + files <- list.files(fp) + obs <- files[grepl(i, files)] + + #subset file list to var + obs.var <- obs[grepl("rainfall",obs)] + + #subset to calibration years + obs.varc <- obs.var[grepl("1980", obs.var)] + obs.df <- fread(paste0(fp, obs.varc)) + obs.df <- as.data.frame(obs.df) + + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + obs.df$x <- NULL + obs.df$y <- NULL + + #Subset to the rows which are in above (some will be missing) + obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] + + #Remove the dates not in 
the cpm + ## find col position of the first cpm date 19801201 + n1 <-min(grep("19801201", names(obs.df))) + obs.df <- obs.df[c(n1:ncol(obs.df))] + + gc() + + + ## **2. Wrangle the data** + + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] + #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] + + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] + proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] + + #save the missing outputs + p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) + print(p) + #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) + + ### Update obs data to 360 days + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.df <- obs.df[,!grepl(remove, names(obs.df))] + #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove + obs.df <- obs.df[1:ncol(cal.df)] + + ### Transpose the data sets + + #Obs grid should be cols, observations (time) should be rows for linear scaling + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + + ## **3. Empirical Quantile Mapping** + + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and + #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform + #quantile mapping + p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) + print(p) + + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") + + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") + qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + + + ## **4. 
Save the data** + p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) + print(p) + # Save data - lists of dfs for now (will be easier for assessment) + results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) + + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") + p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) + print(p) + base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) + + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) + print(p) + rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) + + gc(reset=TRUE) + + + + } + } + } + } + } + + diff --git a/R/LCAT/Processing.data.for.LCAT.R b/R/LCAT/Processing.data.for.LCAT.R index f57384de..7d9066b0 100644 --- a/R/LCAT/Processing.data.for.LCAT.R +++ b/R/LCAT/Processing.data.for.LCAT.R @@ -2,7 +2,7 @@ rm(list=ls()) #setwd("~/Desktop/clim-recal/clim-recal/") setwd("/home/dyme/Desktop/clim-recal/clim-recal") -source("R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") +source("R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R") library(terra) library(tidyverse) @@ -15,7 +15,7 @@ Regioncds <- Region.Refs$Regioncd #Scotland (UKM) needs to be broken down, so running on everyone else Regioncds.2 <- Regioncds[c(1:10, 12)] - apply_bias_correction_to_cropped_df(region="UKK", + apply_bias_correction_to_cropped_df(region=Regionscds.2, var=c("tasmin", "tasmax", "pr"), Runs=c("Run05", "Run06", "Run07", "Run08")) diff --git a/R/LCAT/regions.UK.png b/R/LCAT/regions.UK.png new file mode 100644 index 0000000000000000000000000000000000000000..3bc04161679ff76f063f991b861e439c30d1f9c4 GIT binary patch literal 19555 zcmcF~ML--~@Fx-^xVsbFCAho0y9IX%?(QDkgS+eCgy0OWgTvs#9hUFEyXQUap%1U* zz3OM(RrRZA6(wn81OfyI2nb|Z83{EA2*_Iq2q*}6=ue9Ho+CX31T=(-f`;U06A}^< z3JMAu8v4tZFEB7Lu&}UjaB%SO@L#`vMLjX=P<)ZEbC1 zV`FP;YiDO?Z*TA5;Na-!=;Y+&?Ck8~;^OM+>gML=?(Xj4;qmk5Pft%zFE1}|Z*LzT zA75WzKR-WzfB%4hfWW}OprD}O;NXyukkHW3u&}W3@bHL;h{(vusHmvu=;)Z3nAq6Z zxVX6Z`1pi`gv7+eq@<+ei9er052WM*b&Wo2b&XXoVPlq)=NA+d6c!d16%`d17nhWj{QmvBw6wIWtgO7eyrQC_va+(Os;auWx~8V4wzjse zuCBhmzM-L^v9YnKsj0cSxuvD0wY9aat*yPiy`!U}v$M0StE;=ayQilI008v%_V)Gl z_4oG=3=9ko4h{_s4G#~GjEwyG^JjE)^zYxlV`F3E~i_6Q)fB*hnU0q#YU*Fu^ z+}_^a-QC^a-#^2zsf1ZJ_&N8}g5D+Ls z|2>eSP8Ajq5M&Ut5~3Pjx#xK>ewbspqg##wmGUPbF&jFy7+lX~VLbyBPr(ht#FCEg zZc^vR9U|ur;^`fx1&)tR8N$`yvN?)tX91UPIh$FlV~Bkh7wgH~K{|MFI_nTP6GJi6 zB04~Pcm_UbQ%Nhb8Zc5Ac3yBpF$bEKJM7^9AEsFf*z=FaIqNf@VOfI?g1Uq;nvm&Y zRdn)=4M)TXdnSd?KTp?}W>&@j&1Lx)veC;TmrBR@YY+?Q`Sd5AU=ifY@9?KV2R4Jm6878VAQtI_| z<0s1p2aNWrFB$)ibl0)obcTJc;=tU{gl41&E{7rfa1%q&k&L?l4yjw6-Md#LEevRq zWt4AAZBi^aSD-8kA30T!fGYa*)q3%q5d`@Xm-K&he2Ql}BK=;exo?mCnl*R8PMVtw zhFHlpP!{=ftFw^Mz*u{v?+hkUNhFCLOl%8)L)=}+#8I_Tj189PthP0t4gM66gNy_X z?*uu`LxOJNppP`f8DEP_QXo*&b_0<*T5BC*O|3iU`ob@LBvZWW zm}2vHHWfNOqudV)WzoeR)t6dm^}(G0<<6@Am_lfKRX-EW(;>fjKwxqvp@qbaY+;1qtO-Q&Cx&Jm6=itW`_Wd6!ROULm-FgyJ4cPL42x|nh#zCU3p*Sa< z5`wI`4SjX+vp+uQZ@TLd_NJ2D9H!GtZDhXx?`xuK)!JZq4F?aAp=-6j;c|&wtA@uI^RD>ZEND6KkfX0U>6qt3VHqh`B!i=i5lTk7ah07i zsG{PHKqj#!7&VYLDa7(OehRF$Q}`d5Kia6dr-30WHW62zhgUY>{|+AJD*cxVdB5i* zm!Wap_3YdMpG)d%;yq`{^o{&4(C#rgHV$^Ku%Ekkyla(gSVYVKY*phRr)1jl5CFQZ zNZ^RMh2uA@WwTiT9$fK2QUoZu|b zi&c{G9-&jV>m0U$K|ebE&tZR95n5C)o7)vjNS({Pl$Sk^`L2E@UG?A+G|Y}fpEDS= zAKuNAUrqt)O?}~xdQ&`RH}!x-|E+|1P%Olr?gqWvr_oryLFIA|a}g`wZij;qiQnxs z=3UalYuQQT?m>4~UaXq*OUWxW{e<0+apKktj%R%apwKLkZ$kgCcad_)SJv{i2{bvI z+toGL13XKgOJ0huM0ZL(jzU~qh&86xw(p(6cc6G`6ino_cziSC6+#HN#{WYs&4Lgx 
zpyWyJTX+gF?^)WaaZqM1T=y0J!EjnG-2AaHNY(Ykk+x6G1U`u(Zf+Cp405~_?Bx$; zih9pbH|Na3{Q0y&0Jm-A(>btqzYn|Q%l%8*v7Q7PH?;%0ZEdY>ALEmh4LQZ_5~c#b z(c$eE{mcGy;Yz%Z|3K2VlHwlSQRy{(UX$@h2Fr)#Gny`g%R1{x8-s0toCw5a(l*`o zsQ+mUXOn|Lf4P_1DB2r9zSNca$d<}Bx$Yoi+r%WRuyCYe%wYlMY0sF{3F#1aCi8 z5_6Nn2j;;FHPVf{t;i(k?d>mQ0ia|2@d`Agi*&7cq`i9YroaBy(J7S@jb9=0akw&I zGmilE5dTVL9H+^TYQO9$^5#B<0AHhj4Ngc3K5#*g&l_2utwE3ooB22BFY8iTSc5Enb@G+g%7bWaz9hw z>HMYCyjXfmx_wdbjf!S#+Ez7E7xc2K0;1XN9C}lvq4^3uCbc_X&`wko$_iV=Sw(WQ zvNmtSof43VWewK96N{IzvM+1D5C~_0nByC^(iU^$tSsrJ|FizRd`2kNw7IxkU``G% z9FjAo8*~?OQd8TU{&&A`qIb#db@7mOdVH4rd6^;@fqka62aFLUX&bo(a)D$h1!GKA z#0dkLdolK&JAPXjpJSsEdYP9T$$jATo?^=$cK*T@bg1;9?YqW<1y;tqNr#VofU$(X z?EG2C1~!)S#x--@Zh*=S76q2OW^S=>9*6ncua#L#zG?91))@XK!n=Ec#qzf^ny(3+ z5}I%J8rg#gc9q)|1M`2;v%#`XTgXOqW?YX-j13LDv}v53Tj>S!Ll(;dU7qH>P`}D?XPZxmXh{Hx6=J~1;Nt*aj^GW7t~ELgD(ztgszLzOsiKW2!<%bB-I9m>Xs z@qeZ-hTYzPK14*Uqq1wOiC;1^GUw$SUKfs*ZCB||CHMqgpGfu5h@m~9JINsg;Al6{ z;E$J$3)mzDSq>ILMY^k)XT(iiE!V^%NpAYaALh24?iqgu1N5PIi$ z!Q<>~owpj})3q=qR>e!-%tNURn!CJ9{aqM%h{bIdXCE*`ElFb|-}5V!sJ^C>9+x>v z7Be11b_Gsl9ez=;{3il}nsW56wZBI3jg z<7+%3u_m!UvEU=CV4EHZA@-Xk@8j05FJT4{*;Q052=9bXUiA5(=2J>&+~;VZ!y8Wc z3zs?xN%e4IB^8nQ^e-7Bqk!}8i{1T%VI;YaG<|@T)nc&CMEYu}? zC?!;46dBJfGpuKxWkmt=rD<$&-=1l2-j8amM0aIP+eI?SUjvsBv%A*#wjFQ)kfDeZ z^3i!1!}P=DwpYqZAjUYfto(>t*JZ>)yk2U|?BY$gFJ@u_1#>n18k6K!Kzz2;OCmc> z8!g}3NidU?wGf0OjoRJso-lS(X`!L7VUAx)7|`58jR$47h)P4ny6xZr_X!__MZvpm zzC-$ISY1_IyeARry$A>nsl_F?N5hsFQbwoa-pFockV}8N1XXnVd` zXL8^9>L4q3GN{EM>$P;bOP&o-Vw#ARb0Qly!D=jp(eOdlndz@$XB69sj;K1xU)-yR z5JI)%xqGTbUKQc#w9Jwh`(iP+->mZZfi(l`~OIa2IMg+%m)pZPYoP)kToP8*c^(y2K{49|dL?kP(pev)G zx{+gsG{W#n7&j1sv(pHC8jYvgA-bQ?-qa~9G({5m?lBQRXsO1?* zGbG}IXUR`MOqy>v&9yh4&s3Oeb%0H1OMnC~k{#&>$;eN@!-uBWP@`!TrS2-p{W~6~ zBBe$K7LriP&a}<9KWwC9O3i+{xH>k!d>BKgd@he4k=>((^_i%=gV$N7t+VVA4$Vkc z3r$mb@tl1&d}=Xm#x2x<;tH9kdW=z@@`7(8UNWTdzvW+t(>Lexv4|qp7g3EO{Wk7P z(z0j2w{N*82&cs!RJZ!sFZuSN4Z_UEP-&Z6;JU7y1wpXUEYz{Uc0>*z)o;}QWyd2k zwsGtRy}Yry+8DhG~wln$P^bI+iJ~UE==`{S4xp2@wPnwa^Td8??4Q{f+*hp0OHT=-!1MIp-pI2Q6^`?JG-=Io40G9%5#1#(5|UNlyeKJH`^9_O3$Dt z3OGKOGPorS>X_=A@#sbpHQZbt6i8(9x%7q9FZ8UlDM~3ua!16B_pUQoR`}hjHD{b_4uE4Pt!RlYhylz0tbt457T#V zjTAS6EJo~PA=buHQ40Jjfj{4`A7WS~%tz4>bKR_JG#ex@{fB)3y2}V}^dANzFaZ@5 zbO{|`#MZH{Wzv{XqNWzFIz?CaVH8&Iq+I6d(vVimYw^ z@pg^DbqpXZ7F9pekw${9<2;po*Pe}Iw`c*&Dp5|^=SzN$w279tok4P=T-U#?q(wm5 zQdUpE;3F4`@nmP#g+X`=x?}x96DSt;af3NLbo5j;f~a|;;N&XMdd8u4%aHs7`47uj z^DIS|{gGo!(bMoGTD-UDZ33BbpJ_V-oLkk2o`V2sgK4NXX8R*P50YO?OG4io684@% zpcOY0>6Ed0Bo6RpbeJpJaGi`J&W8lepIw^*q~PaRrE7mA=PLPUE%N-SC_QBsYD*3Z zqCDU#S~ol%{mmyYGiU=fs~x2!>LM@m#EaDbW=vptt(XQ2dveGCtxGVPA#)q7U4axv zn|+qj-4Q86BSccno4beybi#t7Gd!F_W-5LOki#+*C4J?XX>9^mDh-!T`9`W>p%O;H z2JO!`YkjqlTF)fN`4Z|ePUzv=ce;>+qR#}$R6KKkO^Weu*9goy(o?>zhz{7~`^QF} z=tn32?}3K{s%YQ>fF?&%FnUK=`|~(&$OTJXK4)GjRxI%$;Ks@Q=kT8zw^|y$D>0?l z$O%G}IcoFyM9c!^jsSAsb$3Z{PzHbhTdW%zpJ`V(ocEKRUMxWyzBO$u?8UM`1K!i@ zh=hXLcMTmD$Wosuf5D>5#Pqr$hz+V)V#B?xjfrd=j?vd8J$QXwe`?{D3^OP>IhHfN zM6WhD6gLtM$2OieP@w={H!2gREEl$WIq)XEluHfA53Xj0XqgL9f~qt=V@M7U7pk|Icm z4mT`LuFA68{ZhwnFe^}vRoY#G`Ek}I;n8o%_@dT4EN#6&|ls-#POtcUe#2@N0KfBeqk7d-yyukl>Izn z{evvo59mZflj`Q?w<>79%q!Y zbMy-`y{kTH6q6BuOKQ!@T>JC=Kmk@{JMmCMblJjPnTT8Snx!t@w=TZ+cZw1i^KmO5A1Yc! 
zp}P2j5uZ_ypZDR{jFV_@_EzU+LECkm}_Gy>}$3PxhtnRLPg5rhIgp7GjOcp zw@jMp{B*Txh76~T&IzuyiTgrE&alLHzDIm|Gsm{KH+ARyv*i1XRc&l9gWlMdQFXhT zqCYr%1-{6$C83@_siJJOk-6j3M_O%hat|FCtnK4yM*}GMlB=&#Cou#E3*=#+mFsA0 zK)s=Y#=U;`tV#7FEu^$iyzSr$bYvm(VkeA0XV?VgZ z!BO6Q-);h)zq->9KR{;w{)wl(pY;Q&6kjS@2-k#Ltn$J?HDzRe0%F5g@Rfvzp&0gZ z$mG`rKDYjLMzO|0HRu6CQr>^?zJ%rGuB$o@!=D;Q%sy7WFOP}RPqC*Fpf}(WECg>T zCuNN-@=fwvXdKS2wd^p+;&#i+Gq|{al*|ANq|k2t9;{l-ucx;=;Kki}v_* zjPS5$?FZ2zvR}|uIw9H+S`yh3Q(`@yIRqg7W9V zo-Gd1|FW1QT{}Z!0zuQBZv9f@4~*09)7=txEk~V(4O?}YAY6l%$2~FSW=DF+!EhS| zi_gyx?0_I$lO*u5<`NP@g*MwZgU_K)Aer+w60oDQpW#Aq^Ox!O+h1Lv zsJD^Qs@9dJNy z)y}qu$4t!0Ep39p2Xj>_ZU|+;*lG-xtA|&AZvcCSU1~hDJ6F8f;&czHxrFdIGWz06 zG&2HAq0aQmqBVLS!%nKp(`))*JPFF%$uD{tuBpF{MAxM@^;(46iL7C$&l#$3mx*@C7uMKKMcHOyM*QIpD-ZaA6{40vA1=<9=sgi3*9&Da$&l}f}?Yw(eH#Bswl^Ow|Jwv|; z4c+)%de!sjiYZM2s^~f}VdGv1xGAH`27;&Jf$kMVUAuaRvHs5~)~$n07yj|}ygzHR z6ibQAqIwrD(ioO5u*Hrj4@N6(#Dm+v3fk);QLVZhcGSZFQ;D<$2fz2h=p$TNf0 z;KO@Jvi7YJyxj|8N_ef8;i5Ob*d2`0G>j_`IU9yZAvcBtXc&i5N=?-TRm|aBxx}40 zsmO{4O@j^cZSP-$_1RL$4RGe+4N+l2cciswBid(ELEm$Ijg{7H-(x4JiP>yB#c2m` zAy(R2q5WcLZq}U5y9neioSZ6mnZgF{^^xA_Ff%f@2^U4DH0R|XG4LEo`%UmVI{_d5 zaUlsy8I>FvTK7$mbKv%;rVZy4_w6F`b{VfjDK4yJT#v!utNx*f@z7YuVdBn5q;l@z z0Kq5zmfZ^3#uwk28cZ?T%B8<0i%4GPJ> z=IV-U?hIjt4|pcxR6{atGpluLev?LcUGlv)K@@Fng!wv#nYe|IQ?*}SDI&o=_SINl zmiT;m%mNDOnZ`WSDdG=0Au5T5;Wg!$m>eupkzo8r_bE21S^htA
xb?+3p+~V;T${90e@l7(gO;aA;-?D9I!rl3ntn2ov>};+p4%ueGRckw1o`OTC z*91t{Aa&>g-QInAIn01{VgFx44tE>3B*|q=+1ZU&QiA;PX~>YB9J6Mu!@hx?>Jch_ zzA9maV^MeT7g_>3C3a_DwuZx9<@W>(I4s88{r1Z@vTVpRw*kVEzIleR-K|~MJrX>`Va<+J7w!l5FgS%l?^+o1q>+Sc<(~OwE!2Y6bW69Ynwh<+{6ZueZqW0O_6KMnV9WEX z^xjgEs$6f-x~}pS@k#16&Zow^sx{g4<$0~yX1QnRam2~|>-y8o{0`OCIL@KQ6RPIi z#1=z{>te4OgF@qi-b(%hGvTSVwPIc^cB^{tMH4r8-rPdhr6UK20xc@{U)Y zi4CMyZTcw=bcJnJxAeA-NUv-%gT;%x;BQ)aIvpMfI7Sv*W?N+KaL`HSu>GM?9EgAP z3l&!F)280V1~B~h{gYDbGk0kV)9w+ z2E~Gh{m31Kxumu9v2t!eeGwNkWy3!+`W8?D`Sy&|0Y!W%Fay-J*KTHd59`7KO6Yyg zxIsJP64F!Xctzsx){yxIQG^c*tgh0?b}vR};!W+rndY(#-9-ggr}&z=j{mKPHE z6vn}u)OKsfH&Exz*Eq>8MWc&ko{|6faN)WK#ePi7o<;ej51$)ynJ%vK9~-y1-Mqi$ z;Q;v;WgJ&6!~v5w>o{0|`OLG4-_ouNP>QekQe2bMHeDJ4-+UQw6~q&(Xs=HUlLfE- zFBX6^;JfXjn(?k=PdNnH?RX%2dVz36J+QvYr;If9d8pLj{mXNar2CA;esWZ=)%0US z7?@Pwaf(NsYi*!{O?hi$%i@AY60QBrgQPX@gRBJcZD@pkdG&CC#1KVB^wRK)YI=2) zCj}>mK%B2fr5^Ie;dp=-j2!TRw@v~k?hUt*zvn8Q;OKu%IxAf}TNcw!>|8)0Wnexhn(%yWxEA#C+R`mN3LY*YxFFE8Q*tn15vKyZA7U@J|WNoySh z<5uaTMoMT;IF32H_m|3{fR|ZlBDOUMe42M;i)=J*Pps$ymbQnBZF<&VX<@7e4iFe` zD3SaCM{Pldsqylwy21pd5a&KukU+({m-8>)Y=+rxJGsxHBTU{PLarzj7LEDUFJ+@k zXw;S{Z{sy?LMo>pfBd?)PnLAcDUZNFymMGea7Q`$nmy6D{PtgRrN)Z_$lV`Tv#tX zFq^!i&Tz`&19wfV!q(ZUUH!dQgxVbp zvFkvYq1X0_0+>}s++Ha;hLp>s2EzDg)LbZL^{8i$)_dfjy*VZOR+kxd$v0TUycZF> zYjkSQ@ITPxn6Z!7P1CoN@!VVo7$~gs%hy&o>6Q3mjU?%W9OtK*H(evOkv*4O$0pY2 z$#{IHmlUQ&UMopmR`Tb1TC$D3)e59y?F%4=jzJo!&z;A!!VS&{8mod4h*Mch|G7{g&nS zkLLh1Vx*ZacwB%(L-*eM?y>KyFcHg~ha8nq<=Io4OHA(U5epd3WTH8Hwmp_xl^DoZlsyrelX})o zB`pN7Zi})FncvIXZ}U*yi}%@sN~cTt7!ss|(pPZ?8XBh-c{E@HjqgZtPHVF(*#9I~ z}ydS6bR$=Qy=Ir z(SL-ywXifX;H|{+ioH3sV@;lg&w6g^+P<(-N=wDxvL38kk48vH%l6)BDerMsLr!No zSoeubU`~k%8)SSdTG!0a(zqE5p#x_WhxM&IlZXXc-X9SU#0h)^d*#S(NprM)<9n{R zjiN zQa~4Yn_b6zb0#wd)dYp*(6)NLAq#=4s!`5b+Bz`Gq@zN zOs*u9^`qMXiY4Swb6IL9q|G}Aka_B1pH4JUen&Gs+u7}gawqeTq;^=06Q(Vs2RZ!Sd z)YpROuA4Ofe!fXoYfOJE{s%Luja(YIr+C{_M}-8RlD49h+_+XUT}c$snbqhGTpkajGbbhS|f z$Up4{ZeVBx!}pn;M)7+OB=F(4-@Yd6Kx(;{c+1?cmV1BMt=}&J7yPP)kfxK2F(c_W@Sx77aprnq;u=y(E3W!!A(tiW}) zW9R_%@oyfDoO<4vcXKv-Cdq+dNP~4&tAS+Q#%(GTCUM4n75pfcf^sTwPIRxz49iDPO?|2 zu^^kc*fV>P)jF%8<|`^JDabQ!mj-0&F?&q~KldM30dJT2BYOTC`&x%FwGh-iI6?_` zcG(D@2`Jfp^SjiWWwzXNNstnV9qsTlxxI&iEtAnj`Gju-OL#XFTglH<8J_vFj>Fwy z_C?*CjCwpz@I&}~pHdyI%lbrgs7E=>mD`}>*1=gxTh3VvX%4i6$=8Ve|jD1aJva|NtN zDPz1sG4YARDO0g!YpnEB;kbg3;>EM_5q`oSNsClRnYlS3C1`dxJ~rWF;zamwN({qg zw|wjG*$sAOHjCTgyi=zxWqTlgga|HOwzwbqhb64&3;tw01rRD{e-xx0*Yh}d%ynsI zC`WZdTw9B`hVbt(k?(4KKW19kc)lUgx!m+}griZnA{sfdlg_d)0!OS;J5AFeZ3FS2 zq)J*OjdHI0Ej&I4&i}p3RLLRaWA0bmVZIdwCkl1t31!OGt%A>-OMVd;uR`44Z`HbJ zjtV>n%)F8Y$44cg9P7fQ|x(3Y!%29Y0g>| zYkvrEAQTlse-jj=n@2W`s9YeXF4$%q??<#o4l9@(_yV&7@A_~zESLHk_)Y&rC6yjP75V6&;T&HI$d7Zc7btAzF1jD*^G`>8 zJtgw=ZjViCLZ1ow#PZ-I7EUb0T3Lmk0xf(c6bQAeK-kG?8(kI3-XF@eKl`F*J4BG% znPjh;aGajc@NNcuE_HwNr@E2%c1sBrkKefn(sjefL}{QBa%&+fs?KF>^v3l!J?q%5 zj(zZ@h;l!3ik6I6Ht|G$Vl!jp)z`+%@2$>$J{#sI3FzGU+}%C+gkQ9`2}jR~6%`;U z+SqG1>f0Vh3l}B7azHcq!WBul9<*tDsZ0K{AX?PULu_??L0Z4BqX;?*Vi+7stXUh9`+_h4GDroH%oz-g|@?}r( zgfcoD9sv1=$O$kAhDlUkIR|5NQy~`0`5HURL40=1MM}n*L$mOQpP;funYNFuO+KoC zwjrn`V={x&hj zx5F1ysFp6ReEZG)=vdw^^;B64QJ-H(W7|4zAm~$o-7FE}4!jhtbWQJ{R%pHwg6$hM zj#31X$GVmSgYn1hrZn`Ce_kO3Ljnt6PK_%B`VcV58?0?p%>U7pQhnghGz8U#C4YoD?} zn@vRrJ>z%uH<4qM$_V+IalO5<+K@G(QWA}UV0HPRI`GU}*^g(0K$5_m_pSCcj%dD8 za2S`gW9V8H!jshP-rVsjK<*9AcM-f+c)}2dg1DG*K@z2oMPgJHRtDa}CD%O|i+S6x zV{0VpfBf$|i2Q#w#2|qYho^33lM;p9!G>pY@!q1J(|`3U@Nr0 z{PrKrvxH%sqn?SW`&OxCxXbPf>I#uxwWJ90im_wP-Fr8C-hijsDBzjRO$$I)I{oFO z3wIp#K0p|tx$$R44F*%-P`H3%aigJ_LB<~MDc)Hk*Uqs|hR04Qk}jqN>WMQvaU;LW 
z8^1URX3ZwYaO)(hx7}nEaMyY50pgok5(?gz1Zv&#d(?mt%DVcxJoZ{O8qbVUGOO-5 z)jAWPf9M2vzGr?j;DCCC_#s2limq-49Ti%@{~851q;;$-C!W$3cFVEpH*X@DkqXu? zY*{GJy|H})YMfz7;;W*DYUUoRWT>kO?hJel5nEKUc!Mm(O&i5d|LO(WG4jc=;xiqM z`_3QPP&}b~LCxG?P*>mrdB-vcg1e)7WRl`-#Ry2oR1h$Po17a4nx%sb)>3N z7$P7Y1C6)Q^R7?+EiCh=!EPeWiie~R#ze$=Hn`8an5f(h%o5lcynkv4*y_+=#eoco z!nAHQI0M6y)2(`z5|X@;*m)@gv!S-&Vnoj6z=WKACrVpT5H1rBX3?=m<8E}0;t(E+ zfRr@&Jj`6OeD?>~kAAA%R1=wT+J7oRTQ%_F;6NBuk`uZdk5fFTjmD>p-TW=oZ~8;O zY4o1bZx6?1io6D_p+TsawJ|ldty>;pidFyb8_gQmwzD_cj*N59vajpld>%xMnc;7b zwKFlA&{xQxDi%4&TJ8w>D)k|L=+%J3M9pH^nc)k~hF!d6B}VaW}9e3e42mfv8tS~cUUomO|4 z2{4j&bAk};*%DpRZmlqPnZ^-)+7NgOhJg>;)6a-hTR~<`b*?eyRrI|-THS4y?@kQg zZyCn7D?|zPnD=WsKMg^yhglwZM6Uo-dPntZK6G__1a@9b7JyxDESxv!Zo^1l3^sut ztqf43H;%(z3J9I{@W)?9Mt{Y%sD0#Z^WzF-K|jGOFpS>lk5Q7WRl^o#_DNrZf4ij= ze_X;)sU4T-Z5LdUC1jj+4C@=4$sLTfl%U-R_f<3q#Z?OoES}JxkLM!!Tyr5Fi4g=@ zpfEHqLa#JhqwHxI@-pMyJN+tTH;dh78DnniAx9fpxYfnS`MbjVlZu|zDEvyH?{(ta zP}i~d7B}YS4q&v6RH3DNChici$2GZzbQ9-YM2S<$ne!t0>%!IdsID=aQ{r7|PQ?x< z>WEXTdUngGu%H)AL9~-26LjWt;ReUh#r##YJgv9@!qsfuM`hB9=P0Uj#zLNjDbKxu zRH;$JN0d=wPxJ%CTkZWbq-m9EG2CEfk63fmU~O0JSz0F(TFc!qJDG{b6__~WZ`O?> z#NZagQw<0G#!I%*OSFh47sylSVi3HgQ$DHAlm1iMioPYJaZizZ4NC{uDw~TXJI$FI z`_64Z(S`_9CGY}KqGhs*Z0oj2BIRQvjY!=IuYLw;tIG2QK0ez%WrTVtrf#e{XCS>G znco#CY`UN0y@mF}(k*D2_F4Vbpdo{VbV2STE;UPkWOI-Z1M`q93GUtn+{w!?wcNAO%; zg(2J%<;d}#b*Xnzno$XD8Ll;DXUc*VP6lx}Z&xQ-^7NN3LpghC>bfdl@Q2JY0hYc- zbDp<;6V5c*88#N7aO1oEr8F*kT z1em8FD6*k~0w*XA!lHhYO<>%~Jy+2(o}8WOD9fyRrBe%%`0C+(@p@SZH(G^aK1H$;FAb6zyy;0%O05; z>-`pN@%tIr<%F;N*O&X~rq)>+#a)!DbDH~CswcGcuHX#Ne{Q+Rz#f;E@LE{#&gP01 z;guA7;4ZPAmGHChV2hbyiOnGv2*_kUQ61uf&IrAfqeJK)EwN1xq6P~KgF?Z(5?aP= zK0(M5v?WAOqb(@&-q_{42Ls729wyeJdi-nn||y^Ko3dew;Xk&4Hkdy{$aGyXWbN1>G<-&{`_FuNdm z_T4q~4LsaI#}TPn9$GSC@UG!0e;e;h%*%(k<2s}YLHG% znZUNZ0O!R5bMQt0gj^&Z&zsT-|E8tPv_~zJR{Yo|mEI{yH}|~)@dDb3^0vD@IU#`g zvkE1lE|P4u32T)jeEGY-B*94uW{gN=SLScDs4C39Iqiyo*sb9r+E=2`t-2weq3_h1 zGzJ}xHmP^RB<%bO_n`Uq1?Dh|@bCoIWP!B1(dR$>Keno~K=Y(HXfwfb9wnS+kM#%Y z>Gx!}`ve!Hq=TdVBorm5gb0piN}p8=si{-D@VLyhP3TSrsJl2*M-@i(w)`Dx$_mTH z2CR))(1qUXD5XE=iS;kooFU8oB{O`WLGLe4^Qii}4vHu@J)Wq$t)0h?VF+uGSbL-^ z-c=oW>KJ;)OItz|M9xMIlvT8sH%Oh|p1VqWL;toq!C1>Oe;cW2;fIXRT zM#&%lXyNo`VKzZ8MaN&f)o$YoY~B!qeX(h~d~BW->vp$xP}^4p^&I^NmpN`EqCH_& z50jwQ=Jlli)v5oIk|~bC*;o<&y(3yXIYPb;^pCM&=lpbjdmVX1^Re>2*~mSm9eLVn zRSaCI3N3TvTsu%Z2XhQxkisaR@YXiiVQz!n=iDJqqEy4`AzZahy9C=OJ`7^2j~LH@ z2&4Ih4+61u9TOIpc_y)iSJshA6F$iQo~4tMTJTtdLyrW*dLwU(tU3Ecd>$Opz;(Ww z?YqbN*H(;eNTn?vT7P0{uUgzBKM=3uHm41=`&r@8<^>(apbxz-vtPSsw`2tc%qf#Y zts)=5ir?9A=dpLk$U>B2wKMHV#hSW1Rfr2Jn=dBV3crh&(|7rYM7zGTZnwcC;aj z9C}M%zNmO4*M|*aRk~gTS0Q?eCb!sFKnxP>n}d=gzsBAd5bn8*8r&1Ysjq|XQuxS! zR+Xuan*b89zfi zljCAlqNkEG$?Mh3Gg0TqWr}p8O{BOsky+nla!9DZ9=KB2p?Tb z2_Jcbq`)tL#pZ~mpEvZD7h8p&jx(W@)@ae(%JT~xrzZvm%Hv*eI1N6%A!&(Dy=jZd zS}=)<+J288CT&RQue;R1T(=wK5w>KVGj#o;ml{@=onPY8HG#J=d=(A6rho_gBmhXb zNo(-}8pWu}88+VIxWWn>1tu{(ze^~en|G(u;-&<`FQLbnYDAl@yY`BO{r2CIo0Twg z;tZ1y8~WR(>*Nw-^0|jCD|buOyPF?TC$nbcq}xp+Shr^Lx@vpHetdYyAdl12HO^r3 z8Bj!Nayl(L{IJV9qAxmQ{I~Wjz{j<=YIyoh35D1vvY&+D6SM#^Be~jQ~wI*irrmLWJe%LUVbxh3m#AqXwX^hGq!(=Iqtt?|-o@mOx%Q91g$-a|DWSvN4XJlWBh#7>G%04mH7$!2F zNHcZ@?>z7IetSQ?AKowL%YB{uI_J~5&i{Ykzjoe*$BEA!Qc@Nj;}50O!`im1Tlsn? 
zvI+Q*Sx{tc`-3(6jH0cl2J1oZFHZ{<)QJRmAa8G!yKe+UzzT(qfpj?>-rbvsOk4=m z4w?~;GwiVyhX#HZ==SHlfHE3H6jza%_g`*>fNfPyN(HSF=MmC^b&&$|kV#hYmR*hBbr)<%5os5B`Q!O3PCuBraLe(I-S$_N^Rm5R9rULDnKr%MP$!(o6?>?!sPJ2yuZt3C=2p0TZ!Di+VJSr1Y_lR6O!3J zTDy-pjT%g>8vxy~el|8@!aV*ZHA85XDhuAboTa$h6!n*DE~YE4QNI5L07^WM?hG{< zzhjq>y8xeAQkAsERFp&WB{_zk@V@KWez5hVpA$+=4GgrrWka%Dk@F*#M+Xl{00YVt&fyk2za2$555?;uM=J9F z+&wVow^v}9jb295xpP}%179oEVu6nOSve>2nFm5SV{zi^24asM!S|v*`v|j@FMQwz zMYiKjWt@Lz{3Y5-&J4abFy+vdSSM~oer&Rg>=T|Dc!@TDq5F{YHNTc(4(LabLMp1e zs4upKGR`rv*ybTpyuU1_{%f9Ka$cK{MJTu8Z8+We@2%zWn-WvAiEi^?<7j%LN zY-}g!|8N2D7?`_^-6|C9M1HDLFPs7-4zfhcPzq+$xi8$x+iWR4$&Mw55WFELEUIR- zV`ry-;pk(6-9Y?ilfu^nQdNTIZ!JctOD3cTKJY=I+QTD{+bLTK>(34)rVCWfiDhq` z`;19Qy*|7wV>LS&edPsXn)rIWzb*b;v=YRLKXNNiREdcAP-Ydw-{2%dio>(fwSk2z0wB9$jjI|4JQd#qe)@&(RAP^POWTiE`O&FC?QwQHW+EB> zOBvW)UB6cb5LwJ-l^i;sRz0G1dRit(S$LYuW_40Wp3R_BT4YVvM3~ath>LgICA#|D zQX!*~b1*t6-%;PBL31lC(6NhBFXv~?u4zpgf1h&0g6LM)c-Bdq?MzDmikhS^#^=-y zAJ1mICr_KAeF;C*wn=c6NHK0&JkW74^s7M0mWr6%n~r$0CpN z59w!GL*?_St@ksSEzRdGW+h9bA0<24kc>4Ah5Yli!f}Ji>O~_wTR!uOdOb%RHk@n` zWyH$;lw*}-b|{M)fK7L_pU@pjXUi&W4?7dQrZD7KF_T;fa?zK*7%cdTQfjis8T$oC z6rjHt^}cZ+LHB8vf0G@?oXBxEN@F*Vb_$6ZJ`fcg-+xpCMmXM`58jg-VyP-0IW#^F zhmRN>kui+STcb*YN-_e6XUp{cF8yjgb{*ep$`SnQ~_qyLDH%3P6wfjyHn!Ft2C z&i#CVh95B&a=!O{i)=1zf5wQJkzpT9zH+rp1doCoub6jwk&f@KKHG5**BN zH?;7Fb+Uq^(Y*lwS=hXq_Lv}{30_ErGL4U}z08d3!Vlg5qYvDnx3{!$5A5)0uyN1% z@{-#~Yd0zCTZ@&Rmw#dep?lrC#n}T?^LPvDdu?H}I{lLaUC*B{_sZ2CAJ{FdbrVfd zwAxT(lB)K%P9YW8^(+9}SxDR{U~be8!+;l|kTV0p7u_^2EI+QqQy|1xrmC)TDa^|+ zfSReBDm$Mt&dag>k^xN?IQUJY?3{c0i)pO3jaj&t^S5ca zbWv>S<*MMohZT+fuHP6o7c%l<@Y04&IqH^o61;-@Q~^O^&fz?rkg_wv3@5wu#L?#A z>vv9fUnji|F-96)6zI0(fg9dIAI?@{&bGVbLtI>IUqN*c+(WfQnmifswgJz6a zzwL+OjJ{bJ#-RHC=8&>S(+HNV17C#1?L^w}yvg9FqtM^S`Ox0j8qN|u+{5K5W%0a_ zryben(iPq9?p`JD-Qa$Nz?Yn^$8O2QX~x#-(lYa(SR403C*IMb6aK(Q)n+{kn$~|k zuthVtmF4!L7oKe#X4opNR4HE=7;bI#PD&M5n}fGsxrbL;s(h02^~S^o(MyN>-loCd zc&(bSf=^}R9^Edt<(KeNQUR~~wivQ8O?pFb+SANDV}1uta%G9KnJP&2-s4NE>FZIF zA5%+u^5zTC>mLwFQd>hki$3xnv_kz;T0v)e7N5|xVD<$P&mZ#DW|-gsvKlv=T`G$$15~>-lV@SWfFZ754G|0N<>mVhFnqK-Ix#s}BL~ zy(5WmAEqDp#eG8LxOnl5n}!E|wFJC9h~Pp!fw3Oux+j74JA`(&>}Tjg#)T2i5zr7r zPZa!SZG(WZ05~9`BX7jV4xekKduNXF6?SF`|2Wr4-N#r6=XpNF1L0j0*>Q^e-ni2$VyWXopeh3YS4=kqd2ct8#oMX1)4===~$3l$rc$ ziVNq@$g*fz!hQgWH=$NaVk6IsP;l*3v8ChCj^#Qn!<*AX-!^dsXg}3a_-gaLoG%0f z>-Q2^C8=Lkk>ci8QyxqK7`RH2vOayhX@JtDab1^sweYv9G+WARVfM z&RzPcB8XKhTqYYeuN&;dFA(E1qDw_0G$mOUurqYo9;-H)U-$2NO!v2;qIDY%qkq0F z8S5Nm6DnNX>Uk*_%=#BGU(PEfYCgjtk>l)gC+W*c8C)ga`VvdUjq_bm9B*;QBotU3 z`LS;36Vn^I#bd7<8D6%4KQ@WdTJtd`FnK0FC=V+BA4x#}BVcIZ2+{HlZhMKI$|5DP N8S9(r)#f literal 0 HcmV?d00001 From f90ac1285ea0fb7582a5462fb546b730f9139bbe Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Fri, 22 Sep 2023 13:25:39 +0000 Subject: [PATCH 22/83] Updates for R pipeline --- R/misc/Cropping_Rasters_to_three_cities.R | 4 +-- R/misc/read_crop.fn.R | 39 ----------------------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/R/misc/Cropping_Rasters_to_three_cities.R b/R/misc/Cropping_Rasters_to_three_cities.R index 7cf0188e..87537d98 100644 --- a/R/misc/Cropping_Rasters_to_three_cities.R +++ b/R/misc/Cropping_Rasters_to_three_cities.R @@ -2,7 +2,7 @@ rm(list=ls()) #setwd("~/Desktop/clim-recal/clim-recal/") -#setwd("/home/dyme/Desktop/clim-recal/clim-recal") +setwd("/home/dyme/Desktop/clim-recal/clim-recal") source("R/misc/read_crop.fn.R") library(tidyverse) @@ -80,7 +80,7 @@ lapply(cities, function(x){ cropname=x) }) -#### HADs - updated 360 calendar (to be run pending updated files) +#### HADs - updated 360 calendar var 
<- c("tasmax", "tasmin", "rainfall") diff --git a/R/misc/read_crop.fn.R b/R/misc/read_crop.fn.R index 48c235fb..a6a6e994 100644 --- a/R/misc/read_crop.fn.R +++ b/R/misc/read_crop.fn.R @@ -143,42 +143,3 @@ hads_read_crop_df <- function(var, #Character vector of selected variables - thi } - -## This function for the different file structure of the updated 360 calendar - to be updated when have confirmation about the files -hads_read_crop2 <- function(var, #Character vector of selected variables - this might need changing - fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") - rd, #path to results directory eg paste0(dd, "Cropped/three.cities/CPM/") - file.date, #Character, Date of HADs file to crop from in YYYYMMDD - crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work - cropname){ #Character - name of crop to be assigned to the returned vect - - var <- var - fp <- fp - bbox <- crop.area - cropname <- cropname - file.date <- file.date - - for(v in var){ - - HADs.files <- list.files(paste0(fp)) - files <- HADs.files[grepl(v, HADs.files)] - file.i <- grep(file.date,files) - files <- files[file.i:length(files)] - files.p <- paste0(fp, files) - - - # Load and convert remaining to single col dfs - dfL <- lapply(1:length(files.p), function(n){ - f <- files.p[[n]] - r <- rast(f) - r_c <- crop(r, bbox, snap="out") - - #Write - f <- files[[n]]#filename as it was read in - fn <- paste0(rd, cropname, "/" , f) - - writeCDF(r_c, fn, overwrite=TRUE) - }) - gc() - } -} From be16f8f0368b704d7ca9c161d90edb21a7528244 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 26 Sep 2023 13:50:29 +0000 Subject: [PATCH 23/83] Adding pre-processing to within method (qmap in this eg) Removing files no longer needed --- R/{misc => LCAT}/Data_Processing_todf.R | 0 R/bias-correction-methods/ThreeCitiesQmap.RMD | 73 +++++++++---------- R/misc/ConvertingAllCPMdataTOdf.R | 51 ------------- R/misc/calc.mean.sd.daily.R | 62 ---------------- 4 files changed, 33 insertions(+), 153 deletions(-) rename R/{misc => LCAT}/Data_Processing_todf.R (100%) delete mode 100644 R/misc/ConvertingAllCPMdataTOdf.R delete mode 100644 R/misc/calc.mean.sd.daily.R diff --git a/R/misc/Data_Processing_todf.R b/R/LCAT/Data_Processing_todf.R similarity index 100% rename from R/misc/Data_Processing_todf.R rename to R/LCAT/Data_Processing_todf.R diff --git a/R/bias-correction-methods/ThreeCitiesQmap.RMD b/R/bias-correction-methods/ThreeCitiesQmap.RMD index 7ab21753..673feeb7 100644 --- a/R/bias-correction-methods/ThreeCitiesQmap.RMD +++ b/R/bias-correction-methods/ThreeCitiesQmap.RMD @@ -20,71 +20,64 @@ rm(list=ls()) library(qmap) library(terra) +library(tidyverse) +library(doParallel) dd <- "/mnt/vmfileshare/ClimateData/" ``` -## 1. Convert data to dataframes +## 1. 
Pre-processing -Qmap uses df as inpt +Qmap uses df as inpt so below CPM data is loaded and converted to df -```{r} +```{r cpm hads processing} cities <- c("London", "Manchester", "Glasgow") -cities.cpm.dfs <- lapply(cities, function(x){ - - fp <- paste0(dd, "/Interim/CPM/three.cities/",x,"/grouped/") - files <- list.files(fp) - files.paths <- paste0(fp, files) +#file locations for cropped versions of CPM and updated hads data +fps <- c(paste0(dd, "Cropped/three.cities/CPM/"), + paste0(dd, "Cropped/three.cities/Hads.updated360/")) - # Load and to df - dfL <- lapply(files.paths, function(i){ - r <- rast(i) - rdf <- as.data.frame(r, xy=T) - return(rdf) - }) - - names(dfL) <- files - return(dfL) - - }) +x <- paste0(fps, rep(cities, 2)) -names(cities.cpm.dfs) <- cities -``` +#Run the conversion to df in parallel +cores <- detectCores() +cl <- makeCluster(cores[1]-1) +registerDoParallel(cl) -```{r} + foreach(x = x, + .packages = c("terra", "tidyverse"), + .errorhandling = 'pass') %dopar% { -cities.Hads.dfs <- lapply(cities, function(x){ - - fp <- paste0(dd, "Interim/HadsUK/three.cities/",x, "/") - files <- list.files(fp) - files <- files[!grepl("aux.json", files)] - files.paths <- paste0(fp, files) + fp <- paste0(x, "/") + files <- list.files(fp) + files.paths <- paste0(fp, files) - # Load and to df - dfL <- lapply(files.paths, function(i){ + # Load and to df + dfL <- lapply(files.paths, function(i){ r <- rast(i) rdf <- as.data.frame(r, xy=T) return(rdf) - }) + }) + + names(dfL) <- files + + #assign a nice name for easy referencing + obj.name <- gsub("/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/", "cpm.dfL.", x) + obj.name <- gsub("/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/", "obs.dfL.", obj.name) + + assign(dfL, obj.name) - names(dfL) <- files - return(dfL) - }) - -names(cities.Hads.dfs) <- cities + } + + ``` ## 2. 
Apply bias correction by variable/run The called function was written to apply the following models: -For tasmax and tasmin: - - -For precip ```{r} diff --git a/R/misc/ConvertingAllCPMdataTOdf.R b/R/misc/ConvertingAllCPMdataTOdf.R deleted file mode 100644 index 26809d12..00000000 --- a/R/misc/ConvertingAllCPMdataTOdf.R +++ /dev/null @@ -1,51 +0,0 @@ -# Script for converting all UKCP CPM input data to dataframes -# Just running for now on the existing projected data - -library(terra) -library(sp) -library(tidyverse) -library(doParallel) -library(doSNOW) -library(foreach) - -dd <- "/Volumes/vmfileshare/ClimateData/" - -# Data is massive so running in parallel -#To load objects in nodes as spatrasters cannot be serialised - see issue here: https://github.com/rspatial/terra/issues/36 - -Runs <- c("01", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "15") - -file.paths <- lapply(Runs, function(i){ - fp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/", i, "/latest/") - f <- list.files(fp) - files <- f[!grepl(".aux.xml", f)] - files.p <- paste0(fp, files) -}) - - -for(x in 1:12){ - - cores <- detectCores() - cl <- makeCluster(cores[1]-1) - registerDoSNOW(cl) - - Runpaths <- file.paths[[x]] #Subset to run paths - i <- 1:length(Runpaths) - - Run.dfs <- foreach(i = i, - .packages = c("terra"), - .errorhandling = 'pass') %dopar% { - p <- Runpaths[[i]] - r <- rast(p) - rdf <- as.data.frame(r) - return(rdf) - } - - stopCluster(cl) - - fn <- paste0("Run.i.",x,"as.df_check.RDS") - saveRDS(Run.dfs, file=fn) - - remove(Run.dfs) - gc() -} diff --git a/R/misc/calc.mean.sd.daily.R b/R/misc/calc.mean.sd.daily.R deleted file mode 100644 index a610c045..00000000 --- a/R/misc/calc.mean.sd.daily.R +++ /dev/null @@ -1,62 +0,0 @@ -rm(list=ls()) - -library(terra) -library(sp) -library(tidyverse) -library(doParallel) -library(doSNOW) -library(foreach) - -Runs <- c("01", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "15") - -#these files needs sorting - currently just after RDS -files <- list.files("/Users/rbowyer/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/tempdata/") -files <-files[grepl("df.RDS",files)] -fp <- paste0("/Users/rbowyer/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/tempdata/", files) - -setwd("/Users/rbowyer/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/tempdata/") - -#Syst time reading df and converting - - -start <- Sys.time() - -for(i in 1:6){ - -dl <- readRDS(fp[[i]]) -df <- dl %>% reduce(cbind) - -#df.means_hist <- colMeans(df[c(1:7200)], na.rm=T) -#df.means_hist <- as.data.frame(df.means_hist) -#df.sds_hist <- sapply(df[c(1:7200)], sd, na.rm=T) -#df.sds_hist <- as.data.frame(df.sds_hist) -#df.avs_hist <- cbind(df.means_hist, df.sds_hist) - -#r <- Runs[[i]] -#fn <- paste0("df.avs_hist_Run",i, ".csv") -#write.csv(df.avs_hist, fn) - -#df.means_Y21_Y40 <- colMeans(df[c(7201:14400)], na.rm=T) -#df.means_Y21_Y40 <- as.data.frame(df.means_Y21_Y40) -#df.sds_Y21_Y40 <- sapply(df[c(7201:14400)], sd, na.rm=T) -#df.sds_Y21_Y40 <- as.data.frame(df.sds_Y21_Y40) -#df.avs_Y21_Y40 <- cbind(df.means_Y21_Y40, df.sds_Y21_Y40) - -#fn <- paste0("df.Y21_Y40_Run",i, ".csv") -#write.csv(df.avs_Y21_Y40, fn) - -df.means_Y61_Y80 <- colMeans(df[c(14401:21600)], na.rm=T) -df.means_Y61_Y80 <- as.data.frame(df.means_Y61_Y80) -df.sds_Y61_Y80 <- sapply(df[c(14401:21600)], sd, na.rm=T) -df.sds_Y61_Y80 <- as.data.frame(df.sds_Y61_Y80) -df.avs_Y61_Y80 <- cbind(df.means_Y61_Y80, df.sds_Y61_Y80) - -fn <- paste0("df.Y61_Y80_Run",i, ".csv") -write.csv(df.avs_Y61_Y80, fn) - -remove(dl) 
-remove(df) -gc() - -} -end <- Sys.time() From 879e6085968d271e2cd14bb1f74c74dbc4e8cc22 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 26 Sep 2023 16:25:22 +0000 Subject: [PATCH 24/83] Sep'ing script for conversion to df --- .../converting_city_crops_to_df.R | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 R/pre-processing/converting_city_crops_to_df.R diff --git a/R/pre-processing/converting_city_crops_to_df.R b/R/pre-processing/converting_city_crops_to_df.R new file mode 100644 index 00000000..5bcf08cb --- /dev/null +++ b/R/pre-processing/converting_city_crops_to_df.R @@ -0,0 +1,144 @@ +## R pre-processing - Converting to dataframe +# Most input to bias correction methods in R need dfs + +# Might be better in future to seperate it out differently (ie not run hads and cpm in same loop, or by variable ) + +dd <- "/mnt/vmfileshare/ClimateData/" +run <- c("05", "06", "07", "08") +var <- c("tasmax", "tasmin", "pr") + +x <- c("London", "Manchester", "Glasgow") + +# Where to write the results (note subfolder added as name of city/x above) +rd <- c(paste0(dd, "Interim/CPM/Data_as_df/"), paste0(dd, "Interim/Hads.updated360/Data_as_df/")) + +#file locations for cropped versions of CPM and updated hads data +fps <- c(paste0(dd, "Cropped/three.cities/CPM/"), + paste0(dd, "Cropped/three.cities/Hads.updated360/")) + +fps <- paste0(fps, rep(x, 2)) + +#Run the conversion to df in parallel +cores <- detectCores() +cl <- makeCluster(cores[1]-1) +registerDoParallel(cl) + +foreach(x = x, + .packages = c("terra", "tidyverse"), + .errorhandling = 'pass') %dopar% { + + fp <- fps[grepl(x, fps)] + fp <- paste0(fp, "/") + files <- list.files(fp) + files.paths.all <- paste0(fp, files) + + #group in runs and in variable + + for(v in var){ + if(v =="pr"){ + for(r in run){ + + files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(r, files.paths.all)&grepl("CPM", files.paths.all)] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] + r <- rast(p1) + rdf1 <- as.data.frame(r, xy=T) + + # Load and convert remaining to single col dfs + dfL <- lapply(2:length(file.paths), function(i){ + p <- files.paths[[i]] + r <- rast(p) + rdf <- as.data.frame(r) + return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- cbind(rdf1, df) + + rd1 <- rd[grepl("CPM", rd)] + fn <- paste0(rd1, x, "/", v, "_","Run",r, ".csv") + write.csv(df, fn, row.names = F) + + #Hads + files.paths <- files.paths.all[grepl(v, files.paths.all)&grepl("Hads", files.paths.all)] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] + r <- rast(p1) + rdf1 <- as.data.frame(r, xy=T) + + # Load and convert remaining to single col dfs + dfL <- lapply(2:length(file.paths), function(i){ + p <- files.paths[[i]] + r <- rast(p) + rdf <- as.data.frame(r) + return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- cbind(rdf1, df) + + rd2 <- rd[grepl("Hads", rd)] + fn <- paste0(rd2, x, "/", v, ".csv") + write.csv(df, fn, row.names = F) + + + } else{ #Sep run for where rainfall being called as different name in hads + for(r in run){ + + files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(r, files.paths.all)&grepl("CPM", files.paths.all)] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] + r <- rast(p1) + rdf1 <- as.data.frame(r, xy=T) + + # Load and convert remaining to single col dfs + dfL <- lapply(2:length(file.paths), function(i){ + p <- files.paths[[i]] + r <- rast(p) + rdf <- as.data.frame(r) + 
return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- cbind(rdf1, df) + + rd1 <- rd[grepl("CPM", rd)] + fn <- paste0(rd1, x, "/", v, "_","Run",r, ".csv") + write.csv(df, fn, row.names = F) + + #Hads + files.paths <- files.paths.all[grepl("rainfall", files.paths.all)&grepl("Hads", files.paths.all)] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] + r <- rast(p1) + rdf1 <- as.data.frame(r, xy=T) + + # Load and convert remaining to single col dfs + dfL <- lapply(2:length(file.paths), function(i){ + p <- files.paths[[i]] + r <- rast(p) + rdf <- as.data.frame(r) + return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- cbind(rdf1, df) + + rd2 <- rd[grepl("Hads", rd)] + fn <- paste0(rd2, x, "/", v, ".csv") + write.csv(df, fn, row.names = F) + + + } + } + } + + + +stopCluster(cl) +gc() + From 44175ac53fc89d5c7272d5606d8128771c270613 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 28 Sep 2023 08:37:42 +0000 Subject: [PATCH 25/83] updating for easier R pipeline --- R/bias-correction-methods/ThreeCitiesQmap.RMD | 7 ++++-- .../apply_qmapQuant_to_crpd_df_fn.R | 24 +++++++++---------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/R/bias-correction-methods/ThreeCitiesQmap.RMD b/R/bias-correction-methods/ThreeCitiesQmap.RMD index 673feeb7..4416090f 100644 --- a/R/bias-correction-methods/ThreeCitiesQmap.RMD +++ b/R/bias-correction-methods/ThreeCitiesQmap.RMD @@ -67,11 +67,14 @@ registerDoParallel(cl) obj.name <- gsub("/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/", "cpm.dfL.", x) obj.name <- gsub("/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/", "obs.dfL.", obj.name) - assign(dfL, obj.name) + assign(obj.name, dfL, envir=.GlobalEnv) } - + stopCluster(cl) + gc() + + ``` ## 2. Apply bias correction by variable/run diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index e17111fc..85dfd5a9 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -8,23 +8,23 @@ library(data.table) library(qmap) -apply_bias_correction_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset - var, #Meterological variables - Runs){ +apply_qmapQUANT_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset + var, #Meterological variables - as in files + Runs, #Run as in name of files + ## These args to be passed to qmapQUANT itself: + qstep, # numeric value between 0 and 1, e.g 0.1. The quantile mapping is fitted only for the quantiles defined by quantile(0,1,probs=seq(0,1,by=qstep). + nboot, #numeric value 1 or greater - nboot number of bootstrap samples used for estimation of the observed quantiles.If nboot==1 the estimation is based on all (and not resampled) data. 
+ + type #interpolation method to use for fitting data to the predictions )(eg linear, tricubic) + + ){ i <- region for(r in Runs){ for(v in var){ if(v!="pr"){ - dd <- "/mnt/vmfileshare/ClimateData/" - - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - + obj.df <- ls(pattern=paste0(c(v,i,obs), collapse="|")) #Extract object based on name from global env #subset file list to var obs.var <- obs[grepl(v,obs)] @@ -151,7 +151,7 @@ for(r in Runs){ nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type=type) qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") From 4ca30d4bbe548d8a8adee3a19df7628e9e414d01 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Thu, 28 Sep 2023 11:20:28 +0000 Subject: [PATCH 26/83] Update to script to convert city crops to data frame --- .../converting_city_crops_to_df.R | 185 +++++++++--------- 1 file changed, 96 insertions(+), 89 deletions(-) diff --git a/R/pre-processing/converting_city_crops_to_df.R b/R/pre-processing/converting_city_crops_to_df.R index 5bcf08cb..9bdd7ca3 100644 --- a/R/pre-processing/converting_city_crops_to_df.R +++ b/R/pre-processing/converting_city_crops_to_df.R @@ -1,7 +1,14 @@ ## R pre-processing - Converting to dataframe + +rm(list=ls()) + # Most input to bias correction methods in R need dfs # Might be better in future to seperate it out differently (ie not run hads and cpm in same loop, or by variable ) +library(qmap) +library(terra) +library(tidyverse) +library(doParallel) dd <- "/mnt/vmfileshare/ClimateData/" run <- c("05", "06", "07", "08") @@ -10,13 +17,11 @@ var <- c("tasmax", "tasmin", "pr") x <- c("London", "Manchester", "Glasgow") # Where to write the results (note subfolder added as name of city/x above) -rd <- c(paste0(dd, "Interim/CPM/Data_as_df/"), paste0(dd, "Interim/Hads.updated360/Data_as_df/")) - -#file locations for cropped versions of CPM and updated hads data -fps <- c(paste0(dd, "Cropped/three.cities/CPM/"), - paste0(dd, "Cropped/three.cities/Hads.updated360/")) +rd <- paste0(dd, "Interim/CPM/Data_as_df/three.cities/") -fps <- paste0(fps, rep(x, 2)) +#file locations for cropped versions of CPM +fps <- paste0(dd, "Cropped/three.cities/CPM/") +fps <- paste0(fps,x) #Run the conversion to df in parallel cores <- detectCores() @@ -35,110 +40,112 @@ foreach(x = x, #group in runs and in variable for(v in var){ - if(v =="pr"){ for(r in run){ files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(r, files.paths.all)&grepl("CPM", files.paths.all)] # Read in 1st runpath as df with xy coords to ensure overlay p1 <- files.paths[[1]] - r <- rast(p1) - rdf1 <- as.data.frame(r, xy=T) - - # Load and convert remaining to single col dfs - dfL <- lapply(2:length(file.paths), function(i){ - p <- files.paths[[i]] - r <- rast(p) - rdf <- as.data.frame(r) - return(rdf) - }) - - df <- dfL %>% reduce(cbind) - df <- cbind(rdf1, df) - - rd1 <- rd[grepl("CPM", rd)] - fn <- paste0(rd1, x, "/", v, "_","Run",r, ".csv") - write.csv(df, fn, row.names = F) - - #Hads - files.paths <- files.paths.all[grepl(v, files.paths.all)&grepl("Hads", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] - r <- rast(p1) - rdf1 <- as.data.frame(r, xy=T) + rast <- rast(p1) + rdf1 <- 
as.data.frame(rast, xy=T) # Load and convert remaining to single col dfs - dfL <- lapply(2:length(file.paths), function(i){ + dfL <- lapply(2:length(files.paths), function(i){ p <- files.paths[[i]] - r <- rast(p) - rdf <- as.data.frame(r) + rast <- rast(p) + rdf <- as.data.frame(rast) return(rdf) }) df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - rd2 <- rd[grepl("Hads", rd)] - fn <- paste0(rd2, x, "/", v, ".csv") + fn <- paste0(rd, x, "/", v, "_","Run",r, ".csv") write.csv(df, fn, row.names = F) - } else{ #Sep run for where rainfall being called as different name in hads - for(r in run){ - - files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(r, files.paths.all)&grepl("CPM", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] - r <- rast(p1) - rdf1 <- as.data.frame(r, xy=T) - - # Load and convert remaining to single col dfs - dfL <- lapply(2:length(file.paths), function(i){ - p <- files.paths[[i]] - r <- rast(p) - rdf <- as.data.frame(r) - return(rdf) - }) - - df <- dfL %>% reduce(cbind) - df <- cbind(rdf1, df) - - rd1 <- rd[grepl("CPM", rd)] - fn <- paste0(rd1, x, "/", v, "_","Run",r, ".csv") - write.csv(df, fn, row.names = F) - - #Hads - files.paths <- files.paths.all[grepl("rainfall", files.paths.all)&grepl("Hads", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] - r <- rast(p1) - rdf1 <- as.data.frame(r, xy=T) - - # Load and convert remaining to single col dfs - dfL <- lapply(2:length(file.paths), function(i){ - p <- files.paths[[i]] - r <- rast(p) - rdf <- as.data.frame(r) - return(rdf) - }) - - df <- dfL %>% reduce(cbind) - df <- cbind(rdf1, df) - - rd2 <- rd[grepl("Hads", rd)] - fn <- paste0(rd2, x, "/", v, ".csv") - write.csv(df, fn, row.names = F) + } + } + } +stopCluster(cl) +gc() + + +#HADS +rd <- paste0(dd, "Interim/HadsUK/Data_as_df/three.cities/") + +#file locations for cropped versions of HADs +fps <- paste0(dd, "Cropped/three.cities/Hads.updated360/") +fps <- paste0(fps,x) + +#Run the conversion to df in parallel +cores <- detectCores() +cl <- makeCluster(cores[1]-1) +registerDoParallel(cl) + +foreach(x = x, + .packages = c("terra", "tidyverse"), + .errorhandling = 'pass') %dopar% { + fp <- fps[grepl(x, fps)] + fp <- paste0(fp, "/") + files <- list.files(fp) + files.paths.all <- paste0(fp, files) - } - } - } + #group in runs and in variable + for(v in var){ + if(v!="pr"){ + + files.paths <- files.paths.all[grepl(v, files.paths.all)] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] + rast <- rast(p1) + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs + dfL <- lapply(2:length(files.paths), function(i){ + p <- files.paths[[i]] + rast <- rast(p) + rdf <- as.data.frame(rast) + return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- cbind(rdf1, df) + - + fn <- paste0(rd, x, "/", v, ".csv") + write.csv(df, fn, row.names = F) + + + } else { + + files.paths <- files.paths.all[grepl("rainfall", files.paths.all)] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] + rast <- rast(p1) + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs + dfL <- lapply(2:length(files.paths), function(i){ + p <- files.paths[[i]] + rast <- rast(p) + rdf <- as.data.frame(rast) + return(rdf) + }) + + df <- dfL %>% reduce(cbind) + df <- cbind(rdf1, df) + + + fn <- paste0(rd, x, "/", v, ".csv") + write.csv(df, fn, row.names = F) + + 
} + } + } stopCluster(cl) gc() - From 580d36fce74d721da0bf08957cea9ef6a2ee5c03 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Fri, 29 Sep 2023 20:12:23 +0000 Subject: [PATCH 27/83] Add notes and fixing function --- R/bias-correction-methods/ThreeCitiesQmap.RMD | 79 +- .../apply_qmapQuant_to_crpd_df_fn.R | 791 ++++-------------- .../converting_city_crops_to_df.R | 4 +- 3 files changed, 178 insertions(+), 696 deletions(-) diff --git a/R/bias-correction-methods/ThreeCitiesQmap.RMD b/R/bias-correction-methods/ThreeCitiesQmap.RMD index 4416090f..ba018f96 100644 --- a/R/bias-correction-methods/ThreeCitiesQmap.RMD +++ b/R/bias-correction-methods/ThreeCitiesQmap.RMD @@ -27,64 +27,57 @@ dd <- "/mnt/vmfileshare/ClimateData/" ``` -## 1. Pre-processing +## Apply bias correction by variable/run in `qmap` -Qmap uses df as inpt so below CPM data is loaded and converted to df +`qmap` offers a few different bias correction options. +The below chunks calls a function that wraps the qmap function to loop over the cities and apply the bias correction +It returns a list object, where for each model, we have: +- a matrix of the calibration observation period (01-12-1980 to 30-11-2010 ) +- a matrix of values relating to the validation obs period (hads data 01-12-2010 to 01-12-2020) +the raw values and the adjusted values for the CPM for the +"t.obs", "val.df", "t.cal", "t.proj", "qm1.hist", "qm1.val.proj" -```{r cpm hads processing} +Data has been pre-processed from cropped using 'converting_city_crops_to_df.R' to .csv -cities <- c("London", "Manchester", "Glasgow") +##1. Empirical Quantile Mapping -#file locations for cropped versions of CPM and updated hads data -fps <- c(paste0(dd, "Cropped/three.cities/CPM/"), - paste0(dd, "Cropped/three.cities/Hads.updated360/")) +```{r warning = F} +setwd("/home/dyme/Desktop/clim-recal/clim-recal") +source("R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") -x <- paste0(fps, rep(cities, 2)) - -#Run the conversion to df in parallel -cores <- detectCores() -cl <- makeCluster(cores[1]-1) -registerDoParallel(cl) - - foreach(x = x, - .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - - fp <- paste0(x, "/") - files <- list.files(fp) - files.paths <- paste0(fp, files) - - # Load and to df - dfL <- lapply(files.paths, function(i){ - r <- rast(i) - rdf <- as.data.frame(r, xy=T) - return(rdf) - }) - - names(dfL) <- files - - #assign a nice name for easy referencing - obj.name <- gsub("/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/", "cpm.dfL.", x) - obj.name <- gsub("/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/", "obs.dfL.", obj.name) - - assign(obj.name, dfL, envir=.GlobalEnv) - - } +cities <- c("London", "Glasgow", "Manchester") +run <- c("Run05", "Run06", "Run07","Run08") +var <- c("tasmax", "tasmin", "pr") - stopCluster(cl) - gc() - +lapply(cities, function(x){ + rd <- paste0(dd, "Debiased/R/QuantileMapping/three.cities/", x, "/") + apply_qmapQUANT_to_cropped_df(region = x, + var = var, + Runs = run, + pd = paste0(dd, "Interim/CPM/Data_as_df/three.cities/"),#Parent directory where dataframes of cpm data are located + pd.obs = paste0(dd, "Interim/HadsUK/Data_as_df/three.cities/"),#Parent directory where dataframes of obs data are + val.startdate = "20101201", #The first date of the validation period. eg 20101201 All dates before this time will be taken as the calibration + ## These args to be passed to qmapQUANT itself: + wet.day = ifelse(v=="pr", T, F), + qstep = 0.1, # numeric value between 0 and 1, e.g 0.1. 
The quantile mapping is fitted only for the quantiles defined by quantile(0,1,probs=seq(0,1,by=qstep). + nboot = 1, #numeric value 1 or greater - nboot number of bootstrap samples used for estimation of the observed quantiles. If nboot==1 the estimation is based on all (and not resampled) data. + + type = "linear", #interpolation method to use for fitting data to the predictions (eg linear, tricubic) + rd = rd) +}) + + ``` + -The called function was written to apply the following models: ```{r} -dsource("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") -var <- c("tasmax", "tasmin", "pr") -Runs <-c("05", "06", "07", "08") -lapply(cities, function(x){ - apply_qmap_to_cropped_dfL(region=x, var=var, Runs = Runs)}) ``` -``` +``` \ No newline at end of file diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index 85dfd5a9..fa004a80 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -11,708 +11,197 @@ library(data.table) library(qmap) apply_qmapQUANT_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset var, #Meterological variables - as in files Runs, #Run as in name of files - ## These args to be passed to qmapQUANT itself: + pd, #Parent directory where dataframes of data are + pd.obs, #Parent directory where dataframes of obs data are + val.startdate, #The first date of the validation period. eg 20101201 All dates before this time will be taken as the calibration + rd, #where to store the results list output by this function + ## These args to be passed to qmapQUANT itself: + wet.day, qstep, # numeric value between 0 and 1, e.g 0.1. The quantile mapping is fitted only for the quantiles defined by quantile(0,1,probs=seq(0,1,by=qstep). nboot, #numeric value 1 or greater - nboot number of bootstrap samples used for estimation of the observed quantiles.If nboot==1 the estimation is based on all (and not resampled) data. 
- type #interpolation method to use for fitting data to the predictions )(eg linear, tricubic) + type) #interpolation method to use for fitting data to the predictions )(eg linear, tricubic) - ){ + { i <- region + pd <- pd + pd.obs <- pd.obs + rd <- rd + qstep <- qstep + nboot <- nboot + type <- type for(r in Runs){ for(v in var){ - if(v!="pr"){ - obj.df <- ls(pattern=paste0(c(v,i,obs), collapse="|")) #Extract object based on name from global env - #subset file list to var - obs.var <- obs[grepl(v,obs)] + wet.day <- wet.day + obs.files <- list.files(paste0(pd.obs, i), full.names = T) + obs.files.v <- obs.files[grepl(v, obs.files)] + obs.df <- read.csv(obs.files.v) - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - + #subset file list to var row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - + #Create a data frame for the obs data in the validation period and leave 'obs.df' as the calibration + n1 <-min(grep(val.startdate, names(obs.df))) + val.df <- obs.df[c(n1:ncol(obs.df))] + obs.df <- obs.df[c(1:n1-1)] - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.df <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) + #Load in CPM data - df contains all 100 years + cpm.files <- list.files(paste0(pd, i), full.names = T) + cpm.files.v <- cpm.files[grepl(v, cpm.files)&grepl(r, cpm.files)] - row.names(df)<- paste0(df$x, "_", df$y) - df$x <- NULL - df$y <- NULL - return(df) - }) + cpm.df <- fread(cpm.files.v) + cpm.df <- as.data.frame(cpm.df) + row.names(cpm.df) <- paste0(cpm.df$x, "_", cpm.df$y ) + cpm.df$x <- NULL + cpm.df$y <- NULL - cal.df <- cal.df %>% reduce(cbind) - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs - #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- - n2 <- min(grep("20091201-",names(cal.df))) + 29 + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 + cpm.cal <- cpm.df[c(1:n1-1)] + cpm.proj <- cpm.df[c(n1:ncol(cpm.df))] + remove(cpm.df) - #This is the first part of the validation dataset, but all the val will be added to the projection df for - #the sake of bias correction and assessed separately - proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] - cal.df <- cal.df[c(1:n2)] + #Some cells missing in hads so subset here jic + cal.df <- cpm.cal[which(row.names(cpm.cal)%in%row.names(obs.df)),] + proj.df<- cpm.proj[which(row.names(cpm.proj)%in%row.names(obs.df)),] - gc() - - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - 
df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - - remove("proj.df1") - remove("proj.df2") - -## **2. Wrangle the data** - - #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") - print(p) - #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove - obs.df <- obs.df[1:ncol(cal.df)] - -### Transpose the data sets + remove(cpm.cal) + remove(cpm.proj) - #Obs grid should be cols, observations (time) should be rows for linear scaling + #Grid ref should be cols, observations (time) should be rows cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - - + + ## **3. Empirical Quantile Mapping** -#(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and -#modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform -#quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") - print(p) - library(qmap) qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - + wet.day = wet.day, + qstep = qstep, + nboot = nboot) #nboot number of bootstrap samples used for estimation of the observed quantiles. - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type=type) - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") + qm1.hist <- doQmapQUANT(cal.df, qm1.fit, type=type) + qm1.proj <- doQmapQUANT(proj.df, qm1.fit, type=type) ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) - - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - - gc(reset=TRUE) - - } else { - -#### Precipitation - the HADs variable has is called 'rainfall' - dd <- "/mnt/vmfileshare/ClimateData/" - #Subset to Area - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - - - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.df <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) - - row.names(df)<- paste0(df$x, "_", df$y) - df$x <- NULL - df$y <- NULL - return(df) - }) - - cal.df <- cal.df %>% reduce(cbind) - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs - #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- - n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for - #the sake of bias correction and assessed separately - proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] - cal.df <- cal.df[c(1:n2)] - - gc() - - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - - remove("proj.df1") - remove("proj.df2") - - ## **2. 
Wrangle the data** - - #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_") - print(p) - #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove - obs.df <- obs.df[1:ncol(cal.df)] - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - ## **3. Empirical Quantile Mapping** - - #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and - #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform - #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_") - print(p) - - - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_") - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_") - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) - - p <- paste0("checkpoint5", v, "_", i, "_", r, "_") - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - - gc(reset=TRUE) + # Save data - lists of dfs for now (will be easier for assessment to have all to hand) + results.L <- list(obs.df, val.df, cal.df, proj.df, qm1.hist, qm1.proj) - - } - } + names(results.L) <- c("t.obs", "val.df", "t.cal", "t.proj", "qm1.hist", "qm1.val.proj") + base::saveRDS(results.L, file = paste0(rd, "/",r,"_",v, ".RDS")) + + rm(list=setdiff(ls(), c("v", "i", "pd", "pd.obs", "rd", "wet.day", "qstep", "nboot", "type", "r", "var", "Runs"))) + + gc(reset=TRUE) } + } } + -###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! -cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset - var, #Meterological variables - Runs, #Runs in form 'Run08'- matched input - N.new.segments,...){ #Numeric, Number of dfs to break down to, eg 4 - + +apply_qmapQUANT_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset + var, #Meterological variables - as in files + Runs, #Run as in name of files + pd, #Parent directory where dataframes of data are + pd.obs, #Parent directory where dataframes of obs data are + val.startdate, #The first date of the validation period. eg 20101201 All dates before this time will be taken as the calibration + rd, #where to store the results list output buy this + ## These args to be passed to qmapQUANT itself: + wet.day, + qstep, # numeric value between 0 and 1, e.g 0.1. The quantile mapping is fitted only for the quantiles defined by quantile(0,1,probs=seq(0,1,by=qstep). + nboot, #numeric value 1 or greater - nboot number of bootstrap samples used for estimation of the observed quantiles.If nboot==1 the estimation is based on all (and not resampled) data. 
+ + type) #interpolation method to use for fitting data to the predictions )(eg linear, tricubic) + + { + i <- region - N.new.segments<- N.new.segments - Runs <- Runs - var <- var - - for(r in Runs){ - for(v in var){ - for(y in 1:N.new.segments){ - if(v!="pr"){ - dd <- "/mnt/vmfileshare/ClimateData/" - - #Subset to Area - #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.df <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) - - row.names(df)<- paste0(df$x, "_", df$y) - df$x <- NULL - df$y <- NULL - return(df) - }) - - cal.df <- cal.df %>% reduce(cbind) - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs - #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- - n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for - #the sake of bias correction and assessed separately - proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] - cal.df <- cal.df[c(1:n2)] - - #Subset the dataframe iteratively depending on y - nrows.seg <- nrow(cal.df)/N.new.segments - y_1 <- y-1 + pd <- pd + pd.obs <- pd.obs + rd <- rd + qstep <- qstep + nboot <- nboot + type <- type - nr1 <- round(nrows.seg*y_1) + 1 - nr2 <- round(nrows.seg*y) - cal.df <- cal.df[nr1:nr2,] - - #proj data - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] - - remove("proj.df1") - remove("proj.df2") - - - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl(v,obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) +for(r in Runs){ + for(v in var){ + wet.day <- wet.day + obs.files <- list.files(paste0(pd.obs, i), full.names = T) + obs.files.v <- obs.files[grepl(v, obs.files)] + obs.df <- read.csv(obs.files.v) + #subset file list to var row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL + + #Create a data frame for the obs data in the validation period and leave 'obs.df' as the calibration + n1 <-min(grep(val.startdate, names(obs.df))) + val.df <- obs.df[c(n1:ncol(obs.df))] + obs.df <- obs.df[c(1:n1-1)] - #Subset to the rows which are in above (some will be missing) - obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] - - #Remove the dates not in the cpm - ## 
find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - - gc() - - - ## **2. Wrangle the data** - - #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) - print(p) - #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove - obs.df <- obs.df[1:ncol(cal.df)] - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - - ## **3. Empirical Quantile Mapping** - - #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and - #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform - #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) - print(p) - - library(qmap) - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - - p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) - - gc(reset=TRUE) - - - } else { - - #### Precipitation - the HADs variable has is called 'rainfall' - dd <- "/mnt/vmfileshare/ClimateData/" + #Load in CPM data - df contains all 100 years + cpm.files <- list.files(paste0(pd, i), full.names = T) + cpm.files.v <- cpm.files[grepl(v, cpm.files)&grepl(r, cpm.files)] - #Subset to Area - #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads - #Using 1980 - 2010 as calibration period - fp <- paste0(dd, "Interim/CPM/Data_as_df/") - cpm.files <- list.files(fp) + cpm.df <- fread(cpm.files.v) + cpm.df <- as.data.frame(cpm.df) + row.names(cpm.df) <- paste0(cpm.df$x, "_", cpm.df$y ) + cpm.df$x <- NULL + cpm.df$y <- NULL #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area - cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - - #subset to var and run - cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in - cal.df <- lapply(cpm.cal.var, function(x){ - df <- fread(paste0(fp, x)) - df <- as.data.frame(df) - - row.names(df)<- paste0(df$x, "_", df$y) - df$x <- NULL - df$y <- NULL - return(df) - }) + cpm.cal <- cpm.df[c(1:n1-1)] + cpm.proj <- cpm.df[c(n1:ncol(cpm.df))] + remove(cpm.df) - cal.df <- cal.df %>% reduce(cbind) + #Some cells missing in hads so subset here jic + cal.df <- cpm.cal[which(row.names(cpm.cal)%in%row.names(obs.df)),] + proj.df<- cpm.proj[which(row.names(cpm.proj)%in%row.names(obs.df)),] - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs - #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- - n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for - #the sake of bias correction and assessed separately - proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] - cal.df <- cal.df[c(1:n2)] - - #Subset the dataframe iteratively depending on y - nrows.seg <- nrow(cal.df)/N.new.segments - y_1 <- y-1 + remove(cpm.cal) + remove(cpm.proj) - nr1 <- round(nrows.seg*y_1) + 1 - nr2 <- round(nrows.seg*y) - cal.df <- cal.df[nr1:nr2,] - - - #proj data - yi <- paste0(i,c(2020,2040,2060), collapse="|") - cpm.proj <- cpm.files[grepl(yi, cpm.files)] - - #Subset to Area, var and run - cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in - proj.df2 <- lapply(cpm.proj, function(x){ - df <- as.data.frame(fread(paste0(fp, x))) - #Remove x and y cols - df[c(3:ncol(df))] - }) - - names(proj.df2) <- cpm.proj - - proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] - - remove("proj.df1") - 
remove("proj.df2") - - - #HADs grid observational data - fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") - files <- list.files(fp) - obs <- files[grepl(i, files)] - - #subset file list to var - obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years - obs.varc <- obs.var[grepl("1980", obs.var)] - obs.df <- fread(paste0(fp, obs.varc)) - obs.df <- as.data.frame(obs.df) - - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Subset to the rows which are in above (some will be missing) - obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] - - #Remove the dates not in the cpm - ## find col position of the first cpm date 19801201 - n1 <-min(grep("19801201", names(obs.df))) - obs.df <- obs.df[c(n1:ncol(obs.df))] - - gc() - - - ## **2. Wrangle the data** - - #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] - #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - - cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] - proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs - p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) - print(p) - #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - - ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - - #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.df <- obs.df[,!grepl(remove, names(obs.df))] - #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove - obs.df <- obs.df[1:ncol(cal.df)] - - ### Transpose the data sets - - #Obs grid should be cols, observations (time) should be rows for linear scaling - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - - ## **3. Empirical Quantile Mapping** - - #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and - #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform - #quantile mapping - p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) - print(p) - - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") - qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - - qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") - qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - - ## **4. 
Save the data** - p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) - print(p) - # Save data - lists of dfs for now (will be easier for assessment) - results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - - names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") - p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) - print(p) - base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - - p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) - print(p) - rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) - - gc(reset=TRUE) - - - - } - } - } - } - } + #Grid ref should be cols, observations (time) should be rows + + cal.df <- t(cal.df) + proj.df <- t(proj.df) + obs.df <- t(obs.df) + + +## **3. Empirical Quantile Mapping** + + library(qmap) + qm1.fit <- fitQmapQUANT(obs.df, cal.df, + wet.day = wet.day, + qstep = qstep, + nboot = nboot) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + + qm1.hist <- doQmapQUANT(cal.df, qm1.fit, type=type) + qm1.proj <- doQmapQUANT(proj.df, qm1.fit, type=type) + +## **4. Save the data** + # Save data - lists of dfs for now (will be easier for assessment to have all to hand) + results.L <- list(obs.df, val.df, cal.df, proj.df, qm1.hist, qm1.proj) + names(results.L) <- c("t.obs", "val.df", "t.cal", "t.proj", "qm1.hist", "qm1.val.proj") + base::saveRDS(results.L, file = paste0(rd, "/",r,"_",v, ".RDS")) + + rm(list=setdiff(ls(), c("v", "i", "pd", "pd.obs", "rd", "wet.day", "qstep", "nboot", "type", "r", "var", "Runs"))) + + gc(reset=TRUE) + } + } +} \ No newline at end of file diff --git a/R/pre-processing/converting_city_crops_to_df.R b/R/pre-processing/converting_city_crops_to_df.R index 9bdd7ca3..df9cee3c 100644 --- a/R/pre-processing/converting_city_crops_to_df.R +++ b/R/pre-processing/converting_city_crops_to_df.R @@ -41,8 +41,8 @@ foreach(x = x, for(v in var){ for(r in run){ - - files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(r, files.paths.all)&grepl("CPM", files.paths.all)] + rr <- paste0("_",r,"_") + files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(rr, files.paths.all)&grepl("CPM", files.paths.all)] # Read in 1st runpath as df with xy coords to ensure overlay p1 <- files.paths[[1]] From 2e6fa1df2b6235b9ac5aee61f17b39610a3876d5 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Tue, 3 Oct 2023 20:41:21 +0000 Subject: [PATCH 28/83] Small updated to wet.day and corrected related issue preventing run --- R/bias-correction-methods/ThreeCitiesQmap.RMD | 19 +--- .../apply_qmapQuant_to_crpd_df_fn.R | 107 +----------------- 2 files changed, 8 insertions(+), 118 deletions(-) diff --git a/R/bias-correction-methods/ThreeCitiesQmap.RMD b/R/bias-correction-methods/ThreeCitiesQmap.RMD index ba018f96..478ffe10 100644 --- a/R/bias-correction-methods/ThreeCitiesQmap.RMD +++ b/R/bias-correction-methods/ThreeCitiesQmap.RMD @@ -41,6 +41,8 @@ Data has been pre-processed from cropped using 'converting_city_crops_to_df.R' t ##1. Empirical Quantile Mapping + + ```{r warning = F} setwd("/home/dyme/Desktop/clim-recal/clim-recal") source("R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") @@ -59,7 +61,6 @@ lapply(cities, function(x){ pd.obs = paste0(dd, "Interim/HadsUK/Data_as_df/three.cities/"),#Parent directory where dataframes of obs data are val.startdate = "20101201", #The first date of the validation period. 
eg 20101201 All dates before this time will be taken as the calibration ## These args to be passed to qmapQUANT itself: - wet.day = ifelse(v=="pr", T, F), qstep = 0.1, # numeric value between 0 and 1, e.g 0.1. The quantile mapping is fitted only for the quantiles defined by quantile(0,1,probs=seq(0,1,by=qstep). nboot = 1, #numeric value 1 or greater - nboot number of bootstrap samples used for estimation of the observed quantiles. If nboot==1 the estimation is based on all (and not resampled) data. @@ -71,20 +72,6 @@ lapply(cities, function(x){ ``` - -The called function was written to apply the following models: - - ```{r} -dsource("/home/dyme/Desktop/clim-recal/clim-recal/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R") - -var <- c("tasmax", "tasmin", "pr") -Runs <-c("05", "06", "07", "08") - -lapply(cities, function(x){ - apply_qmap_to_cropped_dfL(region=x, var=var, Runs = Runs)}) - - -``` - +``` \ No newline at end of file diff --git a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R index fa004a80..e228c2a2 100644 --- a/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R +++ b/R/bias-correction-methods/apply_qmapQuant_to_crpd_df_fn.R @@ -34,13 +34,14 @@ apply_qmapQUANT_to_cropped_df <- function(region, #Region code - needs to relate for(r in Runs){ for(v in var){ - wet.day <- wet.day + wet.day <- ifelse(v=="pr", T, F) obs.files <- list.files(paste0(pd.obs, i), full.names = T) obs.files.v <- obs.files[grepl(v, obs.files)] - obs.df <- read.csv(obs.files.v) + obs.df <- fread(obs.files.v) + obs.df <- as.data.frame(obs.df) #subset file list to var - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y) obs.df$x <- NULL obs.df$y <- NULL @@ -98,7 +99,7 @@ for(r in Runs){ names(results.L) <- c("t.obs", "val.df", "t.cal", "t.proj", "qm1.hist", "qm1.val.proj") base::saveRDS(results.L, file = paste0(rd, "/",r,"_",v, ".RDS")) - rm(list=setdiff(ls(), c("v", "i", "pd", "pd.obs", "rd", "wet.day", "qstep", "nboot", "type", "r", "var", "Runs"))) + rm(list=setdiff(ls(), c("v", "i", "pd", "pd.obs", "rd", "val.startdate", "qstep", "nboot", "type", "r", "var", "Runs"))) gc(reset=TRUE) } @@ -107,101 +108,3 @@ for(r in Runs){ - - -apply_qmapQUANT_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset - var, #Meterological variables - as in files - Runs, #Run as in name of files - pd, #Parent directory where dataframes of data are - pd.obs, #Parent directory where dataframes of obs data are - val.startdate, #The first date of the validation period. eg 20101201 All dates before this time will be taken as the calibration - rd, #where to store the results list output buy this - ## These args to be passed to qmapQUANT itself: - wet.day, - qstep, # numeric value between 0 and 1, e.g 0.1. The quantile mapping is fitted only for the quantiles defined by quantile(0,1,probs=seq(0,1,by=qstep). - nboot, #numeric value 1 or greater - nboot number of bootstrap samples used for estimation of the observed quantiles.If nboot==1 the estimation is based on all (and not resampled) data. 
- - type) #interpolation method to use for fitting data to the predictions )(eg linear, tricubic) - - { - - i <- region - pd <- pd - pd.obs <- pd.obs - rd <- rd - qstep <- qstep - nboot <- nboot - type <- type - -for(r in Runs){ - for(v in var){ - wet.day <- wet.day - obs.files <- list.files(paste0(pd.obs, i), full.names = T) - obs.files.v <- obs.files[grepl(v, obs.files)] - obs.df <- read.csv(obs.files.v) - - #subset file list to var - row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) - obs.df$x <- NULL - obs.df$y <- NULL - - #Create a data frame for the obs data in the validation period and leave 'obs.df' as the calibration - n1 <-min(grep(val.startdate, names(obs.df))) - val.df <- obs.df[c(n1:ncol(obs.df))] - obs.df <- obs.df[c(1:n1-1)] - - #Load in CPM data - df contains all 100 years - cpm.files <- list.files(paste0(pd, i), full.names = T) - cpm.files.v <- cpm.files[grepl(v, cpm.files)&grepl(r, cpm.files)] - - cpm.df <- fread(cpm.files.v) - cpm.df <- as.data.frame(cpm.df) - row.names(cpm.df) <- paste0(cpm.df$x, "_", cpm.df$y ) - cpm.df$x <- NULL - cpm.df$y <- NULL - - #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 - cpm.cal <- cpm.df[c(1:n1-1)] - cpm.proj <- cpm.df[c(n1:ncol(cpm.df))] - remove(cpm.df) - - #Some cells missing in hads so subset here jic - cal.df <- cpm.cal[which(row.names(cpm.cal)%in%row.names(obs.df)),] - proj.df<- cpm.proj[which(row.names(cpm.proj)%in%row.names(obs.df)),] - - remove(cpm.cal) - remove(cpm.proj) - - #Grid ref should be cols, observations (time) should be rows - - cal.df <- t(cal.df) - proj.df <- t(proj.df) - obs.df <- t(obs.df) - - -## **3. Empirical Quantile Mapping** - - library(qmap) - qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = wet.day, - qstep = qstep, - nboot = nboot) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - - qm1.hist <- doQmapQUANT(cal.df, qm1.fit, type=type) - qm1.proj <- doQmapQUANT(proj.df, qm1.fit, type=type) - -## **4. 
Save the data** - - # Save data - lists of dfs for now (will be easier for assessment to have all to hand) - results.L <- list(obs.df, val.df, cal.df, proj.df, qm1.hist, qm1.proj) - - names(results.L) <- c("t.obs", "val.df", "t.cal", "t.proj", "qm1.hist", "qm1.val.proj") - base::saveRDS(results.L, file = paste0(rd, "/",r,"_",v, ".RDS")) - - rm(list=setdiff(ls(), c("v", "i", "pd", "pd.obs", "rd", "wet.day", "qstep", "nboot", "type", "r", "var", "Runs"))) - - gc(reset=TRUE) - } - } -} \ No newline at end of file From 8678c078ad2e63597c267736e6522d4a38ab5d69 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Wed, 4 Oct 2023 15:15:19 +0000 Subject: [PATCH 29/83] Rewriting BC assessment to be more general for input Resaving LCAT version to avoid confusion on merge --- R/LCAT/Assessing.BC.data.LCAT.RMD | 194 ++- .../WIP Assessing bias corrected data.Rmd | 1218 ----------------- .../Assessing_bc_data/Assessing_BC_Data.RMD | 797 +++++++++++ 3 files changed, 929 insertions(+), 1280 deletions(-) delete mode 100644 R/comparing-r-and-python/Reviewing-bias-correction-methods/WIP Assessing bias corrected data.Rmd create mode 100644 notebooks/Assessing_bc_data/Assessing_BC_Data.RMD diff --git a/R/LCAT/Assessing.BC.data.LCAT.RMD b/R/LCAT/Assessing.BC.data.LCAT.RMD index fb39507b..11752d27 100644 --- a/R/LCAT/Assessing.BC.data.LCAT.RMD +++ b/R/LCAT/Assessing.BC.data.LCAT.RMD @@ -1,5 +1,5 @@ --- -title: "Bias correction assessment of LCAT data" +title: "Bias correction assessment" author: "Ruth Bowyer" date: "`r format(Sys.Date())`" output: @@ -24,53 +24,157 @@ library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) library(kableExtra) -library(plotrix) #For taylor diagrams - -dd <- "/mnt/vmfileshare/ClimateData/" ``` ## **0. About** -LCAT require 'bias corrected' data for the whole of the UK. -We have applied a widely used approach, quantile mapping, to the data. Specifically, we have used *non-parametric quantile mapping using empirical quantiles* as available in the `Qmap` package. -Because the data is so large, we have applied this bias correction to the UK broked down into regions, with Scotland brokedn down into further regions (see `R/LCAT/Region.Refs.csv`) +This is an example notebook for the assessment of bias corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'. -We will now: +**Input data** -- Assess the bias correction using some of the segments -- Process the data back to geotiff -- Either process as monthly data or as UK wide rasters (maybe just write them seperately) and av across runs +This script requires the following data: -The data is within `ClimateData/Debiased/R/QuantileMapping` and is in RDS format, with each object containing a list. 
+- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction +- 'obs.val' - as above, for the *validation* period +- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period +- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period +- 'cpm.val.raw' - the raw (uncorrected) data for the *valibration* period +- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *valibration* period +- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional) +- 'cpm.proj.radj' - the adjusted (bias-corrected) data for the *future/projected* period (optional) -The objects within this R list are as follows: -- 't.obs': transposed observation df -- 't.cal': transposed calibration df -- 't.proj': transposed projection df (included the validation period) -- 'qm1.hist.a' - bias corrected values for the historical period, values fitted with linear interpolation -- 'qm1.hist.b' - bias corrected values for the historical period, values fitted with tricubic interpolation -- 'qm1.proj.a' - bias corrected values for the validation/projection period, values fitted with linear interpolation -- 'qm1.proj.b' - bias corrected values for the validation/projection period, values fitted with tricubic interpolation +The data is required in raster format and dataframe formats -## **1. Bias Correction Assessment: trends** +**Calibration vs Validation dates** -### **London - tasmax = Run 08** +The calibration period runs between 01-12-1980 to the day prior to 01-12-2010 +The validation period runs between 01-12-2010 to the day prior to 01-12-2020 -Using the London region (UKI) as this is the smallest -- not this is the same regional area as the 'three.cities' crops but cut to shapefile edges rather than the square grid +```{r data loading, include=FALSE} + +#This chunk attempts to apply the conversion to python output data to a form that this script will also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script? + +dd <- "/mnt/vmfileshare/ClimateData/" #Data directoy of all data used in this script + +input <- "RDS" #Either df or raster -- R outputs are a group of dfs in list form saved as an RDS, python input is a raster +city <- "Glasgow" +var <- "tasmax" +runs <- c("05", "06", "07", "08") +if(input=="raster"){ + +####### PYTHON INPUTS HERE ###### + # This script uses both raster data and the raw data + # This script uses Lists to group everything by runs + # Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) + # .nc and .tif files can be read with rast("path/to/file.nc") + # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files + #dfs are assumed to be cells x time + +} + + } else if(input=="RDS"){ + ### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved to a different script. 
+ ## Load a source raster to extract the crs + ## Also to add needs to be changing the names of the dfs to make then same acorss all methods + r <- list.files(paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/")) + r <- r[1] + rp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/", r) + rast <- rast(rp) + + crs <- crs(rast) + + ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). + rd <- "Debiased/R/QuantileMapping/three.cities/" + + files <- list.files(paste0(dd,rd,city),full.names=T) + files.v <- files[grepl(var, files)] + + allruns <- lapply(files.v, readRDS) + + names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) + names(allruns) <- names + + obs.val.df.L <- lapply(allruns, function(L){t(L[["t.obs"]])}) #This object stored in the results transposed for applying bias correction - maybe look at adding this to the bias correction script rather than here? + obs.cal.df.L <- lapply(allruns, function(L){L[["val.df"]]}) + + cpm.cal.raw.df.L <- lapply(allruns, function(L){t(L[["t.cal"]])}) + + #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) + cpm.val.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + cpm.val.raw.df <- proj[,1:val.end.date] + }) + + cpm.proj.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] + }) + + cpm.cal.adj.df.L <- lapply(allruns, function(L){ + adj <- as.data.frame(t(L[["qm1.hist"]])) + }) + + cpm.val.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + proj[,1:val.end.date] + }) + + cpm.proj.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + proj[,val.end.date:ncol(proj)] + }) + + ## Convert to rasters --requires creation of x and y cols from row names +``` + +#here - sorting out inputs and rasters in single chunk ```{r} + + df.rL <- lapply(runs, function(i){ + L <- allruns[[i]] + lapply(L, function(x){ + df <- t(x) + df <- as.data.frame(df) + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df) + L <- df.rL[[i]] + lapply(L, function(x){ + r <- rast(x, type="xyz") + crs(r) <- crs + return(r) + }) + }) + + names(df.rL) <- runs + +names(rasts) <- runs + + } else { + print("Invalid input") +} -runs <- c("Run05", "Run06", "Run07", "Run08") -London.allruns <- lapply(runs, function(i){ - rds <- paste0(dd,"/Debiased/R/QuantileMapping/resultsL",i,"_UKI_tasmax.RDS") - readRDS(rds)}) -names(London.allruns) <- runs ``` + +## **1. 
Bias Correction Assessment: trends** + + +### **London - tasmax = Run 08** + +Using the London region (UKI) as this is the smallest -- not this is the same regional area as the 'three.cities' crops but cut to shapefile edges rather than the square grid + Load in Hads validation data (So this can be run for all of the LCAT data, I'm going to read in the whole HADs files for the calibration years) @@ -97,40 +201,6 @@ The next set of chunks visualise the data This next chunk converts the dfs back to raster with the correct CRS ```{r convert to df and raster} -## Load a source raster to extract the crs -r <- list.files(paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/")) -r <- r[1] -rp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/", r) -rast <- rast(rp) - -crs <- crs(rast) - -## Convert from matrix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise -London.df.rL <- lapply(runs, function(i){ - L <- London.allruns[[i]] - lapply(L, function(x){ - df <- t(x) - df <- as.data.frame(df) - rn <- row.names(df) #The rownames were saves as x_y coordinates - xi <- gsub("_.*", "", rn) - yi <- gsub(".*_", "", rn) - xy <- data.frame(x = xi, y = yi) - df <- cbind(xy, df)}) - }) - -names(London.df.rL) <- runs - -## Convert to rasters -London.rasts <- lapply(runs, function(i){ - L <- London.df.rL[[i]] - lapply(L, function(x){ - r <- rast(x, type="xyz") - crs(r) <- crs - return(r)}) -}) - -names(London.rasts) <- runs - ``` diff --git a/R/comparing-r-and-python/Reviewing-bias-correction-methods/WIP Assessing bias corrected data.Rmd b/R/comparing-r-and-python/Reviewing-bias-correction-methods/WIP Assessing bias corrected data.Rmd deleted file mode 100644 index e21d80ac..00000000 --- a/R/comparing-r-and-python/Reviewing-bias-correction-methods/WIP Assessing bias corrected data.Rmd +++ /dev/null @@ -1,1218 +0,0 @@ ---- -title: "WIP Assessing bias corrected data" -author: "Ruth C E Bowyer" -output: - github_document: - date: "`r format(Sys.Date())`" ---- - - -```{r libs and setup, message=FALSE, warning=F} -rm(list=ls()) - -knitr::opts_knit$set(root.dir="/Volumes/vmfileshare/ClimateData/") - -library(terra) -library(sp) -library(tidyverse) -#library(exactextractr) -#library(dplyr) -library(ggplot2) - -dd <- "/Volumes/vmfileshare/ClimateData/" - -``` - - -## **0. About** - -Assessing the bias correction applied using the ```cmethods``` package. - -Five methods have been applied in this interim dataset on run 1 of the UKCP CPM data. - -### **Delta method** (delta) - -**cemthods** implements Delta Method based on: Beyer, R. and Krapp, M. and Manica, A.: An empirical evaluation of bias correction methods for palaeoclimate simulations (https://doi.org/10.5194/cp-16-1493-2020), which describes the methods as follows: - -*The delta method consists of adding the difference between past and present-day simulated climate to present-day observed climate. As such, the delta method assumes that local (i.e. grid-cell-specific) model biases are constant over time (Maraun and Widmann, 2018). For temperature variables (including terrestrial and marine mean annual temperatures and terrestrial temperature of the warmest and coldest months considered here), the bias in a geographical location x is given by the difference between present-day observed and simulated temperature* - -*Precipitation is bounded below by zero and covers different orders of magnitude across different regions. 
A multiplicative rather than additive bias correction is therefore more common when applying the delta method for precipitation, which corresponds to applying the simulated relative change to the observations (Maraun and Widmann, 2018).* - -### **Linear scaling** (linear) - -### **Variance scaling** (variance) - -### **Quantile Mapping method** (QM) - -### **Quantile Delta method** (QDM) - -## **0. Raw data** - -Comparing to the 'Raw' (but reprojected CPM data) -- cropped to Scotland extent (cropping-CPM-to-Scotland.R) - -```{r load raw data} - -# NEEDS TO BE UPDATED WHEN CAN COPY THE FILES TO THE VM -# p <- paste0(dd, "Interim/Cropped_UKCPM/") -p <- "~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/tempdata/" -files <- list.files(p) -raw.files.p <- paste0(p, files) -#raw.dat <- lapply(raw.files.p, rast) - -historic.raw.fs <- raw.files.p[c(1:7200)] -proj.raw.fs <- raw.files.p[c(7201:length(files))] - -``` - - - -##**1. Bias corrected data ** - -Load and check files - -```{r load data 1} - -# Each method is a sep file per decade -# Create a character vector listing the methods - -p <- paste0(dd, "Debiased/tasmax/") -f <- list.files(p) -i <- c("debiased_delta_method_result", "debiased_linear_scaling_result", "debiased_quantile_delta_mapping", "debiased_quantile_mapping", "debiased_variance_scaling") - -# Returns a list of stacked rasters (one for each method) -debiased_data <- lapply(i, function(x){ - files <- f[grepl(x, f)] - files.p <- paste0(p, files) - #Reads in as list of Rasters - dat <- lapply(files.p, rast) - #Load all together so one raster per method - files.r <- rast(dat) -}) - -names(debiased_data) <- i - -``` - -## **2. Assessing the bias correction methods** - -### **2a. Time series** - -Check the time series over the whole dataset makes sense - -```{r} - -# Code to create a character vector ensuring the days are read in in the correct order - -ds <- rep(01:30, 12) -ds <- ifelse(ds<10, paste0(0,ds),paste(ds)) -ms <- rep(01:12, each=30) -ms <- ifelse(ms<10, paste0(0,ms), paste(ms)) -ys <- rep(c(2020:2040, 2060:2080), each=360) -dmy <- paste0(ys,"-",ms,"-",ds) -dmy2<- dmy[which(dmy=="2020-12-01"):which(dmy=="2080-11-30")] -rem <- which(dmy2=="2040-12-01"):which(dmy2=="2060-11-30") -dmy_dash <- dmy2[-rem] - -dmy_nodash <- gsub("-","", dmy_dash) - -``` - -#### **2ai. 
Raw CPM ** - -Projected time series - -```{r} - -#Reorder the file list so they read in chronologically -##Create a df with a numeric vector for sorting by extracting the day -i <- numeric() - -for (x in 0:39){ - i <- c(i,(x*360)+1) -} - -ii <- numeric() - -for (x in 1:40){ - ii <- c(ii,(x*360)) -} - -dd <- rep(1:360, 40) - -dmy.i<- rep(dmy_nodash[i], each=360) -dmy.ii <- rep(dmy_nodash[ii], each=360) - -ordered <- paste0("~/Library/CloudStorage/OneDrive-TheAlanTuringInstitute/tempdata/tasmax_rcp85_land-cpm_uk_2.2km_01_day_", - dmy.i, - "-", dmy.ii, - "_", dd, "_cropped.tif") -``` - -Setting cache = T in this chunk and subsequent where runtime is long so subsequent running is quicker - -```{r cache = T} - -#Read in order raw data and convert to data frame -proj.raw.df <- lapply(ordered, function(x){ - r <- rast(x) - rdf <- as.data.frame(r) -}) - -proj.raw.df2 <- proj.raw.df %>% reduce(cbind) -``` - -```{r} - -#Add in xycoords for latter comparisons and checks - -## Get cell number from single raster -r <- rast(ordered[[1]]) -cells <- cells(r) - -ukcp_c_xy <- sapply(cells, function(i){xyFromCell(r, i)}) -ukcp_c_xy <- as.data.frame(t(ukcp_c_xy)) -names(ukcp_c_xy) <- c("x", "y") -proj.raw.df.xy <- cbind(ukcp_c_xy, proj.raw.df2) - -``` - - -```{r create summary data frame} - -## Create a dataframe summarising the average tasmax for this area -x <- 3:ncol(proj.raw.df.xy) -proj.raw.df.g_ <- lapply(x, function(x){ - i <- proj.raw.df.xy[,x] - mean <- mean(i, na.rm=T) - sd <- sd(i, na.rm=T) - data.frame(row.names=names(proj.raw.df.xy)[x], - mean=mean, - sd.high=mean+sd, - sd.low=mean-sd) -}) - - -proj.raw.df.g <- proj.raw.df.g_ %>% purrr::reduce(rbind) - -proj.raw.df.g$dmy <- dmy_dash -``` - - -##### 'Raw' trend daily 2020 - 2040 - -```{r Raw trend 1} -proj.raw.df.g2020 <- proj.raw.df.g[which(proj.raw.df.g$dmy<2060),] - -ggplot(proj.raw.df.g2020) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - - - -##### 'Raw' trend daily 2060 - 2080 - -```{r Raw trend 2} -proj.raw.df.g2060 <- proj.raw.df.g[which(proj.raw.df.g$dmy>2059),] - -ggplot(proj.raw.df.g2060) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - -##### 'Raw' trend seasonal averages (full time series) - -```{r Raw trend seasonal} - -#Annotate season based on month index - -proj.raw.df.g$season <- ifelse(grepl("-12-|-01-|-02-", proj.raw.df.g$dmy), "Winter", - ifelse(grepl("-03-|-04-|-05-", proj.raw.df.g$dmy), "Spring", - ifelse(grepl("-06-|-07-|-08-", proj.raw.df.g$dmy), "Summer", "Autumn"))) - - -proj.raw.df.g$year <- as.numeric(sub("-.*", "", proj.raw.df.g$dmy)) - -#Create a season_year var than considers the same Winter season across 2 years -## i.e. 
- Jan 2021 is considered as Winter 2020 -proj.raw.df.g$season_year <- ifelse(proj.raw.df.g$season != "Winter"| grepl("-12-", proj.raw.df.g$dmy), - paste0(proj.raw.df.g$season, "_", proj.raw.df.g$year), paste0(proj.raw.df.g$season,"_", proj.raw.df.g$year-1)) - - -#Calculate seasonal mean and SD -seasonal.mean <- proj.raw.df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons - -``` - - -```{r Raw trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean - -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 19) -year.miss <- rep(2041:2059, each=4) - -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) - -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) - -dfg_sm <- dfg_sm[order(dfg_sm$year),] -``` - - -**'Raw' - seasonal** - -```{r Raw seasonal} - -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) - -``` - - -**'Raw' - Winter only** - -```{r Raw seasonal winter} - -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -**'Raw' - Summer only** - -```{r Raw seasonal summer} - -dfg_sm_s <- subset(dfg_sm, grepl("Summer", season_year)) - -ggplot(dfg_sm_s) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="darkgoldenrod", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="darkred", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("'Raw' - tasmax seasonal - Summer only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - -##### 'Raw' trend - Annual tasmax - -```{r Raw Annual trend} - -#Calculate seasonal mean and SD -annual.mean <- proj.raw.df.g %>% - group_by(year) %>% mutate(mean.annual = mean(mean), - sd.high.annual = mean.annual + sd(mean), - sd.low.annual = mean.annual - sd(mean)) - -#Remove daily vals to avoid confusion -annual.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -annual.mean <- distinct(annual.mean, year, .keep_all=T) #42 year vals - -#Remove 2020 and 2080 as not whole months in those years -annual.mean <- subset(annual.mean, year!=2020|year!=2060|year!=2080) - -#Add missing years as NA -annual.mean <- plyr::rbind.fill(annual.mean, - data.frame(year=c(2040:2060), - mean.annual=NA, - sd.low.annual=NA, - sd.high.annual=NA)) - -annual.mean <- 
annual.mean[order(annual.mean$year),] - -``` - -```{r Raw annual} - -ggplot(annual.mean) + - geom_ribbon(aes(year, ymin = sd.low.annual, ymax=sd.high.annual), - fill="firebrick1", alpha=0.5) + - geom_line(aes(year, y=mean.annual), color="firebrick4", group=1) + - theme_bw() + ylab("tasmax oC") + - ggtitle("'Raw' - tasmax annual") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -# HERE -Update the figs below as above -Add in annual fig for all -Check Summer 2080 again -.Rdata save 1104 - - -#### **2aii. Delta method** - - -```{r} -delta <- debiased_data$debiased_delta_method_result -nlyr(delta) #14400 - correct number for 40Y * (12m*30d) - ie 14400 days - -``` - -```{r cache=T} -names <- paste0("tasmax_", dmy_dash) -df <- as.data.frame(delta, xy=T) - -n2 <- c("x", "y", names) -names(df) <- n2 -``` - -```{r} -#Mean and sd of by day calculated for plotting -df.g <- apply(df[c(3:ncol(df))], 2, function(i){ - mean <- mean(i, na.rm=T) - sd <- sd(i, na.rm=T) - data.frame(mean=mean, - sd.high=mean+sd, - sd.low=mean-sd) -}) - -df.g <- df.g %>% purrr::reduce(rbind) -row.names(df.g) <- names -df.g$dmy <- dmy_dash -``` - -##### Delta trend daily 2020 - 2040 - -```{r delta trend 1} -df.g2020 <- df.g[which(df.g$dmy<2060),] - -ggplot(df.g2020) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - - -##### Delta trend daily 2060 - 2080 - -```{r delta trend 2} -df.g2060 <- df.g[which(df.g$dmy>2059),] - -ggplot(df.g2060) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - -##### Delta trend seasonal averages (full time series) - -```{r delta trend seasonal} - -#Annotate season based on month index -df.g$season <- ifelse(grepl("-12-|-1-|-2-", df.g$dmy), "Winter", - ifelse(grepl("-3-|-4-|-5-", df.g$dmy), "Spring", - ifelse(grepl("-6-|-7-|-8-", df.g$dmy), "Summer", "Autumn"))) - -df.g$year <- as.numeric(sub("-.*", "", df.g$dmy)) - -#Create a season_year var than considers the same Winter season across 2 years -## i.e. 
- Jan 2021 is considered as Winter 2020 -df.g$season_year <- ifelse(df.g$season != "Winter"| grepl("-12-", df.g$dmy), - paste0(df.g$season, "_", df.g$year), paste0(df.g$season,"_", df.g$year-1)) - - -#Calculate seasonal mean and SD -seasonal.mean <- df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons - -``` - - -```{r delta trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 20) -year.miss <- rep(2041:2060, each=4) -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) - -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) - -dfg_sm <- dfg_sm[order(dfg_sm$year),] -``` - - -**Delta - seasonal** - -```{r delta seasonal} - -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) - -``` - - -**Delta - Winter only** - -```{r delta seasonal winter} - -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -**Delta - Summer only** - -```{r delta seasonal summer} - -dfg_sm_s <- subset(dfg_sm, grepl("Summer", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="darkgoldenrod", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="darkred", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal - Summer only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -#### **2aiii. 
Linear scaling method** - -**cemthods** implements - - - -```{r} -linear <- debiased_data$debiased_linear_method_result -nlyr(linear) #14400 - correct number for 40Y * (12m*30d) - ie 14400 days - -``` - -```{r cache=T} -names <- paste0("tasmax_", dmy_dash) -df <- as.data.frame(linear, xy=T) - -n2 <- c("x", "y", names) -names(df) <- n2 -``` - -```{r} -#Mean and sd of by day calculated for plotting -df.g <- apply(df[c(3:ncol(df))], 2, function(i){ - mean <- mean(i, na.rm=T) - sd <- sd(i, na.rm=T) - data.frame(mean=mean, - sd.high=mean+sd, - sd.low=mean-sd) -}) - -df.g <- df.g %>% purrr::reduce(rbind) -row.names(df.g) <- names -df.g$dmy <- dmy_dash -``` - -##### linear trend daily 2020 - 2040 - -```{r linear trend 1} -df.g2020 <- df.g[which(df.g$dmy<2060),] - -ggplot(df.g2020) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - - - -##### linear trend daily 2060 - 2080 - -```{r linear trend 2} -df.g2060 <- df.g[which(df.g$dmy>2059),] - -ggplot(df.g2060) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - -##### linear trend seasonal averages (full time series) - -```{r linear trend seasonal} - -#Annotate season based on month index -df.g$season <- ifelse(grepl("-12-|-1-|-2-", df.g$dmy), "Winter", - ifelse(grepl("-3-|-4-|-5-", df.g$dmy), "Spring", - ifelse(grepl("-6-|-7-|-8-", df.g$dmy), "Summer", "Autumn"))) - -df.g$year <- as.numeric(sub("-.*", "", df.g$dmy)) - -#Create a season_year var than considers the same Winter season across 2 years -## i.e. 
- Jan 2021 is considered as Winter 2020 -df.g$season_year <- ifelse(df.g$season != "Winter"| grepl("-12-", df.g$dmy), - paste0(df.g$season, "_", df.g$year), paste0(df.g$season,"_", df.g$year-1)) - - -#Calculate seasonal mean and SD -seasonal.mean <- df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons - -``` - - -```{r linear trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 20) -year.miss <- rep(2041:2060, each=4) -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) - -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) - -dfg_sm <- dfg_sm[order(dfg_sm$year),] -``` - - -**linear - seasonal** - -```{r linear seasonal} - -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) - -``` - - -**linear - Winter only** - -```{r linear seasonal winter} - -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -**linear - Summer only** - -```{r linear seasonal winter} - ggtitle("linear - tasmax seasonal - Summer only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -#### **2aii. 
Delta method** - - -```{r} -delta <- debiased_data$debiased_delta_method_result -nlyr(delta) #14400 - correct number for 40Y * (12m*30d) - ie 14400 days - -``` - -```{r cache=T} -names <- paste0("tasmax_", dmy_dash) -df <- as.data.frame(delta, xy=T) - -n2 <- c("x", "y", names) -names(df) <- n2 -``` - -```{r} -#Mean and sd of by day calculated for plotting -df.g <- apply(df[c(3:ncol(df))], 2, function(i){ - mean <- mean(i, na.rm=T) - sd <- sd(i, na.rm=T) - data.frame(mean=mean, - sd.high=mean+sd, - sd.low=mean-sd) -}) - -df.g <- df.g %>% purrr::reduce(rbind) -row.names(df.g) <- names -df.g$dmy <- dmy_dash -``` - -##### Delta trend daily 2020 - 2040 - -```{r delta trend 1} -df.g2020 <- df.g[which(df.g$dmy<2060),] - -ggplot(df.g2020) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - - - -##### Delta trend daily 2060 - 2080 - -```{r delta trend 2} -df.g2060 <- df.g[which(df.g$dmy>2059),] - -ggplot(df.g2060) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - -##### Delta trend seasonal averages (full time series) - -```{r delta trend seasonal} - -#Annotate season based on month index -df.g$season <- ifelse(grepl("-12-|-1-|-2-", df.g$dmy), "Winter", - ifelse(grepl("-3-|-4-|-5-", df.g$dmy), "Spring", - ifelse(grepl("-6-|-7-|-8-", df.g$dmy), "Summer", "Autumn"))) - -df.g$year <- as.numeric(sub("-.*", "", df.g$dmy)) - -#Create a season_year var than considers the same Winter season across 2 years -## i.e. 
- Jan 2021 is considered as Winter 2020 -df.g$season_year <- ifelse(df.g$season != "Winter"| grepl("-12-", df.g$dmy), - paste0(df.g$season, "_", df.g$year), paste0(df.g$season,"_", df.g$year-1)) - - -#Calculate seasonal mean and SD -seasonal.mean <- df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons - -``` - - -```{r delta trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 20) -year.miss <- rep(2041:2060, each=4) -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) - -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) - -dfg_sm <- dfg_sm[order(dfg_sm$year),] -``` - - -**Delta - seasonal** - -```{r delta seasonal} - -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) - -``` - - -**Delta - Winter only** - -```{r delta seasonal winter} - -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -**Delta - Summer only** - -```{r delta seasonal winter} - -dfg_sm_s <- subset(dfg_sm, grepl("Summer", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal - Summer 2only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -#### **2aiii. 
Linear scaling method** - -**cemthods** implements - - - -```{r} -linear <- debiased_data$debiased_linear_method_result -nlyr(linear) #14400 - correct number for 40Y * (12m*30d) - ie 14400 days - -``` - -```{r cache=T} -names <- paste0("tasmax_", dmy_dash) -df <- as.data.frame(linear, xy=T) - -n2 <- c("x", "y", names) -names(df) <- n2 -``` - -```{r} -#Mean and sd of by day calculated for plotting -df.g <- apply(df[c(3:ncol(df))], 2, function(i){ - mean <- mean(i, na.rm=T) - sd <- sd(i, na.rm=T) - data.frame(mean=mean, - sd.high=mean+sd, - sd.low=mean-sd) -}) - -df.g <- df.g %>% purrr::reduce(rbind) -row.names(df.g) <- names -df.g$dmy <- dmy_dash -``` - -##### linear trend daily 2020 - 2040 - -```{r linear trend 1} -df.g2020 <- df.g[which(df.g$dmy<2060),] - -ggplot(df.g2020) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - - - -##### linear trend daily 2060 - 2080 - -```{r linear trend 2} -df.g2060 <- df.g[which(df.g$dmy>2059),] - -ggplot(df.g2060) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - -##### linear trend seasonal averages (full time series) - -```{r linear trend seasonal} - -#Annotate season based on month index -df.g$season <- ifelse(grepl("-12-|-1-|-2-", df.g$dmy), "Winter", - ifelse(grepl("-3-|-4-|-5-", df.g$dmy), "Spring", - ifelse(grepl("-6-|-7-|-8-", df.g$dmy), "Summer", "Autumn"))) - -df.g$year <- as.numeric(sub("-.*", "", df.g$dmy)) - -#Create a season_year var than considers the same Winter season across 2 years -## i.e. 
- Jan 2021 is considered as Winter 2020 -df.g$season_year <- ifelse(df.g$season != "Winter"| grepl("-12-", df.g$dmy), - paste0(df.g$season, "_", df.g$year), paste0(df.g$season,"_", df.g$year-1)) - - -#Calculate seasonal mean and SD -seasonal.mean <- df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons - -``` - - -```{r linear trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 20) -year.miss <- rep(2041:2060, each=4) -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) - -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) - -dfg_sm <- dfg_sm[order(dfg_sm$year),] -``` - - -**linear - seasonal** - -```{r linear seasonal} - -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) - -``` - - -**linear - Winter only** - -```{r linear seasonal winter} - -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -**linear - Summer only** - -```{r linear seasonal winter} - -dfg_sm_s <- subset(dfg_sm, grepl("Summer", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("linear - tasmax seasonal - Summer 2only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - -#### **2aii. Delta method** - -**cemthods** implements Delta Method based on: Beyer, R. and Krapp, M. and Manica, A.: An empirical evaluation of bias correction methods for palaeoclimate simulations (https://doi.org/10.5194/cp-16-1493-2020), which describes the methods as follows: - -*The delta method consists of adding the difference between past and present-day simulated climate to present-day observed climate. As such, the delta method assumes that local (i.e. grid-cell-specific) model biases are constant over time (Maraun and Widmann, 2018). 
For temperature variables (including terrestrial and marine mean annual temperatures and terrestrial temperature of the warmest and coldest months considered here), the bias in a geographical location x is given by the difference between present-day observed and simulated temperature* - -*Precipitation is bounded below by zero and covers different orders of magnitude across different regions. A multiplicative rather than additive bias correction is therefore more common when applying the delta method for precipitation, which corresponds to applying the simulated relative change to the observations (Maraun and Widmann, 2018).* - - -```{r} -delta <- debiased_data$debiased_delta_method_result -nlyr(delta) #14400 - correct number for 40Y * (12m*30d) - ie 14400 days - -``` - -```{r cache=T} -names <- paste0("tasmax_", dmy_dash) -df <- as.data.frame(delta, xy=T) - -n2 <- c("x", "y", names) -names(df) <- n2 -``` - -```{r} -#Mean and sd of by day calculated for plotting -df.g <- apply(df[c(3:ncol(df))], 2, function(i){ - mean <- mean(i, na.rm=T) - sd <- sd(i, na.rm=T) - data.frame(mean=mean, - sd.high=mean+sd, - sd.low=mean-sd) -}) - -df.g <- df.g %>% purrr::reduce(rbind) -row.names(df.g) <- names -df.g$dmy <- dmy_dash -``` - -##### Delta trend daily 2020 - 2040 - -```{r delta trend 1} -df.g2020 <- df.g[which(df.g$dmy<2060),] - -ggplot(df.g2020) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - - - -##### Delta trend daily 2060 - 2080 - -```{r delta trend 2} -df.g2060 <- df.g[which(df.g$dmy>2059),] - -ggplot(df.g2060) + - geom_ribbon(aes(x = 1:length(dmy), ymin = sd.low, ymax=sd.high), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(dmy), y=mean), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax daily") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day") - -``` - -##### Delta trend seasonal averages (full time series) - -```{r delta trend seasonal} - -#Annotate season based on month index -df.g$season <- ifelse(grepl("-12-|-1-|-2-", df.g$dmy), "Winter", - ifelse(grepl("-3-|-4-|-5-", df.g$dmy), "Spring", - ifelse(grepl("-6-|-7-|-8-", df.g$dmy), "Summer", "Autumn"))) - -df.g$year <- as.numeric(sub("-.*", "", df.g$dmy)) - -#Create a season_year var than considers the same Winter season across 2 years -## i.e. 
- Jan 2021 is considered as Winter 2020 -df.g$season_year <- ifelse(df.g$season != "Winter"| grepl("-12-", df.g$dmy), - paste0(df.g$season, "_", df.g$year), paste0(df.g$season,"_", df.g$year-1)) - - -#Calculate seasonal mean and SD -seasonal.mean <- df.g %>% - group_by(season_year) %>% mutate(mean.seasonal = mean(mean), - sd.high.seasonal = mean.seasonal + sd(mean), - sd.low.seasonal = mean.seasonal - sd(mean)) - -#Remove daily vals to avoid confusion -seasonal.mean[c("mean", "sd.high", "sd.low")] <- NULL - -#Remove duplicate values -seasonal.mean <- distinct(seasonal.mean, season_year, .keep_all=T) #160 seasons - -``` - - -```{r delta trend seasonal} - -#Add in missing years for clearer plotting of trend -dfg_sm <- seasonal.mean -seas.miss <- rep(c("Spring", "Summer", "Autumn", "Winter"), 20) -year.miss <- rep(2041:2060, each=4) -add.s.y <- paste0(seas.miss, "_", year.miss) -add.s.y <- c("Winter_2040", add.s.y) - -dfg_sm <- plyr::rbind.fill(dfg_sm, - data.frame(year=c(2040, year.miss), - season_year=add.s.y, - mean.seasonal=NA, - sd.low.seasonal=NA, - sd.high.seasonal=NA)) - -dfg_sm <- dfg_sm[order(dfg_sm$year),] -``` - - -**Delta - seasonal** - -```{r delta seasonal} - -ggplot(dfg_sm) + - geom_ribbon(aes(x = 1:length(season_year), ymin = sd.low.seasonal, ymax=sd.high.seasonal), color="lightgrey", alpha=0.5) + - geom_line(aes(x=1:length(season_year), y=mean.seasonal), color="cornflowerblue", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal") + - xlab("Season - Year") + - scale_x_discrete(labels = c(dfg_sm$season_year)) + - theme(axis.text.x = element_text(angle = 270, vjust = 0.5, hjust=1)) - -``` - - -**Delta - Winter only** - -```{r delta seasonal winter} - -dfg_sm_w <- subset(dfg_sm, grepl("Winter", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal - Winter only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` - - -**Delta - Summer only** - -```{r delta seasonal winter} - -dfg_sm_s <- subset(dfg_sm, grepl("Summer", season_year)) - -ggplot(dfg_sm_w) + - geom_ribbon(aes(year, ymin = sd.low.seasonal, ymax=sd.high.seasonal), - fill="lightblue3", alpha=0.5) + - geom_line(aes(year, y=mean.seasonal), color="lightblue4", group=1) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Delta - tasmax seasonal - Summer 2only") + - xlab("Year") + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) - -``` diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD new file mode 100644 index 00000000..11752d27 --- /dev/null +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD @@ -0,0 +1,797 @@ +--- +title: "Bias correction assessment" +author: "Ruth Bowyer" +date: "`r format(Sys.Date())`" +output: + html_document: + theme: cosmo + toc: TRUE + toc_float: TRUE + toc_depth: 4 + code_folding: hide + df_print: paged +--- + + +```{r libs and setup, message=FALSE, warning=F} +rm(list=ls()) + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") + +library(ggplot2) +library(terra) +library(tmap) #pretty maps +library(RColorBrewer) +library(tidyverse) +library(kableExtra) + +``` + + +## **0. 
About**
+
+This is an example notebook for the assessment of bias-corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'.
+
+**Input data**
+
+This script requires the following data:
+
+- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction
+- 'obs.val' - as above, for the *validation* period
+- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period
+- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period
+- 'cpm.val.raw' - the raw (uncorrected) data for the *validation* period
+- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *validation* period
+- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional)
+- 'cpm.proj.adj' - the adjusted (bias-corrected) data for the *future/projected* period (optional)
+
+The data is required in both raster and data frame formats.
+
+**Calibration vs Validation dates**
+
+The calibration period runs from 01-12-1980 to the day prior to 01-12-2010
+The validation period runs from 01-12-2010 to the day prior to 01-12-2020
+
+```{r data loading, include=FALSE}
+
+#This chunk converts the python output data to a form that this script can also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script?
+
+dd <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script
+
+input <- "RDS" #Either df or raster -- R outputs are a group of dfs in list form saved as an RDS, python input is a raster
+city <- "Glasgow"
+var <- "tasmax"
+runs <- c("05", "06", "07", "08")
+
+if(input=="raster"){
+
+####### PYTHON INPUTS HERE ######
+  # This script uses both raster data and the raw data
+  # This script uses Lists to group everything by runs
+  # Therefore what is required here is to create a list object for each of the sets of data listed above, where the list items are the rasters or dataframes by run (i.e. each level of the list is a run)
+  # .nc and .tif files can be read with rast("path/to/file.nc")
+  # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop over the files using lapply
+  #dfs are assumed to be cells x time
+
+} else if(input=="RDS"){
+    ### This R bit is a bit crazy because of the format of the output from the bias correction - at some point to be cleaned up and moved to a different script.
+    ## Load a source raster to extract the crs
+    ## Also to do: change the names of the dfs so they are the same across all methods
+    r <- list.files(paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/"))
+    r <- r[1]
+    rp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/", r)
+    rast <- rast(rp)
+
+    crs <- crs(rast)
+
+    ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed).
+    rd <- "Debiased/R/QuantileMapping/three.cities/"
+
+    files <- list.files(paste0(dd,rd,city),full.names=T)
+    files.v <- files[grepl(var, files)]
+
+    allruns <- lapply(files.v, readRDS)
+
+    names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v)
+    names(allruns) <- names
+
+    obs.val.df.L <- lapply(allruns, function(L){t(L[["t.obs"]])}) #This object is stored in the results transposed for applying the bias correction - maybe look at handling this in the bias correction script rather than here?
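+
+    ## For reference (based on how these list elements are used below): 't.obs' and 't.cal' are
+    ## stored with grid cells in columns, hence the t() calls; 'val.df' holds the observed series
+    ## for the validation period; 'qm1.hist' is the bias-adjusted calibration period; and
+    ## 't.proj' / 'qm1.val.proj' hold the validation and projection periods together, which is
+    ## why they are split at the first "20201201-" column further down.
+    ## NB: 't.obs' is the observation used to fit the correction (i.e. the calibration period),
+    ## so the obs.val/obs.cal naming in this chunk is worth checking against the input list in section 0.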
+ obs.cal.df.L <- lapply(allruns, function(L){L[["val.df"]]}) + + cpm.cal.raw.df.L <- lapply(allruns, function(L){t(L[["t.cal"]])}) + + #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) + cpm.val.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + cpm.val.raw.df <- proj[,1:val.end.date] + }) + + cpm.proj.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] + }) + + cpm.cal.adj.df.L <- lapply(allruns, function(L){ + adj <- as.data.frame(t(L[["qm1.hist"]])) + }) + + cpm.val.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + proj[,1:val.end.date] + }) + + cpm.proj.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + proj[,val.end.date:ncol(proj)] + }) + + ## Convert to rasters --requires creation of x and y cols from row names +``` + +#here - sorting out inputs and rasters in single chunk +```{r} + + df.rL <- lapply(runs, function(i){ + L <- allruns[[i]] + lapply(L, function(x){ + df <- t(x) + df <- as.data.frame(df) + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df) + L <- df.rL[[i]] + lapply(L, function(x){ + r <- rast(x, type="xyz") + crs(r) <- crs + return(r) + }) + }) + + names(df.rL) <- runs + +names(rasts) <- runs + + } else { + print("Invalid input") +} + + + +``` + + +## **1. Bias Correction Assessment: trends** + + +### **London - tasmax = Run 08** + +Using the London region (UKI) as this is the smallest -- not this is the same regional area as the 'three.cities' crops but cut to shapefile edges rather than the square grid + +Load in Hads validation data +(So this can be run for all of the LCAT data, I'm going to read in the whole HADs files for the calibration years) + +**The calibration period is 2009-12-01 to 2019-11-30 to relate to the CPM month grouping** + +Hads data were also cropped to the regional files for the calibration years - some of the dates might need to be added from the observation (or just be ignored for ease) + +```{r} +fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") +f <- list.files(fp) +v <- "tasmax" +reg <- "UKI" +f <- f[grepl("2010_2020", f)&grepl(v,f)&grepl(reg, f)] + +obs.val.df <- read.csv(paste0(fp,f)) # Starts here from 2010 - 01 -01 -- because for the BC I removed these vals to align with the cpm years we're missing the month of Dec - so need to update the cpm data to reflect this in the assessment -- wont be a problem for other BC data + +``` + + +### **1b. Check trends** + +The next set of chunks visualise the data + +This next chunk converts the dfs back to raster with the correct CRS +```{r convert to df and raster} + + +``` + +#### **Raster vis comparison** + +Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days +(Note: I'm just plotting the bias corrected with linear interpolation so as to overwhelm with plots) + +##### Fig. 
*Day 1 - 1980-12-01* + +```{r, fig.show="hold", out.width="33%"} +tm_shape(London.rasts$Run05$t.obs[[1]]) + tm_raster(title="Observation, 1980-12-01") #Obviously just one call of the observation +tm_shape(London.rasts$Run05$t.cal[[1]]) + tm_raster(title="Calibration, Run 05, Raw 1980-12-01") +tm_shape(London.rasts$Run06$t.cal[[1]]) + tm_raster(title="Calibration, Run 06, Raw 1980-12-01") +tm_shape(London.rasts$Run07$t.cal[[1]]) + tm_raster(title="Calibration, Run 07, Raw 1980-12-01") +tm_shape(London.rasts$Run08$t.cal[[1]]) + tm_raster(title="Calibration, Run 08, Raw 1980-12-01") +tm_shape(London.rasts$Run05$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 05, BC 1980-12-01") +tm_shape(London.rasts$Run06$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 06, BC 1980-12-01") +tm_shape(London.rasts$Run07$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 07, BC 1980-12-01") +tm_shape(London.rasts$Run08$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 08, BC 1980-12-01") + + +``` + +##### Fig. *Day 2 - 1991-06-01* + +Just to note I was so suprised by how much lower the observation data was for this raster I loaded the raw HADs to check (in resampled_2.2km/tasmax and the original 1km grid it does reflect it - it just seems very low) + +```{r, fig.show="hold", out.width="33%"} +tm_shape(London.rasts$Run05$t.obs[[3781]]) + tm_raster(title="Observation, 1991-06-01") #Obviously just one call of the observation +tm_shape(London.rasts$Run05$t.cal[[3781]]) + tm_raster(title="Calibration, Run 05, Raw 1991-06-01") +tm_shape(London.rasts$Run06$t.cal[[3781]]) + tm_raster(title="Calibration, Run 06, Raw 1991-06-01") +tm_shape(London.rasts$Run07$t.cal[[3781]]) + tm_raster(title="Calibration, Run 07, Raw 1991-06-01") +tm_shape(London.rasts$Run08$t.cal[[3781]]) + tm_raster(title="Calibration, Run 08, Raw 1991-06-01") +tm_shape(London.rasts$Run05$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 05, BC 1991-06-01") +tm_shape(London.rasts$Run06$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 06, BC 1991-06-01") +tm_shape(London.rasts$Run07$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 07, BC 1991-06-01") +tm_shape(London.rasts$Run08$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 08, BC 1991-06-01") + +``` + + + +##### Fig. 
*Day 3 - 2000-08-01* + + +```{r, fig.show="hold", out.width="33%"} +tm_shape(London.rasts$Run05$t.obs[[7081]]) + tm_raster(title="Observation, 2000-08-01") #Obviously just one call of the observation +tm_shape(London.rasts$Run05$t.cal[[7081]]) + tm_raster(title="Calibration, Run 05, Raw 2000-08-01") +tm_shape(London.rasts$Run06$t.cal[[7081]]) + tm_raster(title="Calibration, Run 06, Raw 2000-08-01") +tm_shape(London.rasts$Run07$t.cal[[7081]]) + tm_raster(title="Calibration, Run 07, Raw 2000-08-01") +tm_shape(London.rasts$Run08$t.cal[[7081]]) + tm_raster(title="Calibration, Run 08, Raw 2000-08-01") +tm_shape(London.rasts$Run05$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 05, BC 2000-08-01") +tm_shape(London.rasts$Run06$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 06, BC 2000-08-01") +tm_shape(London.rasts$Run07$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 07, BC 2000-08-01") +tm_shape(London.rasts$Run08$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 08, BC 2000-08-01") + +``` + +#### **Calibration period - annual trends** + + +```{r} +#Returns a list of dfs in handy format for graphing +London.dfg.rL <- lapply(runs, function(i){ + L <- London.df.rL[[i]] + names(L)[1:3] <- c("obs", "cal", "proj") + dfg <- lapply(names(L), function(ii){ + dfi <- L[[ii]] + x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y + #Calc mean and sd + dfx <- lapply(x, function(x){ + y <- dfi[,x] + mean <- mean(y, na.rm=T) + sd <- sd(y, na.rm=T) + dfr <- data.frame(mean=mean, + sd.high=mean+sd, + sd.low=mean-sd) + names(dfr) <- paste0(ii, ".", names(dfr)) + dfr$day <- names(dfi)[x] + return(dfr) + }) + + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + names(dfg) <- c("obs.daymeans", "raw.cal.daymeans", + "raw.proj.daymeans", "bc.a.cal.daymeans", + "bc.b.cal.daymeans", "bc.a.proj.daymeans", + "bc.b.proj.daymeans") + + return(dfg) +}) + +names(London.dfg.rL) <- runs +``` + + +```{r} +#Create a df for all of the runs to plot +##Add a day index to align the cal and obs + +London.dfg.calp.L <- lapply(runs, function(i){ + dfg <- London.dfg.rL[[i]] + dfg.calp <- dfg[c("obs.daymeans", "raw.cal.daymeans", + "bc.b.cal.daymeans", "bc.a.cal.daymeans")] + + dfg.calp <- lapply(dfg.calp, function(x){ + x$dayi <- 1:nrow(x) + x$day<- NULL + return(x) + }) + + + dfg.calp <- dfg.calp %>% reduce(merge, "dayi") + dfg.calp$Run <- i + return(dfg.calp)}) + +names(London.dfg.calp.L) <- runs + +London.dfg.calp <- London.dfg.calp.L %>% reduce(rbind) + +``` + +```{r} + +London.dfg.calp_m <- reshape2::melt(London.dfg.calp, id=c("dayi", "Run")) #create long df for plotting multiple lines + +London.dfg.calp_mm <- London.dfg.calp_m[grepl("mean", London.dfg.calp_m$variable),] #For easy vis, only keep mean vals +``` + +#### Fig. 
Calibration period - annual mean
+
+```{r Historic trend 1}
+
+ggplot(London.dfg.calp_mm, aes(dayi, value, group=variable, colour=variable)) +
+  geom_line(alpha=0.7) +
+  facet_wrap(.~Run) +
+  theme_bw() + ylab("Av daily max temp oC") +
+  ggtitle("Tasmax Historic trends") +
+  scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day, 1980.12.01 - 2009.12.01") +
+  scale_color_brewer(palette="Set1", name="Model", labels=c("Obs (Hads)", "Raw CPM", "BC CPM 1", "BC CPM 2"))
+
+```
+
+#### **Seasonal trends - Calibration period**
+
+Seasons are annotated based on the month index - the dates have different formats depending on the input data (i.e. HADs vs CPM), so the relevant parts are pulled out and adjusted separately.
+
+```{r}
+
+seasonal.means <- lapply(runs, function(r){
+  dfg <- London.dfg.rL[[r]]
+  #Hads/obs df
+  obs.daymeans.df <- dfg$obs.daymeans
+
+  x <- obs.daymeans.df$day
+  obs.daymeans.df$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x),
+                     "Winter",
+                  ifelse(grepl("0331_|0430_|0531_", x), "Spring",
+                         ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn")))
+
+#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run
+
+  #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001
+  year <- gsub("^[^_]*_", "", x)
+  year <- as.numeric(substr(year, 1,4))
+  obs.daymeans.df$season_year <- ifelse(grepl("0131_|0228_|0229_", x),
+                                        paste0(year-1, obs.daymeans.df$season),
+                                        paste0(year, obs.daymeans.df$season))
+  # Mutate to a seasonal mean df
+  obs.seasonal.mean.df <- aggregate(obs.daymeans.df[[1]], list(obs.daymeans.df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x)))
+  obs.seasonal.mean.df <- data.frame(season_year=obs.seasonal.mean.df$Group.1,
+                             seasonal.mean=obs.seasonal.mean.df$x[,"seasonal.mean"],
+                             sd.high.seasonal = obs.seasonal.mean.df$x[,"sd.high.seasonal"],
+                             sd.low.seasonal = obs.seasonal.mean.df$x[,"sd.low.seasonal"])
+
+
+  #Grouping variable for later vars
+  obs.seasonal.mean.df$model <- "obs"
+
+
+  dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans",
+                        "bc.a.cal.daymeans"), function(i){
+    df <- dfg[[i]]
+    x <- df$day
+    x <- gsub(".*_", "", x)
+    x <- as.numeric(x)
+    #The CPM days are consecutive 1 - 360 by year
+    df$season <- ifelse(x<91, "Winter",
+                  ifelse(x<181, "Spring",
+                         ifelse(x<271, "Summer", "Autumn")))
+
+    #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001
+    year <- gsub(".*day_", "", df$day)
+    year <- as.numeric(substr(year, 1,4))
+    df$season_year <- ifelse(x>29&x<91,
+                             paste0(year-1, df$season),
+                             paste0(year, df$season))
+
+    # Mutate to a seasonal mean -- can't get this to run in tidyverse within the loop as the column indexing doesn't work, so:
+    df2 <- aggregate(df[[1]], list(df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x)))
+
+    df2 <- data.frame(season_year=df2$Group.1,
+                      seasonal.mean=df2$x[,"seasonal.mean"],
+                      sd.high.seasonal = df2$x[,"sd.high.seasonal"],
+                      sd.low.seasonal = df2$x[,"sd.low.seasonal"])
+
+    df2$model <- gsub(".daymeans","",i)
+
+    return(df2)})
+
+  dff <- c(list(obs.seasonal.mean.df), dfg.seasonal.mean) %>% reduce(rbind)
+  dff$Run <- r
+  return(dff) 
+}) + +names(seasonal.means) <- runs + +seasonal.means.df <- seasonal.means %>% reduce(rbind) + +``` + +#### Fig. Calibration period - seasonal mean + +```{r} + +ggplot(seasonal.means.df, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +##### *Summer only* + +```{r Raw seasonal winter} + +dfg_sm<- subset(seasonal.means.df, grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` +It looks purple because the two bc methods aren't revealing much difference so subsetting to just one instead + + +```{r} + +dfg_sm<- subset(seasonal.means.df, !grepl("bc.b.cal", model)&grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + +#### *Annual trends - seasonal max* + +For tasmax - grouping to season and calculating the seasonal maxima vals (i.e. rather than means above) + +```{r} + +#Convert to max, out put a df in easy fig format +London.dfg.max <- lapply(runs, function(r){ + L <- London.df.rL[[r]] + names(L)[1:3] <- c("obs", "cal", "proj") + dfg <- lapply(names(L), function(ii){ + dfi <- L[[ii]] + x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y + #Calc maxima of the + dfx <- lapply(x, function(x){ + xx <- dfi[,x] + data.frame(max=max(xx, na.rm=T), day= names(dfi)[x]) + }) + + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + names(dfg) <- paste0(names(L), ".max") + return(dfg) +}) + +names(London.dfg.max) <- runs + +seasonal.max.cal <- lapply(runs, function(r){ + dfg <- London.dfg.max[[r]] + #Hads/obs df + df1 <- dfg$obs.max + x <- df1$day + df1$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub("^[^_]*_", "", x) + year <- as.numeric(substr(year, 1,4)) + df1$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, df1$season), + paste0(year, df1$season)) + # Mutate to a seasonal mean df + obs.seasonal.max.df <- aggregate(df1[[1]], list(df1[["season_year"]]), max) + #Grouping variable for later vars + obs.seasonal.max.df$model <- "obs" + + dfg.seasonal.max <- lapply(c("cal.max", "qm1.hist.a.max", + "qm1.hist.b.max"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", 
"", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, + paste0(year-1, df$season), + paste0(year, df$season)) + + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), max) + + df2$model <- gsub(".max","",i) + + return(df2)}) + + dff <- c(list(obs.seasonal.max.df), dfg.seasonal.max) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + +names(seasonal.max.cal) <- runs + +seasonal.maxima.df <- seasonal.max.cal %>% reduce(rbind) +names(seasonal.maxima.df) <- c("season_year", "max", "model", "Run") +``` + +#### Fig. Calibration period - seasonal max + +```{r} + +ggplot(seasonal.maxima.df, aes(season_year, max, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Max daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + +#### Fig. Calibration period - *Summer only* + +```{r} + +dfg_sm<- subset(seasonal.maxima.df, !grepl("qm1.hist.b", model)&grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Historic trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + +#### Create validaton df list + +Adding in the observational HADs data and aligning based on dates + +*Note* So as not to re-run the UK wide LCAT data processing, a workaround was added to the bias correction function used to group the obs data - this means that to align the validation cpm data we have to remove a month in the beginning + +```{r} + +#Extract validation period of raw and bias corrected CPM data +val.dfs <- lapply(runs, function(r){ + London.df <- London.df.rL[[r]] + cpm.val.dfs <- lapply(London.df[c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ + i <- grep("20191201-20201130_30", names(x))[1] + df <- x[,1:i] + }) + + #Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year + remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") + remove <- paste0(remove, collapse = "|") + + obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] + row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) + + val.dfs <- c(list(obs.val.df), cpm.val.dfs) + names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") + return(val.dfs) + }) + +names(val.dfs) <- runs +``` + + +#### *Validation period - annual trends - seasonal mean* + +(To be added) + +#### *Validation period - annual trends - seasonal max* + +(To be added) + +## **2. 
Bias Correction Assessment: Metrics**
+
+Using the validation data set for this.
+
+
+```{r}
+#Convert dfs to a vector
+val.dfs.v <- lapply(runs, function(r){
+  dfs <- val.dfs[[r]]
+  dfs2 <- lapply(dfs, function(d){
+    #Remove x and y
+    d$x <- NULL
+    d$y <- NULL
+    #Convert to single vector
+    unlist(as.vector(d))})
+  names(dfs2) <- names(dfs)
+
+val.dfs.v.df <- dfs2 %>% reduce(cbind)
+val.dfs.v.df <- as.data.frame(val.dfs.v.df)})
+
+names(val.dfs.v) <- runs
+```
+
+```{r}
+val.dfs.v <- lapply(runs, function(r){
+  df <- val.dfs.v[[r]]
+  names(df) <-paste0(r, ".", c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val"))
+  return(df)
+})
+
+#Convert to a single df
+val.dfs.v.allruns <- val.dfs.v %>% reduce(cbind)
+
+#Remove duplicate obs (pulled through across each run)
+val.dfs.v.allruns[c("Run06.obs.val.df", "Run07.obs.val.df", "Run08.obs.val.df")] <- NULL
+names(val.dfs.v.allruns)[1] <- "obs.val"
+```
+
+### **2a. Descriptive statistics**
+
+```{r descriptives validation}
+
+descriptives <- apply(val.dfs.v.allruns,2, function(x){
+  per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9))))
+  data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x))
+})
+
+descriptives <- descriptives %>% reduce(rbind)
+row.names(descriptives) <- names(val.dfs.v.allruns)
+t(descriptives)
+```
+
+
+
+#### **Distribution**
+
+```{r}
+
+names(val.dfs.v) <- runs
+val.dfs.v_fordist <- lapply(runs, function(r){
+  df <- val.dfs.v[[r]]
+  names(df) <- c("obs", "raw.cpm", "bc1.cpm", "bc2.cpm")
+  df$run <- paste0(r)
+  return(df)
+})
+
+#Convert to a single df
+val.dfs.v.allruns_fordist <- val.dfs.v_fordist %>% reduce(rbind)
+val.dfg <- reshape2::melt(val.dfs.v.allruns_fordist, id="run")
+```
+
+#### Fig. Density plot of validation period
+
+```{r}
+ggplot(subset(val.dfg, variable!="bc2.cpm"), aes(value, fill=variable, colour=variable)) +
+  geom_density(alpha = 0.3, position="identity") +
+  facet_wrap(.~ run) +
+  theme_minimal() +
+  scale_fill_brewer(palette = "Set1") +
+  scale_color_brewer(palette = "Set1")
+
+```
+
+### **2b. Model fit statistics**
+
+Using the following to assess overall fit:
+
+- **R-squared (rsq)**
+- **Root Mean Square Error (RMSE)**
+- **Nash-Sutcliffe Efficiency (NSE):** Magnitude of residual variance compared to measured data variance; ranges from -∞ to 1, where 1 = a perfect match to observations
+- **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias.
+
+```{r rsq}
+actual <- val.dfs.v.allruns$obs.val
+
+rsq <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){
+  cor(actual, x)^2
+})
+
+ t(data.frame(as.list(rsq), row.names = "RSQ"))
+```
+
+```{r rmse}
+
+rmse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){
+  sqrt(mean((actual - x)^2))
+})
+
+```
+
+```{r pbias}
+
+pbias <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){
+  hydroGOF::pbias(x, actual)
+})
+
+```
+
+```{r nse}
+nse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){
+  hydroGOF::NSE(x, actual)
+})
+
+```
+
+Highlighting the bias corrected statistics
+
+```{r pretty kable}
+
+k <- cbind(rsq, rmse, pbias, nse)
+k %>%
+  kable(booktabs = T) %>%
+  kable_styling() %>%
+  row_spec(grep(".bc.",row.names(k)), background = "lightgrey")
+
+```
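+
+If `hydroGOF` is not available, PBIAS and NSE are simple enough to compute directly, which also acts as a cross-check on the packaged versions. A minimal sketch (not evaluated here), reusing the `actual` vector and `val.dfs.v.allruns` columns defined above and assuming no missing values, as in the chunks above:
+
+```{r, eval=FALSE}
+#Hand-rolled versions of the same fit statistics as a cross-check on hydroGOF
+manual.fit <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){
+  c(rmse  = sqrt(mean((actual - x)^2)),                              #root mean square error
+    pbias = 100 * sum(x - actual) / sum(actual),                     #percent bias
+    nse   = 1 - sum((actual - x)^2) / sum((actual - mean(actual))^2))#Nash-Sutcliffe efficiency
+})
+
+t(manual.fit)
+```
+
+
+## **3. 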
Bias Correction Assessment: Metric specific - tasmax** + +### **3b Days above 30 degrees** + +(Not considered consecutively here) + +```{r} +val.dfs.v.allruns$year <- substr(row.names(val.dfs.v.allruns), 8,11) + +over30 <- lapply(names(val.dfs.v.allruns), function(i){ + x <- val.dfs.v.allruns[,i] + df <- aggregate(x, list(val.dfs.v.allruns$year), function(x){sum(x>=30)}) + names(df) <- c("year", paste0("Days.over.30.", i)) + return(df) +}) + +over30 %>% reduce(left_join, "year") +``` + + +### **Number of heatwaves per annum** + +(to be added) + +#### **For future work** + +The number of quantiles selected will effect the efficacy of the bias correction: lots of options therefore with this specific method + + From 70c4bb0aaaf11ac4be2d3f55dbfcbbbd43d1bb4b Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Wed, 4 Oct 2023 15:15:41 +0000 Subject: [PATCH 30/83] Resaving to avoid confusion --- R/LCAT/Assessing.BC.data.LCAT.RMD | 198 ++++++++++-------------------- 1 file changed, 63 insertions(+), 135 deletions(-) diff --git a/R/LCAT/Assessing.BC.data.LCAT.RMD b/R/LCAT/Assessing.BC.data.LCAT.RMD index 11752d27..18e48b29 100644 --- a/R/LCAT/Assessing.BC.data.LCAT.RMD +++ b/R/LCAT/Assessing.BC.data.LCAT.RMD @@ -1,5 +1,5 @@ --- -title: "Bias correction assessment" +title: "Bias correction assessment of LCAT data" author: "Ruth Bowyer" date: "`r format(Sys.Date())`" output: @@ -24,157 +24,53 @@ library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) library(kableExtra) +library(plotrix) #For taylor diagrams + +dd <- "/mnt/vmfileshare/ClimateData/" ``` ## **0. About** -This is an example notebook for the assessment of bias corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'. - -**Input data** - -This script requires the following data: - -- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction -- 'obs.val' - as above, for the *validation* period -- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period -- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period -- 'cpm.val.raw' - the raw (uncorrected) data for the *valibration* period -- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *valibration* period -- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional) -- 'cpm.proj.radj' - the adjusted (bias-corrected) data for the *future/projected* period (optional) - -The data is required in raster format and dataframe formats - -**Calibration vs Validation dates** - -The calibration period runs between 01-12-1980 to the day prior to 01-12-2010 -The validation period runs between 01-12-2010 to the day prior to 01-12-2020 - -```{r data loading, include=FALSE} +LCAT require 'bias corrected' data for the whole of the UK. +We have applied a widely used approach, quantile mapping, to the data. Specifically, we have used *non-parametric quantile mapping using empirical quantiles* as available in the `Qmap` package. +Because the data is so large, we have applied this bias correction to the UK broked down into regions, with Scotland brokedn down into further regions (see `R/LCAT/Region.Refs.csv`) -#This chunk attempts to apply the conversion to python output data to a form that this script will also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script? 
+We will now: -dd <- "/mnt/vmfileshare/ClimateData/" #Data directoy of all data used in this script +- Assess the bias correction using some of the segments +- Process the data back to geotiff +- Either process as monthly data or as UK wide rasters (maybe just write them seperately) and av across runs -input <- "RDS" #Either df or raster -- R outputs are a group of dfs in list form saved as an RDS, python input is a raster -city <- "Glasgow" -var <- "tasmax" -runs <- c("05", "06", "07", "08") +The data is within `ClimateData/Debiased/R/QuantileMapping` and is in RDS format, with each object containing a list. -if(input=="raster"){ +The objects within this R list are as follows: +- 't.obs': transposed observation df +- 't.cal': transposed calibration df +- 't.proj': transposed projection df (included the validation period) +- 'qm1.hist.a' - bias corrected values for the historical period, values fitted with linear interpolation +- 'qm1.hist.b' - bias corrected values for the historical period, values fitted with tricubic interpolation +- 'qm1.proj.a' - bias corrected values for the validation/projection period, values fitted with linear interpolation +- 'qm1.proj.b' - bias corrected values for the validation/projection period, values fitted with tricubic interpolation -####### PYTHON INPUTS HERE ###### - # This script uses both raster data and the raw data - # This script uses Lists to group everything by runs - # Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) - # .nc and .tif files can be read with rast("path/to/file.nc") - # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files - #dfs are assumed to be cells x time +## **1. Bias Correction Assessment: trends** -} +### **London - tasmax = Run 08** - } else if(input=="RDS"){ - ### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved to a different script. - ## Load a source raster to extract the crs - ## Also to add needs to be changing the names of the dfs to make then same acorss all methods - r <- list.files(paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/")) - r <- r[1] - rp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/", r) - rast <- rast(rp) - - crs <- crs(rast) - - ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). - rd <- "Debiased/R/QuantileMapping/three.cities/" - - files <- list.files(paste0(dd,rd,city),full.names=T) - files.v <- files[grepl(var, files)] - - allruns <- lapply(files.v, readRDS) - - names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) - names(allruns) <- names - - obs.val.df.L <- lapply(allruns, function(L){t(L[["t.obs"]])}) #This object stored in the results transposed for applying bias correction - maybe look at adding this to the bias correction script rather than here? 
- obs.cal.df.L <- lapply(allruns, function(L){L[["val.df"]]}) - - cpm.cal.raw.df.L <- lapply(allruns, function(L){t(L[["t.cal"]])}) - - #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) - cpm.val.raw.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["t.proj"]])) - val.end.date <- min(grep("20201201-", names(proj)))-1 - cpm.val.raw.df <- proj[,1:val.end.date] - }) - - cpm.proj.raw.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["t.proj"]])) - val.end.date <- min(grep("20201201-", names(proj))) - cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] - }) - - cpm.cal.adj.df.L <- lapply(allruns, function(L){ - adj <- as.data.frame(t(L[["qm1.hist"]])) - }) - - cpm.val.adj.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["qm1.val.proj"]])) - val.end.date <- min(grep("20201201-", names(proj)))-1 - proj[,1:val.end.date] - }) - - cpm.proj.adj.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["qm1.val.proj"]])) - val.end.date <- min(grep("20201201-", names(proj))) - proj[,val.end.date:ncol(proj)] - }) - - ## Convert to rasters --requires creation of x and y cols from row names -``` +Using the London region (UKI) as this is the smallest -- not this is the same regional area as the 'three.cities' crops but cut to shapefile edges rather than the square grid -#here - sorting out inputs and rasters in single chunk ```{r} - - df.rL <- lapply(runs, function(i){ - L <- allruns[[i]] - lapply(L, function(x){ - df <- t(x) - df <- as.data.frame(df) - rn <- row.names(df) #The rownames were saves as x_y coordinates - xi <- gsub("_.*", "", rn) - yi <- gsub(".*_", "", rn) - xy <- data.frame(x = xi, y = yi) - df <- cbind(xy, df) - L <- df.rL[[i]] - lapply(L, function(x){ - r <- rast(x, type="xyz") - crs(r) <- crs - return(r) - }) - }) - - names(df.rL) <- runs - -names(rasts) <- runs - - } else { - print("Invalid input") -} +runs <- c("Run05", "Run06", "Run07", "Run08") +London.allruns <- lapply(runs, function(i){ + rds <- paste0(dd,"/Debiased/R/QuantileMapping/resultsL",i,"_UKI_tasmax.RDS") + readRDS(rds)}) +names(London.allruns) <- runs ``` - -## **1. 
Bias Correction Assessment: trends** - - -### **London - tasmax = Run 08** - -Using the London region (UKI) as this is the smallest -- not this is the same regional area as the 'three.cities' crops but cut to shapefile edges rather than the square grid - Load in Hads validation data (So this can be run for all of the LCAT data, I'm going to read in the whole HADs files for the calibration years) @@ -201,6 +97,40 @@ The next set of chunks visualise the data This next chunk converts the dfs back to raster with the correct CRS ```{r convert to df and raster} +## Load a source raster to extract the crs +r <- list.files(paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/")) +r <- r[1] +rp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/", r) +rast <- rast(rp) + +crs <- crs(rast) + +## Convert from matrix to df, transpose, create x and y cols - when run in chunk this works fine but for some reason can throw an error when run otherwise +London.df.rL <- lapply(runs, function(i){ + L <- London.allruns[[i]] + lapply(L, function(x){ + df <- t(x) + df <- as.data.frame(df) + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df)}) + }) + +names(London.df.rL) <- runs + +## Convert to rasters +London.rasts <- lapply(runs, function(i){ + L <- London.df.rL[[i]] + lapply(L, function(x){ + r <- rast(x, type="xyz") + crs(r) <- crs + return(r)}) +}) + +names(London.rasts) <- runs + ``` @@ -792,6 +722,4 @@ over30 %>% reduce(left_join, "year") #### **For future work** -The number of quantiles selected will effect the efficacy of the bias correction: lots of options therefore with this specific method - - +The number of quantiles selected will effect the efficacy of the bias correction: lots of options therefore with this specific method \ No newline at end of file From 2b90ca008b4e29fd9f47999bce981505c8a0cb8c Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Sun, 8 Oct 2023 21:38:01 +0000 Subject: [PATCH 31/83] Sorting out raster plotting --- .../Assessing_bc_data/Assessing_BC_Data.RMD | 446 ++++++++++++------ 1 file changed, 307 insertions(+), 139 deletions(-) diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD index 11752d27..f45b14e8 100644 --- a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD @@ -56,7 +56,7 @@ The validation period runs between 01-12-2010 to the day prior to 01-12-2020 #This chunk attempts to apply the conversion to python output data to a form that this script will also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script? -dd <- "/mnt/vmfileshare/ClimateData/" #Data directoy of all data used in this script +dd <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script input <- "RDS" #Either df or raster -- R outputs are a group of dfs in list form saved as an RDS, python input is a raster city <- "Glasgow" @@ -73,17 +73,11 @@ if(input=="raster"){ # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files #dfs are assumed to be cells x time -} - - } else if(input=="RDS"){ +} else if (input=="RDS"){ ### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved to a different script. 
## Load a source raster to extract the crs - ## Also to add needs to be changing the names of the dfs to make then same acorss all methods - r <- list.files(paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/")) - r <- r[1] - rp <- paste0(dd, "Reprojected/UKCP2.2/tasmax/05/latest/", r) - rast <- rast(rp) - + r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1] + rast <- rast(r) crs <- crs(rast) ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). @@ -97,10 +91,13 @@ if(input=="raster"){ names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) names(allruns) <- names - obs.val.df.L <- lapply(allruns, function(L){t(L[["t.obs"]])}) #This object stored in the results transposed for applying bias correction - maybe look at adding this to the bias correction script rather than here? - obs.cal.df.L <- lapply(allruns, function(L){L[["val.df"]]}) + #This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. Therefore: + obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) + obs.val.df <- allruns[[1]]$val.df - cpm.cal.raw.df.L <- lapply(allruns, function(L){t(L[["t.cal"]])}) + cpm.cal.raw.df.L <- lapply(allruns, function(L){ + as.data.frame(t(L[["t.cal"]])) + }) #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) cpm.val.raw.df.L <- lapply(allruns, function(L){ @@ -109,7 +106,7 @@ if(input=="raster"){ cpm.val.raw.df <- proj[,1:val.end.date] }) - cpm.proj.raw.df.L <- lapply(allruns, function(L){ + cpm.proj.raw.df.L <- lapply(allruns, function(L){ proj <- as.data.frame(t(L[["t.proj"]])) val.end.date <- min(grep("20201201-", names(proj))) cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] @@ -132,32 +129,50 @@ if(input=="raster"){ }) ## Convert to rasters --requires creation of x and y cols from row names -``` +## For the comparison, just converting the observation and cpm for the cal and val perios (ie not the projection datasets) + +obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){ + rn <- row.names(i) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, i) + r <- rast(df, type="xyz") + crs(r) <- crs + return(r) +}) -#here - sorting out inputs and rasters in single chunk -```{r} +names(obsrastL) <- c("obs.cal.rasts", "obs.val.rasts") +list2env(obsrastL, .GlobalEnv) +remove(obsrastL) + +list2rast <- list(cpm.cal.raw.df.L, cpm.cal.adj.df.L, cpm.val.raw.df.L, cpm.val.adj.df.L) +rastsL <- lapply(list2rast, function(x){ + allruns <- x df.rL <- lapply(runs, function(i){ - L <- allruns[[i]] - lapply(L, function(x){ - df <- t(x) - df <- as.data.frame(df) + df <- allruns[[grep(i, names(allruns))]] #extract df based on run id rn <- row.names(df) #The rownames were saves as x_y coordinates xi <- gsub("_.*", "", rn) yi <- gsub(".*_", "", rn) xy <- data.frame(x = xi, y = yi) df <- cbind(xy, df) - L <- df.rL[[i]] - lapply(L, function(x){ - r <- rast(x, type="xyz") + r <- rast(df, type="xyz") crs(r) <- crs return(r) }) - }) - names(df.rL) <- runs + return(df.rL) + }) + +names(rastsL) <- c("cpm.cal.raw.rasts.L", "cpm.cal.adj.rasts.L", "cpm.val.raw.rasts.L", "cpm.val.adj.rasts.L") -names(rasts) <- runs +list2env(rastsL, .GlobalEnv) + +remove(rastsL) +remove(list2rast) + +gc() } else { 
print("Invalid input") @@ -170,132 +185,316 @@ names(rasts) <- runs ## **1. Bias Correction Assessment: trends** +An visual comparison of trends across observation, raw and adjusted data for the same time period -### **London - tasmax = Run 08** +### **1a. Raster comparison** -Using the London region (UKI) as this is the smallest -- not this is the same regional area as the 'three.cities' crops but cut to shapefile edges rather than the square grid +Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days -Load in Hads validation data -(So this can be run for all of the LCAT data, I'm going to read in the whole HADs files for the calibration years) +Adding in the city shapeoutline for prettier maps -**The calibration period is 2009-12-01 to 2019-11-30 to relate to the CPM month grouping** +```{r} -Hads data were also cropped to the regional files for the calibration years - some of the dates might need to be added from the observation (or just be ignored for ease) +shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city, ".shp"))) -```{r} -fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") -f <- list.files(fp) -v <- "tasmax" -reg <- "UKI" -f <- f[grepl("2010_2020", f)&grepl(v,f)&grepl(reg, f)] +``` + + + +#### **Day 1 - 1980-12-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") -obs.val.df <- read.csv(paste0(fp,f)) # Starts here from 2010 - 01 -01 -- because for the BC I removed these vals to align with the cpm years we're missing the month of Dec - so need to update the cpm data to reflect this in the assessment -- wont be a problem for other BC data +tm_shape(cpm.cal.raw.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts.L[[1]]) + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") ``` -### **1b. 
Check trends** +##### Run07 -The next set of chunks visualise the data +```{r, fig.show="hold", out.width="33%"} -This next chunk converts the dfs back to raster with the correct CRS -```{r convert to df and raster} +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +tm_shape(cpm.cal.raw.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +tm_shape(cpm.cal.adj.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") ``` -#### **Raster vis comparison** -Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days -(Note: I'm just plotting the bias corrected with linear interpolation so as to overwhelm with plots) +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[1]]) + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### **Day 2 - 2008-08-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` -##### Fig. 
*Day 1 - 1980-12-01* + +##### Run07 ```{r, fig.show="hold", out.width="33%"} -tm_shape(London.rasts$Run05$t.obs[[1]]) + tm_raster(title="Observation, 1980-12-01") #Obviously just one call of the observation -tm_shape(London.rasts$Run05$t.cal[[1]]) + tm_raster(title="Calibration, Run 05, Raw 1980-12-01") -tm_shape(London.rasts$Run06$t.cal[[1]]) + tm_raster(title="Calibration, Run 06, Raw 1980-12-01") -tm_shape(London.rasts$Run07$t.cal[[1]]) + tm_raster(title="Calibration, Run 07, Raw 1980-12-01") -tm_shape(London.rasts$Run08$t.cal[[1]]) + tm_raster(title="Calibration, Run 08, Raw 1980-12-01") -tm_shape(London.rasts$Run05$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 05, BC 1980-12-01") -tm_shape(London.rasts$Run06$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 06, BC 1980-12-01") -tm_shape(London.rasts$Run07$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 07, BC 1980-12-01") -tm_shape(London.rasts$Run08$qm1.hist.a[[1]]) + tm_raster(title="Calibration, Run 08, BC 1980-12-01") +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +tm_shape(cpm.cal.adj.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") ``` -##### Fig. *Day 2 - 1991-06-01* -Just to note I was so suprised by how much lower the observation data was for this raster I loaded the raw HADs to check (in resampled_2.2km/tasmax and the original 1km grid it does reflect it - it just seems very low) +##### Run08 ```{r, fig.show="hold", out.width="33%"} -tm_shape(London.rasts$Run05$t.obs[[3781]]) + tm_raster(title="Observation, 1991-06-01") #Obviously just one call of the observation -tm_shape(London.rasts$Run05$t.cal[[3781]]) + tm_raster(title="Calibration, Run 05, Raw 1991-06-01") -tm_shape(London.rasts$Run06$t.cal[[3781]]) + tm_raster(title="Calibration, Run 06, Raw 1991-06-01") -tm_shape(London.rasts$Run07$t.cal[[3781]]) + tm_raster(title="Calibration, Run 07, Raw 1991-06-01") -tm_shape(London.rasts$Run08$t.cal[[3781]]) + tm_raster(title="Calibration, Run 08, Raw 1991-06-01") -tm_shape(London.rasts$Run05$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 05, BC 1991-06-01") -tm_shape(London.rasts$Run06$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 06, BC 1991-06-01") -tm_shape(London.rasts$Run07$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 07, BC 1991-06-01") -tm_shape(London.rasts$Run08$qm1.hist.a[[3781]]) + tm_raster(title="Calibration, Run 08, BC 1991-06-01") + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") ``` -##### Fig. 
*Day 3 - 2000-08-01* +#### **Day 3 - 2015-05-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.val.raw.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + +##### Run08 ```{r, fig.show="hold", out.width="33%"} -tm_shape(London.rasts$Run05$t.obs[[7081]]) + tm_raster(title="Observation, 2000-08-01") #Obviously just one call of the observation -tm_shape(London.rasts$Run05$t.cal[[7081]]) + tm_raster(title="Calibration, Run 05, Raw 2000-08-01") -tm_shape(London.rasts$Run06$t.cal[[7081]]) + tm_raster(title="Calibration, Run 06, Raw 2000-08-01") -tm_shape(London.rasts$Run07$t.cal[[7081]]) + tm_raster(title="Calibration, Run 07, Raw 2000-08-01") -tm_shape(London.rasts$Run08$t.cal[[7081]]) + tm_raster(title="Calibration, Run 08, Raw 2000-08-01") -tm_shape(London.rasts$Run05$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 05, BC 2000-08-01") -tm_shape(London.rasts$Run06$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 06, BC 2000-08-01") -tm_shape(London.rasts$Run07$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 07, BC 2000-08-01") -tm_shape(London.rasts$Run08$qm1.hist.a[[7081]]) + tm_raster(title="Calibration, Run 08, BC 2000-08-01") + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") ``` +#### {-} + +### **1b. Trend comparison** + #### **Calibration period - annual trends** +# HERE --check works with your adjustment above! 
```{r} + +dfL <- c(list(obs.cal.df), cpm.cal.raw.df.L, cpm.cal.adj.df.L) +names(dfL) <- c("obs.df", paste0("cpm.cal.raw.", names(cpm.cal.raw.df.L)), + paste0("cpm.cal.adj.", names(cpm.cal.raw.df.L))) + #Returns a list of dfs in handy format for graphing -London.dfg.rL <- lapply(runs, function(i){ - L <- London.df.rL[[i]] - names(L)[1:3] <- c("obs", "cal", "proj") - dfg <- lapply(names(L), function(ii){ - dfi <- L[[ii]] - x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y +dfg.rL <- lapply(dfL, function(i){ + x <- 3:ncol(i) #ignore cols 1 & 2 with x y #Calc mean and sd dfx <- lapply(x, function(x){ - y <- dfi[,x] + y <- i[,x] mean <- mean(y, na.rm=T) sd <- sd(y, na.rm=T) dfr <- data.frame(mean=mean, sd.high=mean+sd, sd.low=mean-sd) - names(dfr) <- paste0(ii, ".", names(dfr)) - dfr$day <- names(dfi)[x] + dfr$day <- names(i)[x] return(dfr) }) - dfx_g <- dfx %>% purrr::reduce(rbind) }) - names(dfg) <- c("obs.daymeans", "raw.cal.daymeans", - "raw.proj.daymeans", "bc.a.cal.daymeans", - "bc.b.cal.daymeans", "bc.a.proj.daymeans", - "bc.b.proj.daymeans") - - return(dfg) -}) -names(London.dfg.rL) <- runs +names(dfg.rL) <- names(dfL) ``` @@ -303,8 +502,8 @@ names(London.dfg.rL) <- runs #Create a df for all of the runs to plot ##Add a day index to align the cal and obs -London.dfg.calp.L <- lapply(runs, function(i){ - dfg <- London.dfg.rL[[i]] +dfg.calp.L <- lapply(runs, function(i){ + dfg <- dfg.rL[[i]] dfg.calp <- dfg[c("obs.daymeans", "raw.cal.daymeans", "bc.b.cal.daymeans", "bc.a.cal.daymeans")] @@ -319,24 +518,24 @@ London.dfg.calp.L <- lapply(runs, function(i){ dfg.calp$Run <- i return(dfg.calp)}) -names(London.dfg.calp.L) <- runs +names(dfg.calp.L) <- runs -London.dfg.calp <- London.dfg.calp.L %>% reduce(rbind) +dfg.calp <- dfg.calp.L %>% reduce(rbind) ``` ```{r} -London.dfg.calp_m <- reshape2::melt(London.dfg.calp, id=c("dayi", "Run")) #create long df for plotting multiple lines +dfg.calp_m <- reshape2::melt(dfg.calp, id=c("dayi", "Run")) #create long df for plotting multiple lines -London.dfg.calp_mm <- London.dfg.calp_m[grepl("mean", London.dfg.calp_m$variable),] #For easy vis, only keep mean vals +dfg.calp_mm <- dfg.calp_m[grepl("mean", dfg.calp_m$variable),] #For easy vis, only keep mean vals ``` #### Fig. Calibration period - annual mean ```{r Historic trend 1} -ggplot(London.dfg.calp_mm, aes(dayi, value, group=variable, colour=variable)) + +ggplot(dfg.calp_mm, aes(dayi, value, group=variable, colour=variable)) + geom_line(alpha=0.7) + facet_wrap(.~Run) + theme_bw() + ylab("Av daily max temp oC") + @@ -353,7 +552,7 @@ Annotate season based on month index - the dates have different formats dependin ```{r} seasonal.means <- lapply(runs, function(r){ - dfg <- London.dfg.rL[[r]] + dfg <- dfg.rL[[r]] #Hads/obs df obs.daymeans.df <- dfg$obs.daymeans @@ -478,8 +677,8 @@ For tasmax - grouping to season and calculating the seasonal maxima vals (i.e. 
r ```{r} #Convert to max, out put a df in easy fig format -London.dfg.max <- lapply(runs, function(r){ - L <- London.df.rL[[r]] +dfg.max <- lapply(runs, function(r){ + L <- df.rL[[r]] names(L)[1:3] <- c("obs", "cal", "proj") dfg <- lapply(names(L), function(ii){ dfi <- L[[ii]] @@ -497,10 +696,10 @@ London.dfg.max <- lapply(runs, function(r){ return(dfg) }) -names(London.dfg.max) <- runs +names(dfg.max) <- runs seasonal.max.cal <- lapply(runs, function(r){ - dfg <- London.dfg.max[[r]] + dfg <- dfg.max[[r]] #Hads/obs df df1 <- dfg$obs.max x <- df1$day @@ -588,37 +787,6 @@ ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + ``` -#### Create validaton df list - -Adding in the observational HADs data and aligning based on dates - -*Note* So as not to re-run the UK wide LCAT data processing, a workaround was added to the bias correction function used to group the obs data - this means that to align the validation cpm data we have to remove a month in the beginning - -```{r} - -#Extract validation period of raw and bias corrected CPM data -val.dfs <- lapply(runs, function(r){ - London.df <- London.df.rL[[r]] - cpm.val.dfs <- lapply(London.df[c("t.proj", "qm1.proj.a", "qm1.proj.b")], function(x){ - i <- grep("20191201-20201130_30", names(x))[1] - df <- x[,1:i] - }) - - #Using the old cpm data for the hads obs - so need to remove the dates to ensure theres 30 days per year - remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") - remove <- paste0(remove, collapse = "|") - - obs.val.df <- obs.val.df[,!grepl(remove, names(obs.val.df))] - row.names(obs.val.df) <- paste0(obs.val.df$x, "_", obs.val.df$y) - - val.dfs <- c(list(obs.val.df), cpm.val.dfs) - names(val.dfs) <- c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val") - return(val.dfs) - }) - -names(val.dfs) <- runs -``` - #### *Validation period - annual trends - seasonal mean* From 972502d80cdd87505cb59c8b97e2e7b599e7eb73 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Mon, 9 Oct 2023 08:23:28 +0000 Subject: [PATCH 32/83] Updating metrics so as to work with new data format --- .../Assessing_bc_data/Assessing_BC_Data.RMD | 217 +++++------------- 1 file changed, 63 insertions(+), 154 deletions(-) diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD index f45b14e8..f1d25bec 100644 --- a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD @@ -93,7 +93,8 @@ if(input=="raster"){ #This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. Therefore: obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) - obs.val.df <- allruns[[1]]$val.df + + obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020 cpm.cal.raw.df.L <- lapply(allruns, function(L){ as.data.frame(t(L[["t.cal"]])) @@ -229,7 +230,7 @@ tm_shape(cpm.cal.adj.rasts.L$`05`[[1]]) + ##### Run06 ```{r, fig.show="hold", out.width="33%"} -tm_shape(obs.cal.rasts.L[[1]]) + tm_raster(title="Observation") + +tm_shape(obs.cal.rasts[[1]]) + tm_raster(title="Observation") + tm_layout(legend.outside = T) + tm_shape(shape) + tm_borders(col="black") @@ -466,19 +467,15 @@ tm_shape(cpm.val.adj.rasts.L$`08`[[1590]]) + ### **1b. Trend comparison** -#### **Calibration period - annual trends** - -# HERE --check works with your adjustment above! 
- ```{r} - +#Lists of dfs to summarise the means of dfL <- c(list(obs.cal.df), cpm.cal.raw.df.L, cpm.cal.adj.df.L) -names(dfL) <- c("obs.df", paste0("cpm.cal.raw.", names(cpm.cal.raw.df.L)), +names(dfL) <- c("obs.cal", paste0("cpm.cal.raw.", names(cpm.cal.raw.df.L)), paste0("cpm.cal.adj.", names(cpm.cal.raw.df.L))) #Returns a list of dfs in handy format for graphing -dfg.rL <- lapply(dfL, function(i){ - x <- 3:ncol(i) #ignore cols 1 & 2 with x y +dfg.daily.means <- lapply(dfL, function(i){ + x <- 1:ncol(i) #ignore cols 1 & 2 with x y #Calc mean and sd dfx <- lapply(x, function(x){ y <- i[,x] @@ -494,94 +491,52 @@ dfg.rL <- lapply(dfL, function(i){ }) -names(dfg.rL) <- names(dfL) +names(dfg.daily.means) <- names(dfL) ``` +*Note* : Can add a plot here for daily averages but it's quite visually confusing so omitting for now -```{r} -#Create a df for all of the runs to plot -##Add a day index to align the cal and obs - -dfg.calp.L <- lapply(runs, function(i){ - dfg <- dfg.rL[[i]] - dfg.calp <- dfg[c("obs.daymeans", "raw.cal.daymeans", - "bc.b.cal.daymeans", "bc.a.cal.daymeans")] - - dfg.calp <- lapply(dfg.calp, function(x){ - x$dayi <- 1:nrow(x) - x$day<- NULL - return(x) - }) - - - dfg.calp <- dfg.calp %>% reduce(merge, "dayi") - dfg.calp$Run <- i - return(dfg.calp)}) - -names(dfg.calp.L) <- runs - -dfg.calp <- dfg.calp.L %>% reduce(rbind) +#### **Seasonal trends - Calibration period ** -``` ```{r} -dfg.calp_m <- reshape2::melt(dfg.calp, id=c("dayi", "Run")) #create long df for plotting multiple lines - -dfg.calp_mm <- dfg.calp_m[grepl("mean", dfg.calp_m$variable),] #For easy vis, only keep mean vals -``` - -#### Fig. Calibration period - annual mean +#Annotate season based on month index - the dates have different formats depending on the input data (ie hads vs cpm) so am pulling out the necessary to adjust sep -```{r Historic trend 1} +obs.cal.season.mean <- dfg.daily.means$obs.cal -ggplot(dfg.calp_mm, aes(dayi, value, group=variable, colour=variable)) + - geom_line(alpha=0.7) + - facet_wrap(.~Run) + - theme_bw() + ylab("Av daily max temp oC") + - ggtitle("Tasmax Hisotric trends") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Day, 1980.12.01 - 2009.12.01") + - scale_color_brewer(palette="Set1", name="Model", labels=c("Obs (Hads)", "Raw CPM", "BC CPM 1", "BC CPM 2")) +x <- dfg.daily.means$obs.cal$day -``` - -#### **Seasonal trends - Calibration period ** - -Annotate season based on month index - the dates have different formats depending on the input data (ie hads vs cpm) so am pulling out the necessary to adjust sep - -```{r} - -seasonal.means <- lapply(runs, function(r){ - dfg <- dfg.rL[[r]] - #Hads/obs df - obs.daymeans.df <- dfg$obs.daymeans - - x <- obs.daymeans.df$day - obs.daymeans.df$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), +obs.cal.season.mean$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), "Winter", ifelse(grepl("0331_|0430_|0531_", x), "Spring", ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) -#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run +#Note = the n days per season is not quite evenly split between the 4 seasons because of how the hads resamples across the year for 360 days #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 - year <- gsub("^[^_]*_", "", x) + rem <- 
nchar(var) + 39 + year <- substr(x, rem, rem+3) year <- as.numeric(substr(year, 1,4)) - obs.daymeans.df$season_year <- ifelse(grepl("0131_|0228_|0229_", x), - paste0(year-1, obs.daymeans.df$season), - paste0(year, obs.daymeans.df$season)) + obs.cal.season.mean$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, obs.cal.season.mean$season), + paste0(year, obs.cal.season.mean$season)) # Mutate to a seasonal mean df - obs.seasonal.mean.df <- aggregate(obs.daymeans.df[[1]], list(obs.daymeans.df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x))) - obs.seasonal.mean.df<- data.frame(season_year=obs.seasonal.mean.df$Group.1, - seasonal.mean=obs.seasonal.mean.df$x[,"seasonal.mean"], - sd.high.seasonal = obs.seasonal.mean.df$x[,"sd.high.seasonal"], - sd.low.seasonal = obs.seasonal.mean.df$x[,"sd.low.seasonal"]) + obs.cal.season.mean <- aggregate(obs.cal.season.mean[[1]], list(obs.cal.season.mean[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x))) + obs.cal.season.mean<- data.frame(season_year=obs.cal.season.mean$Group.1, + seasonal.mean=obs.cal.season.mean$x[,"seasonal.mean"], + sd.high.seasonal = obs.cal.season.mean$x[,"sd.high.seasonal"], + sd.low.seasonal = obs.cal.season.mean$x[,"sd.low.seasonal"]) #Grouping variable for later vars - obs.seasonal.mean.df$model <- "obs" - - + obs.cal.season.mean$model <- "obs" + +``` + +## Ruth to finish cleaning up this bit (it won't run at the moment) + +```{r eval=FALSE, include=FALSE} dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans", "bc.a.cal.daymeans"), function(i){ df <- dfg[[i]] @@ -625,7 +580,7 @@ seasonal.means.df <- seasonal.means %>% reduce(rbind) #### Fig. Calibration period - seasonal mean -```{r} +```{r eval=FALSE, include=FALSE} ggplot(seasonal.means.df, aes(season_year, seasonal.mean, group=model, colour=model)) + geom_line() + @@ -640,7 +595,7 @@ ggplot(seasonal.means.df, aes(season_year, seasonal.mean, group=model, colour=mo ##### *Summer only* -```{r Raw seasonal winter} +```{r Raw seasonal winter, eval=FALSE, include=FALSE} dfg_sm<- subset(seasonal.means.df, grepl("Summer", season_year)) @@ -653,28 +608,13 @@ ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + scale_color_brewer(palette="Set1", name="Model") ``` -It looks purple because the two bc methods aren't revealing much difference so subsetting to just one instead -```{r} - -dfg_sm<- subset(seasonal.means.df, !grepl("bc.b.cal", model)&grepl("Summer", season_year)) - -ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + - geom_line(alpha=0.7) + - facet_wrap(.~Run) + - theme_bw() + ylab("Av daily max temp oC -Summer average") + - ggtitle("Tasmax Hisotric trends") + - scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + - scale_color_brewer(palette="Set1", name="Model") - -``` - #### *Annual trends - seasonal max* For tasmax - grouping to season and calculating the seasonal maxima vals (i.e. rather than means above) -```{r} +```{r eval=FALSE, include=FALSE} #Convert to max, out put a df in easy fig format dfg.max <- lapply(runs, function(r){ @@ -759,7 +699,7 @@ names(seasonal.maxima.df) <- c("season_year", "max", "model", "Run") #### Fig. 
Calibration period - seasonal max -```{r} +```{r eval=FALSE, include=FALSE} ggplot(seasonal.maxima.df, aes(season_year, max, group=model, colour=model)) + geom_line() + @@ -773,7 +713,7 @@ ggplot(seasonal.maxima.df, aes(season_year, max, group=model, colour=model)) + #### Fig. Calibration period - *Summer only* -```{r} +```{r eval=FALSE, include=FALSE} dfg_sm<- subset(seasonal.maxima.df, !grepl("qm1.hist.b", model)&grepl("Summer", season_year)) @@ -802,82 +742,50 @@ Using the validation data set for this ```{r} + +val.dfs <- c(list(obs.val.df), cpm.val.raw.df.L, cpm.val.adj.df.L) + #Convert dfs to a vector -val.dfs.v <- lapply(runs, function(r){ - dfs <- val.dfs[[r]] - dfs2 <- lapply(dfs, function(d){ - #Remove x and y - d$x <- NULL - d$y <- NULL +val.dfs.v <- lapply(val.dfs, function(d){ #Convert to single vector unlist(as.vector(d))}) - names(dfs2) <- names(dfs) -val.dfs.v.df <- dfs2 %>% reduce(cbind) -val.dfs.v.df <- as.data.frame(val.dfs.v.df)}) +val.dfs.v.df <- as.data.frame(val.dfs.v) +names(val.dfs.v.df) <- c("obs.val", paste0("Run", rep(runs, 2), "_", var, ".",rep(c("raw", "adj", 4)))) # Names for easy reference -names(val.dfs.v) <- runs ``` -```{r} -val.dfs.v <- lapply(runs, function(r){ - df <- val.dfs.v[[r]] - names(df) <-paste0(r, ".", c("obs.val.df", "raw.cpm.val", "bc1.cpm.val", "bc2.cpm.val")) - return(df) -}) - -#Convert to a single df -val.dfs.v.allruns <- val.dfs.v %>% reduce(cbind) - -#Remove duplicate obs (pulled through across each run) -val.dfs.v.allruns[c("Run06.obs.val.df", "Run07.obs.val.df", "Run08.obs.val.df")] <- NULL -names(val.dfs.v.allruns)[1] <- "obs.val" -``` ### **2a. Descriptive statistics** ```{r descriptives validation} -descriptives <- apply(val.dfs.v.allruns,2, function(x){ +descriptives <- apply(val.dfs.v.df, 2, function(x){ per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) }) descriptives <- descriptives %>% reduce(rbind) -row.names(descriptives) <- names(val.dfs.v.allruns) +row.names(descriptives) <- names(val.dfs.v.df) t(descriptives) ``` +#### Fig.Density plot of validation period -#### **Distribution** +**Note** - need to add back in some facetting to this fig ```{r} +m <- reshape2::melt(val.dfs.v.df) -names(val.dfs.v) <- runs -val.dfs.v_fordist <- lapply(runs, function(r){ - df <- val.dfs.v[[r]] - names(df) <- c("obs", "raw.cpm", "bc1.cpm", "bc2.cpm") - df$run <- paste0(r) - return(df) -}) - -#Convert to a single df -val.dfs.v.allruns_fordist <- val.dfs.v_fordist %>% reduce(rbind) -val.dfg <- reshape2::melt(val.dfs.v.allruns_fordist, id="run") -``` - -#### Fig.Density plot of validation period - -```{r} -ggplot(subset(val.dfg, variable!="bc2.cpm"), aes(value, fill=variable, colour=variable)) + +ggplot(m, aes(value, fill=variable, colour=variable)) + geom_density(alpha = 0.3, position="identity") + - facet_wrap(.~ run) + theme_minimal() + scale_fill_brewer(palette = "Set1") + scale_color_brewer(palette = "Set1") ``` + ### **2b. Model fit statistics** Using the following to assess overall fit: @@ -888,18 +796,17 @@ Using the following to assess overall fit: - **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias. 
```{r rsq} -actual <- val.dfs.v.allruns$obs.val +actual <- val.dfs.v.df$obs.val -rsq <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ +rsq <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ cor(actual, x)^2 }) - t(data.frame(as.list(rsq), row.names = "RSQ")) ``` ```{r rmse} -rmse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ +rmse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ sqrt(mean((actual - x)^2)) }) @@ -907,14 +814,14 @@ rmse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ ```{r pbias} -pbias <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ +pbias <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ hydroGOF::pbias(x, actual) }) ``` ```{r nse} -nse <- sapply(val.dfs.v.allruns[c(2:ncol(val.dfs.v.allruns))], function(x){ +nse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ hydroGOF::NSE(x, actual) }) @@ -933,19 +840,21 @@ k %>% ``` - ## **3. Bias Correction Assessment: Metric specific - tasmax** ### **3b Days above 30 degrees** (Not considered consecutively here) -```{r} -val.dfs.v.allruns$year <- substr(row.names(val.dfs.v.allruns), 8,11) +```{r eval=FALSE, include=FALSE} + +### Ruth to update + +val.dfs.v.df$year <- substr(row.names(val.dfs.v.df), 8,11) -over30 <- lapply(names(val.dfs.v.allruns), function(i){ - x <- val.dfs.v.allruns[,i] - df <- aggregate(x, list(val.dfs.v.allruns$year), function(x){sum(x>=30)}) +over30 <- lapply(names(val.dfs.v.df), function(i){ + x <- val.dfs.v.df[,i] + df <- aggregate(x, list(val.dfs.v.df$year), function(x){sum(x>=30)}) names(df) <- c("year", paste0("Days.over.30.", i)) return(df) }) From ad8851fade1398e079e078936a036916e1df1269 Mon Sep 17 00:00:00 2001 From: RuthBowyer Date: Mon, 9 Oct 2023 08:37:24 +0000 Subject: [PATCH 33/83] tidy markdown add kable --- .../Assessing_bc_data/Assessing_BC_Data.RMD | 10 +- .../Assessing_bc_data/Assessing_BC_Data.html | 4061 +++++++++++++++++ 2 files changed, 4069 insertions(+), 2 deletions(-) create mode 100644 notebooks/Assessing_bc_data/Assessing_BC_Data.html diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD index f1d25bec..ab4502de 100644 --- a/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data.RMD @@ -767,7 +767,13 @@ descriptives <- apply(val.dfs.v.df, 2, function(x){ descriptives <- descriptives %>% reduce(rbind) row.names(descriptives) <- names(val.dfs.v.df) -t(descriptives) +d <- t(descriptives) + +d %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(d)), background = "lightgrey") + ``` @@ -775,7 +781,7 @@ t(descriptives) **Note** - need to add back in some facetting to this fig -```{r} +```{r warning=F, message=F} m <- reshape2::melt(val.dfs.v.df) ggplot(m, aes(value, fill=variable, colour=variable)) + diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data.html b/notebooks/Assessing_bc_data/Assessing_BC_Data.html new file mode 100644 index 00000000..7ecfb9ca --- /dev/null +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data.html @@ -0,0 +1,4061 @@ + + + + + + + + + + + + + + + +Bias correction assessment + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
[notebooks/Assessing_bc_data/Assessing_BC_Data.html — 4,061 lines of knitted HTML: the rendered "Bias correction assessment" report for Glasgow tasmax produced from Assessing_BC_Data.RMD. The generated markup does not survive extraction and is omitted here; the recoverable rendered results are reproduced below. Column labels are kept as rendered — the naming expression in the notebook recycles c("raw", "adj", 4), so the raw/adj suffixes do not map cleanly onto the four runs.]

Rendered descriptive statistics for the validation period (mean, sd, min, 10th/90th percentiles and max of daily tasmax, °C):

|         | obs.val   | Run05_tasmax.raw | Run06_tasmax.adj | Run07_tasmax.4 | Run08_tasmax.raw | Run05_tasmax.adj | Run06_tasmax.4 | Run07_tasmax.raw | Run08_tasmax.adj |
|---------|-----------|------------------|------------------|----------------|------------------|------------------|----------------|------------------|------------------|
| mean    | 12.990120 | 12.460644        | 12.047759        | 12.233846      | 12.764617        | 13.551728        | 13.128147      | 13.4808839       | 13.199155        |
| sd      | 5.375065  | 5.987282         | 6.000949         | 5.587015       | 5.852800         | 5.557214         | 5.692209       | 5.3415785        | 5.598392         |
| min     | -5.222516 | -3.289649        | -2.942725        | -1.848730      | -2.183447        | -2.261268        | -1.499793      | -0.1865713       | -4.036536        |
| per10th | 6.005837  | 4.394897         | 4.245508         | 5.023584       | 5.110010         | 6.493001         | 5.988253       | 6.6283767        | 5.990010         |
| per90th | 19.912904 | 20.198414        | 19.897852        | 19.655664      | 20.234034        | 20.832330        | 20.699387      | 20.5333003       | 20.171180        |
| max     | 31.605967 | 32.398094        | 31.079248        | 32.790184      | 35.011864        | 34.132040        | 31.410919      | 33.5803528       | 34.205112        |

The report also contains a density plot of the validation-period distributions (figure not recoverable) and the model-fit statistics — R-squared (rsq), root mean square error (RMSE), percent bias (PBIAS) and Nash-Sutcliffe Efficiency (NSE) — of each CPM series against the HADs observations:

|                  | rsq       | rmse     | pbias | nse       |
|------------------|-----------|----------|-------|-----------|
| Run05_tasmax.raw | 0.5555510 | 4.128585 | -4.1  | 0.4100210 |
| Run06_tasmax.adj | 0.5535077 | 4.218502 | -7.3  | 0.3840428 |
| Run07_tasmax.4   | 0.5051351 | 4.241554 | -5.8  | 0.3772924 |
| Run08_tasmax.raw | 0.5331874 | 4.153865 | -1.7  | 0.4027737 |
| Run05_tasmax.adj | 0.5464456 | 3.990959 | 4.3   | 0.4486993 |
| Run06_tasmax.4   | 0.5580189 | 3.949778 | 1.1   | 0.4600178 |
| Run07_tasmax.raw | 0.5082040 | 4.090093 | 3.8   | 0.4209707 |
| Run08_tasmax.adj | 0.5298693 | 4.058097 | 1.6   | 0.4299947 |

The report closes with placeholder sections for the metric-specific assessment (days above 30 degrees, not considered consecutively; number of heatwaves per annum, to be added) and a note for future work that the number of quantiles selected will affect the efficacy of this bias-correction method.
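The PBIAS and NSE values in the model-fit table come from hydroGOF::pbias() and hydroGOF::NSE() in the notebook. As a rough cross-check of what those calls compute, the same four statistics can be derived directly from the observed and simulated vectors. The sketch below is not part of the notebook: `fit_stats` and `sim.df` are hypothetical names, `actual` and `val.dfs.v.df` refer to the notebook's objects, and hydroGOF reports pbias rounded to one decimal, so presentation may differ slightly.

``` r
# Minimal sketch (assumption: not from the notebook itself): recompute the fit
# statistics without hydroGOF. `actual` is the observed vector; `sim.df` holds
# one simulated (raw or bias-corrected) series per column, e.g. val.dfs.v.df[-1].
fit_stats <- function(actual, sim.df) {
  t(sapply(sim.df, function(x) {
    resid <- actual - x
    c(rsq   = cor(actual, x)^2,                                   # squared Pearson correlation
      rmse  = sqrt(mean(resid^2)),                                # root mean square error
      pbias = 100 * sum(x - actual) / sum(actual),                # percent bias; > 0 = overestimation
      nse   = 1 - sum(resid^2) / sum((actual - mean(actual))^2))  # Nash-Sutcliffe Efficiency
  }))
}

# Example use, mirroring the notebook's val.dfs.v.df layout:
# fit_stats(val.dfs.v.df$obs.val, val.dfs.v.df[-1])
```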
+ + + + + + + + + + + + + + + + + From 6049fcaacf51ce4846bd2a057082263eb6a560ed Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Wed, 22 Nov 2023 16:33:45 +0000 Subject: [PATCH 34/83] fix(ci): add `pre-commit` to check `R/` path and apply --- .github/workflows/ci.yaml | 4 +- .pre-commit-config.yaml | 2 +- R/LCAT/Data_Processing_todf.R | 51 +- R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R | 464 +++++++++--------- R/README.md | 8 +- R/Resampling/Resampling.HADs.inR.R | 22 +- R/bias-correction-methods/ThreeCitiesQmap.RMD | 20 +- R/bias-correction-methods/WIP_EQM.RMD | 35 +- .../apply_qmapQuant_to_crpd_df_fn.R | 43 +- .../WIP Comparing HADs grids.Rmd | 35 +- .../WIP-Comparing-HADs-grids.Rmd | 35 +- .../WIP-Comparing-HADs-grids.md | 14 +- .../converting_city_crops_to_df.R | 118 ++--- README.md | 2 +- 14 files changed, 420 insertions(+), 433 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8ca3d40a..38e4d4e9 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,11 +14,11 @@ env: on: pull_request: - branches: ['main', 'docker-config'] + branches: ['main', 'ruth-notebook-for-workshop'] paths-ignore: ['docs/**'] push: - branches: ['main', 'docker-config'] + branches: ['main', 'ruth-notebook-for-workshop'] paths-ignore: ['docs/**'] concurrency: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd658c3a..4ac55731 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: "R|data" +exclude: "data" repos: - repo: https://github.com/psf/black rev: "23.9.1" diff --git a/R/LCAT/Data_Processing_todf.R b/R/LCAT/Data_Processing_todf.R index 89d15d15..fdea70a4 100644 --- a/R/LCAT/Data_Processing_todf.R +++ b/R/LCAT/Data_Processing_todf.R @@ -1,12 +1,12 @@ -## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## +## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## Data Processing UK HadsGrid CPM data from raster to data.frame ## 0. 
About -# Many of the methods packages for applying bias correction to climate date take as input vector, matrix or data.frame, +# Many of the methods packages for applying bias correction to climate date take as input vector, matrix or data.frame, # rather than a spatial file or raster -# This may not be the most effective way of analysising this data +# This may not be the most effective way of analysising this data rm(list=ls()) @@ -47,16 +47,16 @@ cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) - foreach(x = x, + foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { + .errorhandling = 'pass') %dopar% { f <- paste0(dd,'shapefiles/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom.shp') UK.shape <-vect(f) crop.area <- UK.shape[which(UK.shape$nuts118cd==x)] - + var <- c("rainfall", "tasmax", "tasmin", "tas") - + hads19802010_read_crop_df_write(var = var, fp = paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), name1 = "HadsUK", @@ -64,45 +64,42 @@ registerDoParallel(cl) crop.area=crop.area, cropname=x, rd=rd) - + } - - stopCluster(cl) + + stopCluster(cl) gc() ### Processing CPM - + x <- regioncd - + cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) - - foreach(x = x, + + foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - + .errorhandling = 'pass') %dopar% { + f <- paste0(dd,'shapefiles/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom.shp') UK.shape <-vect(f) crop.area <- UK.shape[which(UK.shape$nuts118cd==x)] - + runs <- c("05", "07", "08", "06") var <- c("tasmax", "tasmin","pr", "tas") rd <- paste0(dd, "Interim/CPM/Data_as_df") - + cpm_read_crop_df_write(runs=runs, var=var, fp=paste0(dd, "Reprojected/UKCP2.2/"), - year1=2060, year2=2080, #Ran sep for each year segment - name1="CPM", crop = T, + year1=2060, year2=2080, #Ran sep for each year segment + name1="CPM", crop = T, crop.area = crop.area, cropname = x, rd=paste0(dd, "Interim/CPM/Data_as_df")) - - + + } - - stopCluster(cl) + + stopCluster(cl) gc() - - - diff --git a/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R b/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R index 7ab0cac2..639aa7be 100644 --- a/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R +++ b/R/LCAT/LCATv_apply_qmapQuant_to_crpd_df_fn.R @@ -1,5 +1,5 @@ #This version of this file was used to process LCAT files. 
Later version of this files will be renamed and split -##Loading data as created in 'Data_Processing_todf.R' +##Loading data as created in 'Data_Processing_todf.R' #Requires library(tidyverse) @@ -11,7 +11,7 @@ apply_bias_correction_to_cropped_df <- function(region, #Region code - needs to var, #Meterological variables Runs){ - i <- region + i <- region for(r in Runs){ for(v in var){ @@ -26,8 +26,8 @@ for(r in Runs){ #subset file list to var obs.var <- obs[grepl(v,obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) @@ -36,12 +36,12 @@ for(r in Runs){ obs.df$x <- NULL obs.df$y <- NULL - #Remove the dates not in the cpm + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - - + + #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) @@ -49,44 +49,44 @@ for(r in Runs){ #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - #Subset file list to area + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - #Load in + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - - gc() - + + gc() + yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - #Load in + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols @@ -96,32 +96,32 @@ for(r in Runs){ names(proj.df2) <- cpm.proj proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - + remove("proj.df1") remove("proj.df2") ## **2. 
Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_") print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] @@ -146,8 +146,8 @@ for(r in Runs){ library(qmap) qm1.fit <- fitQmapQUANT(obs.df, cal.df, wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") @@ -170,232 +170,232 @@ for(r in Runs){ p <- paste0("checkpoint5", v, "_", i, "_", r, "_") print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - + gc(reset=TRUE) - + } else { - -#### Precipitation - the HADs variable has is called 'rainfall' + +#### Precipitation - the HADs variable has is called 'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" #Subset to Area #HADs grid observational data fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") files <- list.files(fp) obs <- files[grepl(i, files)] - + #subset file list to var obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) - + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - - #Remove the dates not in the cpm + + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - - + + #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) - + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area + + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - + #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in + + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming 
convention runs + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - + gc() - + yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] - + #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in + + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols df[c(3:ncol(df))] }) - + names(proj.df2) <- cpm.proj - + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) - + remove("proj.df1") remove("proj.df2") - + ## **2. Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_") print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] - + ### Transpose the data sets - + #Obs grid should be cols, observations (time) should be rows for linear scaling - + cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - + ## **3. Empirical Quantile Mapping** - + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping p <- paste0("checkpoint2", v, "_", i, "_", r, "_") print(p) - - + + qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. 
- - + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - + ## **4. Save the data** p <- paste0("checkpoint3", v, "_", i, "_", r, "_") print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_") print(p) base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",v, ".RDS")) - + p <- paste0("checkpoint5", v, "_", i, "_", r, "_") print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs"))) - + gc(reset=TRUE) - + } } } } -###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! +###################### Further cropping to the cropped dfs (!) - mostly for Scotland which is too big! cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs to relate to the file name in a unique way to subset var, #Meterological variables - Runs, #Runs in form 'Run08'- matched input + Runs, #Runs in form 'Run08'- matched input N.new.segments,...){ #Numeric, Number of dfs to break down to, eg 4 - - i <- region + + i <- region N.new.segments<- N.new.segments Runs <- Runs var <- var - + for(r in Runs){ for(v in var){ for(y in 1:N.new.segments){ if(v!="pr"){ dd <- "/mnt/vmfileshare/ClimateData/" - + #Subset to Area - #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) - + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area + + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - + #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in + + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the 
first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - + #Subset the dataframe iteratively depending on y nrows.seg <- nrow(cal.df)/N.new.segments y_1 <- y-1 @@ -403,176 +403,176 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t nr1 <- round(nrows.seg*y_1) + 1 nr2 <- round(nrows.seg*y) cal.df <- cal.df[nr1:nr2,] - + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] - + #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in + + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols df[c(3:ncol(df))] }) - + names(proj.df2) <- cpm.proj - + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] - + remove("proj.df1") remove("proj.df2") - - + + #HADs grid observational data fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") files <- list.files(fp) obs <- files[grepl(i, files)] - + #subset file list to var obs.var <- obs[grepl(v,obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) - + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - + #Subset to the rows which are in above (some will be missing) obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] - - #Remove the dates not in the cpm + + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - + gc() - - + + ## **2. Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] - + ### Transpose the data sets - + #Obs grid should be cols, observations (time) should be rows for linear scaling - + cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - - + + ## **3. 
Empirical Quantile Mapping** - + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. doQmapQUANT uses these estimates to perform #quantile mapping p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) - + library(qmap) qm1.fit <- fitQmapQUANT(obs.df, cal.df, wet.day = FALSE, - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - + ## **4. Save the data** p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) - + gc(reset=TRUE) - - + + } else { - - #### Precipitation - the HADs variable has is called 'rainfall' + + #### Precipitation - the HADs variable has is called 'rainfall' dd <- "/mnt/vmfileshare/ClimateData/" - + #Subset to Area - #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads + #Load cpm first and then use this to subset the latter as there are more cells in cpm that hads #Using 1980 - 2010 as calibration period fp <- paste0(dd, "Interim/CPM/Data_as_df/") cpm.files <- list.files(fp) - + #Calibration years 1980 - 2010 - load in full one for 1980 - 2000 cpm.cal <- cpm.files[grepl("1980|2000", cpm.files)] - - #Subset file list to area + + #Subset file list to area cpm.cal <- cpm.cal[grepl(i, cpm.cal)] - + #subset to var and run cpm.cal.var <- cpm.cal[grepl(v, cpm.cal)&grepl(r, cpm.cal)] - - #Load in + + #Load in cal.df <- lapply(cpm.cal.var, function(x){ df <- fread(paste0(fp, x)) df <- as.data.frame(df) - + row.names(df)<- paste0(df$x, "_", df$y) df$x <- NULL df$y <- NULL return(df) }) - + cal.df <- cal.df %>% reduce(cbind) - - - #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here - #Keep all of the files with these years - because the naming convention runs + + + #Sub out beyond cal period (2010 - 2020) - ie just keep the calibration here + #Keep all of the files with these years - because the naming convention runs #from Nov to following year we need to just take the first 30 days of the one starting with 20091201- n2 <- min(grep("20091201-",names(cal.df))) + 29 - - #This is the first part of the validation dataset, but all the val will be added to the projection df for + + #This is the first part of the validation dataset, but all the val will be added to the projection df for #the sake of bias correction and assessed separately proj.df1 <- cal.df[c((n2+1):ncol(cal.df))] cal.df <- cal.df[c(1:n2)] - + #Subset the dataframe iteratively depending on y nrows.seg <- nrow(cal.df)/N.new.segments 
y_1 <- y-1 @@ -580,138 +580,136 @@ cropdf_further_apply_bc_to_cropped_df <- function(region, #Region code - needs t nr1 <- round(nrows.seg*y_1) + 1 nr2 <- round(nrows.seg*y) cal.df <- cal.df[nr1:nr2,] - - + + #proj data yi <- paste0(i,c(2020,2040,2060), collapse="|") cpm.proj <- cpm.files[grepl(yi, cpm.files)] - + #Subset to Area, var and run cpm.proj <- cpm.proj[grepl(i, cpm.proj)&grepl(v, cpm.proj)&grepl(r, cpm.proj)] - - #Load in + + #Load in proj.df2 <- lapply(cpm.proj, function(x){ df <- as.data.frame(fread(paste0(fp, x))) #Remove x and y cols df[c(3:ncol(df))] }) - + names(proj.df2) <- cpm.proj - + proj.df <- c(list(proj.df1), proj.df2) %>% reduce(cbind) proj.df <- proj.df[which(row.names(proj.df)%in%row.names(cal.df)),] - + remove("proj.df1") remove("proj.df2") - - + + #HADs grid observational data fp <- paste0(dd, "Interim/HadsUK/Data_as_df/") files <- list.files(fp) obs <- files[grepl(i, files)] - + #subset file list to var obs.var <- obs[grepl("rainfall",obs)] - - #subset to calibration years + + #subset to calibration years obs.varc <- obs.var[grepl("1980", obs.var)] obs.df <- fread(paste0(fp, obs.varc)) obs.df <- as.data.frame(obs.df) - + row.names(obs.df) <- paste0(obs.df$x, "_", obs.df$y ) obs.df$x <- NULL obs.df$y <- NULL - + #Subset to the rows which are in above (some will be missing) obs.df <- obs.df[which(row.names(obs.df)%in%row.names(cal.df)),] - - #Remove the dates not in the cpm + + #Remove the dates not in the cpm ## find col position of the first cpm date 19801201 n1 <-min(grep("19801201", names(obs.df))) obs.df <- obs.df[c(n1:ncol(obs.df))] - + gc() - - + + ## **2. Wrangle the data** - + #missing.in.hads.cpm.cal <- cal.df[-which(row.names(cal.df)%in%row.names(obs.df)),] #missing.in.hads.cpm.proj <- proj.df[-which(row.names(proj.df)%in%row.names(obs.df)),] - - + + cal.df <- cal.df[which(row.names(cal.df)%in%row.names(obs.df)),] proj.df <- proj.df[which(row.names(proj.df)%in%row.names(obs.df)),] - - #save the missing outputs + + #save the missing outputs p <- paste0("checkpoint1", v, "_", i, "_", r, "_",y) print(p) #write.csv(missing.in.hads.cpm.cal, paste0(dd, "Debiased/R/QuantileMapping/missing.in.hads/",r,"_",i,"_",v, ".csv")) - + ### Update obs data to 360 days - - #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted - + + #The below is a work around with the HADS dataset having 365 days on leap years - this is to be updateed and corrected when the 360 day sampling is better sorted + #Convert obs to 360 day year - has 40 more vars so remove the ones not in cal remove <- c("0229_29", "0430_30", "0731_31", "0930_30", "1130_30") remove <- paste0(remove, collapse = "|") - + obs.df <- obs.df[,!grepl(remove, names(obs.df))] #This still pulls in the 31st Dec 2009 for some reason is in the hads so manual remove obs.df <- obs.df[1:ncol(cal.df)] - + ### Transpose the data sets - + #Obs grid should be cols, observations (time) should be rows for linear scaling - + cal.df <- t(cal.df) proj.df <- t(proj.df) obs.df <- t(obs.df) - - + + ## **3. Empirical Quantile Mapping** - + #(from qmap vignette) - fitQmapQUANT estimates values of the empirical cumulative distribution function of observed and #modelled time series for regularly spaced quantiles. 
doQmapQUANT uses these estimates to perform #quantile mapping p <- paste0("checkpoint2", v, "_", i, "_", r, "_",y) print(p) - + qm1.fit <- fitQmapQUANT(obs.df, cal.df, - wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. - qstep = 0.01, - nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. - - + wet.day = TRUE, #If wet.day=TRUE the empirical probability of nonzero observations is found (obs>=0) and the corresponding modelled value is selected as a threshold. All modelled values below this threshold are set to zero. If wet.day is numeric the same procedure is performed after setting all obs to zero. + qstep = 0.01, + nboot = 1) #nboot number of bootstrap samples used for estimation of the observed quantiles. + + qm1.hist.a <- doQmapQUANT(cal.df, qm1.fit, type="linear") qm1.hist.b <- doQmapQUANT(cal.df, qm1.fit, type="tricub") - + qm1.proj.a <- doQmapQUANT(proj.df, qm1.fit, type="linear") qm1.proj.b <- doQmapQUANT(proj.df, qm1.fit, type="tricub") - - + + ## **4. Save the data** p <- paste0("checkpoint3", v, "_", i, "_", r, "_", y) print(p) # Save data - lists of dfs for now (will be easier for assessment) results.L <- list(obs.df, cal.df, proj.df, qm1.hist.a, qm1.hist.b, qm1.proj.a, qm1.proj.b) - + names(results.L) <- c("t.obs", "t.cal", "t.proj", "qm1.hist.a", "qm1.hist.b", "qm1.proj.a", "qm1.proj.b") p <- paste0("checkpoint4", v, "_", i, "_", r, "_", y) print(p) base::saveRDS(results.L, file = paste0(dd, "Debiased/R/QuantileMapping/resultsL", r,"_",i,"_",y,"_",v, ".RDS")) - + p <- paste0("checkpoint5", v, "_", i, "_", r, "_", y) print(p) rm(list=setdiff(ls(), c("v", "i", "r", "var", "Runs", "y", "N.new.segments"))) - + gc(reset=TRUE) - - - + + + } } } } } - - diff --git a/R/README.md b/R/README.md index 390d95b9..6552b697 100644 --- a/R/README.md +++ b/R/README.md @@ -1,13 +1,13 @@ # Methods implemented in R -- **/Resampling** - code for Resampling data to different extents (grid sizes) +- **/Resampling** - code for Resampling data to different extents (grid sizes) - **/bias-correction-methods** - bias correction methods implemented in R -- **/comparing-r-and-python** - Comparing various pipeline aspects between R and python +- **/comparing-r-and-python** - Comparing various pipeline aspects between R and python ``` r -og_c <- terra::crop(og, scotland, snap="in") +og_c <- terra::crop(og, scotland, snap="in") plot(og_c$tasmax_1) ``` @@ -181,7 +181,7 @@ Ok there are some differences that I can see from the plot between the two resampled files! 
``` r -## Cropping to a small area to compare with the same orginal HADS file +## Cropping to a small area to compare with the same orginal HADS file i <- rast() ext(i) <- c(200000, 210000, 700000, 710000) ``` @@ -246,7 +246,7 @@ plot(og_ci$tasmax_1) ![](WIP-Comparing-HADs-grids_files/figure-gfm/unnamed-chunk-10-1.png) ``` r -ukcp_c <- terra::crop(ukcp.r, i) +ukcp_c <- terra::crop(ukcp.r, i) plot(ukcp_c$`tasmax_rcp85_land-cpm_uk_2.2km_01_day_19991201-20001130_31`) ``` diff --git a/R/pre-processing/converting_city_crops_to_df.R b/R/pre-processing/converting_city_crops_to_df.R index df9cee3c..7fa85d30 100644 --- a/R/pre-processing/converting_city_crops_to_df.R +++ b/R/pre-processing/converting_city_crops_to_df.R @@ -2,12 +2,12 @@ rm(list=ls()) -# Most input to bias correction methods in R need dfs +# Most input to bias correction methods in R need dfs # Might be better in future to seperate it out differently (ie not run hads and cpm in same loop, or by variable ) library(qmap) library(terra) -library(tidyverse) +library(tidyverse) library(doParallel) dd <- "/mnt/vmfileshare/ClimateData/" @@ -19,55 +19,55 @@ x <- c("London", "Manchester", "Glasgow") # Where to write the results (note subfolder added as name of city/x above) rd <- paste0(dd, "Interim/CPM/Data_as_df/three.cities/") -#file locations for cropped versions of CPM +#file locations for cropped versions of CPM fps <- paste0(dd, "Cropped/three.cities/CPM/") fps <- paste0(fps,x) -#Run the conversion to df in parallel +#Run the conversion to df in parallel cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) -foreach(x = x, +foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - + .errorhandling = 'pass') %dopar% { + fp <- fps[grepl(x, fps)] fp <- paste0(fp, "/") files <- list.files(fp) files.paths.all <- paste0(fp, files) - + #group in runs and in variable - + for(v in var){ for(r in run){ rr <- paste0("_",r,"_") files.paths <- files.paths.all[grepl(v, files.paths.all)& grepl(rr, files.paths.all)&grepl("CPM", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] rast <- rast(p1) - rdf1 <- as.data.frame(rast, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.paths), function(i){ - p <- files.paths[[i]] + p <- files.paths[[i]] rast <- rast(p) - rdf <- as.data.frame(rast) + rdf <- as.data.frame(rast) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + fn <- paste0(rd, x, "/", v, "_","Run",r, ".csv") write.csv(df, fn, row.names = F) - - - } + + + } } } -stopCluster(cl) +stopCluster(cl) gc() @@ -78,74 +78,74 @@ rd <- paste0(dd, "Interim/HadsUK/Data_as_df/three.cities/") fps <- paste0(dd, "Cropped/three.cities/Hads.updated360/") fps <- paste0(fps,x) -#Run the conversion to df in parallel +#Run the conversion to df in parallel cores <- detectCores() cl <- makeCluster(cores[1]-1) registerDoParallel(cl) -foreach(x = x, +foreach(x = x, .packages = c("terra", "tidyverse"), - .errorhandling = 'pass') %dopar% { - + .errorhandling = 'pass') %dopar% { + fp <- fps[grepl(x, fps)] fp <- paste0(fp, "/") files <- list.files(fp) files.paths.all <- paste0(fp, files) - + #group in runs and in variable - + for(v in var){ if(v!="pr"){ - + files.paths <- files.paths.all[grepl(v, files.paths.all)] - - # Read in 1st 
runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] rast <- rast(p1) - rdf1 <- as.data.frame(rast, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.paths), function(i){ - p <- files.paths[[i]] + p <- files.paths[[i]] rast <- rast(p) - rdf <- as.data.frame(rast) + rdf <- as.data.frame(rast) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + fn <- paste0(rd, x, "/", v, ".csv") write.csv(df, fn, row.names = F) - - + + } else { - + files.paths <- files.paths.all[grepl("rainfall", files.paths.all)] - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.paths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.paths[[1]] rast <- rast(p1) - rdf1 <- as.data.frame(rast, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(rast, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.paths), function(i){ - p <- files.paths[[i]] + p <- files.paths[[i]] rast <- rast(p) - rdf <- as.data.frame(rast) + rdf <- as.data.frame(rast) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - - + + fn <- paste0(rd, x, "/", v, ".csv") write.csv(df, fn, row.names = F) - + } } } -stopCluster(cl) +stopCluster(cl) gc() diff --git a/README.md b/README.md index ac4a1b22..409dadcf 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ The UK Climate Projections 2018 (UKCP18) dataset offers insights into the potent [HadUK-Grid](https://www.metoffice.gov.uk/research/climate/maps-and-data/data/haduk-grid/haduk-grid) is a comprehensive collection of climate data for the UK, compiled from various land surface observations across the country. This data is organized into a uniform grid to ensure consistent coverage throughout the UK at up to 1km x 1km resolution. The dataset, spanning from 1836 to the present, includes a variety of climate variables such as air temperature, precipitation, sunshine, and wind speed, available on daily, monthly, seasonal, and annual timescales. ### Geographical Dataset -The geographical dataset can be used for visualising climate data. It mainly includes administrative boundaries published by the Office for National Statistics (ONS). The dataset is sharable under the [Open Government Licence v.3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) and is available for download via this [link](https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022/FeatureServer/replicafilescache/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580.zip). We include a copy in the `data/Geofiles` folder for convenience. In addition, the clips for three cities' boundaries from the same dataset are copied to `three.cities` subfolder. +The geographical dataset can be used for visualising climate data. It mainly includes administrative boundaries published by the Office for National Statistics (ONS). 
The dataset is sharable under the [Open Government Licence v.3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) and is available for download via this [link](https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022/FeatureServer/replicafilescache/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580.zip). We include a copy in the `data/Geofiles` folder for convenience. In addition, the clips for three cities' boundaries from the same dataset are copied to `three.cities` subfolder. ## Why Bias Correction? From a704f7cc0f9790c93471a91b3dc43c453acf6bef Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Wed, 22 Nov 2023 16:35:14 +0000 Subject: [PATCH 35/83] feat: add `notebooks/Assessing_bc_Data_local-mac.rmd` --- .../Assessing_BC_Data_local-mac.rmd | 895 ++++++++++++++++++ 1 file changed, 895 insertions(+) create mode 100644 notebooks/Assessing_bc_data/Assessing_BC_Data_local-mac.rmd diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data_local-mac.rmd b/notebooks/Assessing_bc_data/Assessing_BC_Data_local-mac.rmd new file mode 100644 index 00000000..213547d1 --- /dev/null +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data_local-mac.rmd @@ -0,0 +1,895 @@ +--- +title: "Bias correction assessment" +author: "Ruth Bowyer" +date: "`r format(Sys.Date())`" +output: + html_document: + theme: cosmo + toc: TRUE + toc_float: TRUE + toc_depth: 4 + code_folding: hide + df_print: paged +--- + + +```{r libs and setup, message=FALSE, warning=F} +rm(list=ls()) + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") + +library(ggplot2) +library(terra) +# library(tmap) #pretty maps +library(RColorBrewer) +# library(tidyverse) +library(kableExtra) + +``` + + +## **0. About** + +This is an example notebook for the assessment of bias corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'. + +**Input data** + +This script requires the following data: + +- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction +- 'obs.val' - as above, for the *validation* period +- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period +- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period +- 'cpm.val.raw' - the raw (uncorrected) data for the *valibration* period +- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *valibration* period +- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional) +- 'cpm.proj.radj' - the adjusted (bias-corrected) data for the *future/projected* period (optional) + +The data is required in raster format and dataframe formats + +**Calibration vs Validation dates** + +The calibration period runs between 01-12-1980 to the day prior to 01-12-2010 +The validation period runs between 01-12-2010 to the day prior to 01-12-2020 + +```{r data loading, include=FALSE} + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") + +#This chunk attempts to apply the conversion to python output data to a form that this script will also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script? 
+ +dd <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script +if(Sys.info()["sysname"] == "Darwin"){ + dd <- "/Volumes/vmfileshare/ClimateData/" +} + +input <- "raster" #Either df or raster -- R outputs are a group of dfs in list form saved as an RDS, python input is a raster +city <- "Glasgow" +var <- "tasmax" +runs <- c("05", "06", "07", "08") + +if(input=="raster"){ + +####### PYTHON INPUTS HERE ###### + # This script uses both raster data and the raw data + # This script uses Lists to group everything by runs + # Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) + # .nc and .tif files can be read with rast("path/to/file.nc") + # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files + #dfs are assumed to be cells x time + r <- list.files(paste0(dd, "Debiased/three.cities.cropped/Manchester/08/tasmax/"), full.names = T)[1] + rast <- rast(r) + crs <- crs(rast) + + ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). + rd <- "Debiased/R/QuantileMapping/three.cities/" + +} else if (input=="RDS"){ + ### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved to a different script. + ## Load a source raster to extract the crs + r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1] + rast <- rast(r) + crs <- crs(rast) + + ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). + rd <- "Debiased/R/QuantileMapping/three.cities/" + + files <- list.files(paste0(dd,rd,city),full.names=T) + files.v <- files[grepl(var, files)] + + allruns <- lapply(files.v, readRDS) + + names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) + names(allruns) <- names + + #This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. 
Therefore: + obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) + + obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020 + + cpm.cal.raw.df.L <- lapply(allruns, function(L){ + as.data.frame(t(L[["t.cal"]])) + }) + + #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) + cpm.val.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + cpm.val.raw.df <- proj[,1:val.end.date] + }) + + cpm.proj.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] + }) + + cpm.cal.adj.df.L <- lapply(allruns, function(L){ + adj <- as.data.frame(t(L[["qm1.hist"]])) + }) + + cpm.val.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + proj[,1:val.end.date] + }) + + cpm.proj.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + proj[,val.end.date:ncol(proj)] + }) + + ## Convert to rasters --requires creation of x and y cols from row names +## For the comparison, just converting the observation and cpm for the cal and val perios (ie not the projection datasets) + +``` + +```{r data processing, include=FALSE} + +obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){ + rn <- row.names(i) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, i) + r <- rast(df, type="xyz") + crs(r) <- crs + return(r) +}) + +names(obsrastL) <- c("obs.cal.rasts", "obs.val.rasts") +list2env(obsrastL, .GlobalEnv) +remove(obsrastL) + +list2rast <- list(cpm.cal.raw.df.L, cpm.cal.adj.df.L, cpm.val.raw.df.L, cpm.val.adj.df.L) + +rastsL <- lapply(list2rast, function(x){ + allruns <- x + df.rL <- lapply(runs, function(i){ + df <- allruns[[grep(i, names(allruns))]] #extract df based on run id + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df) + r <- rast(df, type="xyz") + crs(r) <- crs + return(r) + }) + names(df.rL) <- runs + return(df.rL) + }) + +names(rastsL) <- c("cpm.cal.raw.rasts.L", "cpm.cal.adj.rasts.L", "cpm.val.raw.rasts.L", "cpm.val.adj.rasts.L") + +list2env(rastsL, .GlobalEnv) + +remove(rastsL) +remove(list2rast) + +gc() + + } else { + print("Invalid input") +} + + + +``` + + +## **1. Bias Correction Assessment: trends** + +An visual comparison of trends across observation, raw and adjusted data for the same time period + +### **1a. 
Raster comparison** + +Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days + +Adding in the city shapeoutline for prettier maps + +```{r} + +shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city, ".shp"))) + +``` + + + +#### **Day 1 - 1980-12-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[1]]) + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[1]]) + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### **Day 2 - 2008-08-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, 
bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + + +#### **Day 3 - 2015-05-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.val.raw.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### {-} + +### **1b. 
Trend comparison** + +```{r} +#Lists of dfs to summarise the means of +dfL <- c(list(obs.cal.df), cpm.cal.raw.df.L, cpm.cal.adj.df.L) +names(dfL) <- c("obs.cal", paste0("cpm.cal.raw.", names(cpm.cal.raw.df.L)), + paste0("cpm.cal.adj.", names(cpm.cal.raw.df.L))) + +#Returns a list of dfs in handy format for graphing +dfg.daily.means <- lapply(dfL, function(i){ + x <- 1:ncol(i) #ignore cols 1 & 2 with x y + #Calc mean and sd + dfx <- lapply(x, function(x){ + y <- i[,x] + mean <- mean(y, na.rm=T) + sd <- sd(y, na.rm=T) + dfr <- data.frame(mean=mean, + sd.high=mean+sd, + sd.low=mean-sd) + dfr$day <- names(i)[x] + return(dfr) + }) + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + +names(dfg.daily.means) <- names(dfL) +``` + +*Note* : Can add a plot here for daily averages but it's quite visually confusing so omitting for now + +#### **Seasonal trends - Calibration period ** + + +```{r} + +#Annotate season based on month index - the dates have different formats depending on the input data (ie hads vs cpm) so am pulling out the necessary to adjust sep + +obs.cal.season.mean <- dfg.daily.means$obs.cal + +x <- dfg.daily.means$obs.cal$day + +obs.cal.season.mean$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#Note = the n days per season is not quite evenly split between the 4 seasons because of how the hads resamples across the year for 360 days + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + rem <- nchar(var) + 39 + year <- substr(x, rem, rem+3) + year <- as.numeric(substr(year, 1,4)) + obs.cal.season.mean$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, obs.cal.season.mean$season), + paste0(year, obs.cal.season.mean$season)) + # Mutate to a seasonal mean df + obs.cal.season.mean <- aggregate(obs.cal.season.mean[[1]], list(obs.cal.season.mean[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x))) + obs.cal.season.mean<- data.frame(season_year=obs.cal.season.mean$Group.1, + seasonal.mean=obs.cal.season.mean$x[,"seasonal.mean"], + sd.high.seasonal = obs.cal.season.mean$x[,"sd.high.seasonal"], + sd.low.seasonal = obs.cal.season.mean$x[,"sd.low.seasonal"]) + + + #Grouping variable for later vars + obs.cal.season.mean$model <- "obs" + +``` + +## Ruth to finish cleaning up this bit (it won't run at the moment) + +```{r eval=FALSE, include=FALSE} + dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans", + "bc.a.cal.daymeans"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, + paste0(year-1, df$season), + paste0(year, df$season)) + + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - 
sd(x))) + + df2 <- data.frame(season_year=df2$Group.1, + seasonal.mean=df2$x[,"seasonal.mean"], + sd.high.seasonal = df2$x[,"sd.high.seasonal"], + sd.low.seasonal = df2$x[,"sd.low.seasonal"]) + + df2$model <- gsub(".daymeans","",i) + + return(df2)}) + + dff <- c(list(obs.seasonal.mean.df), dfg.seasonal.mean) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + +names(seasonal.means) <- runs + +seasonal.means.df <- seasonal.means %>% reduce(rbind) + +``` + +#### Fig. Calibration period - seasonal mean + +```{r eval=FALSE, include=FALSE} + +ggplot(seasonal.means.df, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +##### *Summer only* + +```{r Raw seasonal winter, eval=FALSE, include=FALSE} + +dfg_sm<- subset(seasonal.means.df, grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +#### *Annual trends - seasonal max* + +For tasmax - grouping to season and calculating the seasonal maxima vals (i.e. rather than means above) + +```{r eval=FALSE, include=FALSE} + +#Convert to max, out put a df in easy fig format +dfg.max <- lapply(runs, function(r){ + L <- df.rL[[r]] + names(L)[1:3] <- c("obs", "cal", "proj") + dfg <- lapply(names(L), function(ii){ + dfi <- L[[ii]] + x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y + #Calc maxima of the + dfx <- lapply(x, function(x){ + xx <- dfi[,x] + data.frame(max=max(xx, na.rm=T), day= names(dfi)[x]) + }) + + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + names(dfg) <- paste0(names(L), ".max") + return(dfg) +}) + +names(dfg.max) <- runs + +seasonal.max.cal <- lapply(runs, function(r){ + dfg <- dfg.max[[r]] + #Hads/obs df + df1 <- dfg$obs.max + x <- df1$day + df1$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub("^[^_]*_", "", x) + year <- as.numeric(substr(year, 1,4)) + df1$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, df1$season), + paste0(year, df1$season)) + # Mutate to a seasonal mean df + obs.seasonal.max.df <- aggregate(df1[[1]], list(df1[["season_year"]]), max) + #Grouping variable for later vars + obs.seasonal.max.df$model <- "obs" + + dfg.seasonal.max <- lapply(c("cal.max", "qm1.hist.a.max", + "qm1.hist.b.max"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + #Create 
season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, + paste0(year-1, df$season), + paste0(year, df$season)) + + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), max) + + df2$model <- gsub(".max","",i) + + return(df2)}) + + dff <- c(list(obs.seasonal.max.df), dfg.seasonal.max) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + +names(seasonal.max.cal) <- runs + +seasonal.maxima.df <- seasonal.max.cal %>% reduce(rbind) +names(seasonal.maxima.df) <- c("season_year", "max", "model", "Run") +``` + +#### Fig. Calibration period - seasonal max + +```{r eval=FALSE, include=FALSE} + +ggplot(seasonal.maxima.df, aes(season_year, max, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Max daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + +#### Fig. Calibration period - *Summer only* + +```{r eval=FALSE, include=FALSE} + +dfg_sm<- subset(seasonal.maxima.df, !grepl("qm1.hist.b", model)&grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Historic trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +#### *Validation period - annual trends - seasonal mean* + +(To be added) + +#### *Validation period - annual trends - seasonal max* + +(To be added) + +## **2. Bias Correction Assessment: Metrics** + +Using the validation data set for this + + +```{r} + +val.dfs <- c(list(obs.val.df), cpm.val.raw.df.L, cpm.val.adj.df.L) + +#Convert dfs to a vector +val.dfs.v <- lapply(val.dfs, function(d){ + #Convert to single vector + unlist(as.vector(d))}) + +val.dfs.v.df <- as.data.frame(val.dfs.v) +names(val.dfs.v.df) <- c("obs.val", paste0("Run", rep(runs, 2), "_", var, ".",rep(c("raw", "adj", 4)))) # Names for easy reference + +``` + + +### **2a. Descriptive statistics** + +```{r descriptives validation} + +descriptives <- apply(val.dfs.v.df, 2, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) +}) + +descriptives <- descriptives %>% reduce(rbind) +row.names(descriptives) <- names(val.dfs.v.df) +d <- t(descriptives) + +d %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(d)), background = "lightgrey") + +``` + + +#### Fig.Density plot of validation period + +**Note** - need to add back in some facetting to this fig + +```{r warning=F, message=F} +m <- reshape2::melt(val.dfs.v.df) + +ggplot(m, aes(value, fill=variable, colour=variable)) + + geom_density(alpha = 0.3, position="identity") + + theme_minimal() + + scale_fill_brewer(palette = "Set1") + + scale_color_brewer(palette = "Set1") + +``` + +### **2b. 
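Because empirical quantile mapping acts on the distribution, an empirical CDF view is a useful companion to the density plot. A minimal sketch reusing the melted data frame `m` from the chunk above; the ECDF comparison itself is an addition, not part of the original assessment:

```{r warning=F, message=F}
# Empirical CDFs of the validation-period series; step curves closer to obs.val indicate better-calibrated quantiles
ggplot(m, aes(value, colour=variable)) +
  stat_ecdf(geom = "step", alpha = 0.7) +
  theme_minimal() +
  scale_color_brewer(palette = "Set1")
```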
Model fit statistics** + +Using the following to assess overall fit: + +- **R-squared (rsq)** +- **Root Square Mean Error (RMSE)** +- **Nash-Sutcliffe Efficiency (NSE):** Magnitude of residual variance compared to measured data variance, ranges -∞ to 1, 1 = perfect match to observations +- **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias. + +```{r rsq} +actual <- val.dfs.v.df$obs.val + +rsq <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + cor(actual, x)^2 +}) + +``` + +```{r rmse} + +rmse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + sqrt(mean((actual - x)^2)) +}) + +``` + +```{r pbias} + +pbias <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::pbias(x, actual) +}) + +``` + +```{r nse} +nse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::NSE(x, actual) +}) + +``` + +Highlighting the bias corrected statistics + +```{r pretty kable} + +k <- cbind(rsq, rmse, pbias, nse) +k %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(k)), background = "lightgrey") + +``` + + +## **3. Bias Correction Assessment: Metric specific - tasmax** + +### **3b Days above 30 degrees** + +(Not considered consecutively here) + +```{r eval=FALSE, include=FALSE} + +### Ruth to update + +val.dfs.v.df$year <- substr(row.names(val.dfs.v.df), 8,11) + +over30 <- lapply(names(val.dfs.v.df), function(i){ + x <- val.dfs.v.df[,i] + df <- aggregate(x, list(val.dfs.v.df$year), function(x){sum(x>=30)}) + names(df) <- c("year", paste0("Days.over.30.", i)) + return(df) +}) + +over30 %>% reduce(left_join, "year") +``` + + +### **Number of heatwaves per annum** + +(to be added) + +#### **For future work** + +The number of quantiles selected will effect the efficacy of the bias correction: lots of options therefore with this specific method + + From 30ab29405e9398969a927dafc0b981f999cf1838 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Thu, 23 Nov 2023 01:50:27 +0000 Subject: [PATCH 36/83] fix(ci): apply `pre-commit` change to remainder of `R/` --- R/misc/Cropping_Rasters_to_three_cities.R | 24 +- R/misc/HAD.CRS.for.sourcing.R | 4 +- R/misc/Identifying_Runs.RMD | 265 +++++++++-------- R/misc/Identifying_Runs.md | 344 +++++++++++----------- R/misc/InfillDataRunIdentyfing.R | 6 +- R/misc/calc.mean.sd.daily.infill.R | 7 +- R/misc/clim-recal-specific-functions.R | 2 - R/misc/cropping-CPM-to-Scotland.R | 5 +- R/misc/read_crop.fn.R | 112 ++++--- R/misc/read_crop_df_write.fn.R | 152 +++++----- 10 files changed, 455 insertions(+), 466 deletions(-) diff --git a/R/misc/Cropping_Rasters_to_three_cities.R b/R/misc/Cropping_Rasters_to_three_cities.R index 87537d98..f7c206e0 100644 --- a/R/misc/Cropping_Rasters_to_three_cities.R +++ b/R/misc/Cropping_Rasters_to_three_cities.R @@ -1,4 +1,4 @@ -## Crop CPM and HADs +## Crop CPM and HADs rm(list=ls()) #setwd("~/Desktop/clim-recal/clim-recal/") @@ -14,7 +14,7 @@ dd <- "/mnt/vmfileshare/ClimateData/" ## Using the extents (ie full grid) rather than masking and cropping to city outlines -# 1. London +# 1. 
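# Illustrative sketch only (not part of this script): the distinction between cropping to the
# full rectangular extent, as done below, and masking to the city outline. Object names `r` and
# `city.vect` are placeholders for a SpatRaster and a city-boundary SpatVector respectively:
#   r.ext  <- terra::crop(r, terra::ext(city.vect))               # keeps every grid cell in the bounding box
#   r.mask <- terra::mask(terra::crop(r, city.vect), city.vect)   # additionally sets cells outside the outline to NA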
London f <- paste0(dd,'shapefiles/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom_2022_7279368953270783580/NUTS_Level_1_January_2018_FCB_in_the_United_Kingdom.shp') UK.shape <-vect(f) London <- UK.shape[which(UK.shape$nuts118cd=="UKI")] @@ -57,8 +57,8 @@ ext.L <- list(London.ext, Glasgow.ext, Manchester.ext) names(ext.L) <- cities lapply(cities, function(x){ - - cpm_read_crop(runs=runs, var = var, + + cpm_read_crop(runs=runs, var = var, fp = paste0(dd, "Reprojected_infill/UKCP2.2/"), rd = paste0(dd, "Cropped/three.cities/CPM/"), crop.area=ext.L[[x]], @@ -71,26 +71,24 @@ lapply(cities, function(x){ var <- c("tasmax", "tasmin", "rainfall") lapply(cities, function(x){ - - hads_read_crop(var = var, - fp= paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), + + hads_read_crop(var = var, + fp= paste0(dd, "Processed/HadsUKgrid/resampled_2.2km/"), rd= paste0(dd, "Cropped/three.cities/Hads.original360/"), file.date="19801201", #Start from the same date as the CPM crop.area=ext.L[[x]], cropname=x) }) -#### HADs - updated 360 calendar +#### HADs - updated 360 calendar var <- c("tasmax", "tasmin", "rainfall") lapply(cities, function(x){ - - hads_read_crop(var = var, - fp= paste0(dd, "Processed/HadsUKgrid/resampled_calendarfix/"), + + hads_read_crop(var = var, + fp= paste0(dd, "Processed/HadsUKgrid/resampled_calendarfix/"), rd= paste0(dd, "Cropped/three.cities/Hads.updated360/"), file.date="19801201", #Start from the same date as the CPM crop.area=ext.L[[x]], cropname=x) }) - - diff --git a/R/misc/HAD.CRS.for.sourcing.R b/R/misc/HAD.CRS.for.sourcing.R index c87146dc..2e0015be 100644 --- a/R/misc/HAD.CRS.for.sourcing.R +++ b/R/misc/HAD.CRS.for.sourcing.R @@ -1,4 +1,4 @@ #crs as extracted from the reprojected HADs files -#this should be updated when hads is reprojected as suspect lots is incorrect +#this should be updated when hads is reprojected as suspect lots is incorrect -HAD.CRS = "PROJCRS[\"undefined\",\n BASEGEOGCRS[\"undefined\",\n DATUM[\"undefined\",\n ELLIPSOID[\"undefined\",6377563.396,299.324961266495,\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]]],\n PRIMEM[\"undefined\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433,\n ID[\"EPSG\",9122]]]],\n CONVERSION[\"unnamed\",\n METHOD[\"Transverse Mercator\",\n ID[\"EPSG\",9807]],\n PARAMETER[\"Latitude of natural origin\",49,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8801]],\n PARAMETER[\"Longitude of natural origin\",-2,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8802]],\n PARAMETER[\"Scale factor at natural origin\",0.9996012717,\n SCALEUNIT[\"unity\",1],\n ID[\"EPSG\",8805]],\n PARAMETER[\"False easting\",400000,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8806]],\n PARAMETER[\"False northing\",-100000,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8807]]],\n CS[Cartesian,2],\n AXIS[\"easting\",east,\n ORDER[1],\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]],\n AXIS[\"northing\",north,\n ORDER[2],\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]]]" \ No newline at end of file +HAD.CRS = "PROJCRS[\"undefined\",\n BASEGEOGCRS[\"undefined\",\n DATUM[\"undefined\",\n ELLIPSOID[\"undefined\",6377563.396,299.324961266495,\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]]],\n PRIMEM[\"undefined\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433,\n ID[\"EPSG\",9122]]]],\n CONVERSION[\"unnamed\",\n METHOD[\"Transverse Mercator\",\n ID[\"EPSG\",9807]],\n PARAMETER[\"Latitude of natural origin\",49,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8801]],\n PARAMETER[\"Longitude of natural origin\",-2,\n 
ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8802]],\n PARAMETER[\"Scale factor at natural origin\",0.9996012717,\n SCALEUNIT[\"unity\",1],\n ID[\"EPSG\",8805]],\n PARAMETER[\"False easting\",400000,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8806]],\n PARAMETER[\"False northing\",-100000,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8807]]],\n CS[Cartesian,2],\n AXIS[\"easting\",east,\n ORDER[1],\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]],\n AXIS[\"northing\",north,\n ORDER[2],\n LENGTHUNIT[\"metre\",1,\n ID[\"EPSG\",9001]]]]" diff --git a/R/misc/Identifying_Runs.RMD b/R/misc/Identifying_Runs.RMD index 1d9de8ef..36db9382 100644 --- a/R/misc/Identifying_Runs.RMD +++ b/R/misc/Identifying_Runs.RMD @@ -18,9 +18,9 @@ library(RColorBrewer) ## **0. About** -Script to identify the mean, 2nd highest and 2nd lowers daily tasmax per UKCP18 CPM run. +Script to identify the mean, 2nd highest and 2nd lowers daily tasmax per UKCP18 CPM run. -These runs will be the focus of initial bias correction focus +These runs will be the focus of initial bias correction focus ## **1. Load Data** @@ -32,7 +32,7 @@ In retrospect the conversion to df might not have been necessary/the most resour As of June 2023, the tasmax-as-dataframe and tasmax daily means and the df data is located in `vmfileshare/Interim/tasmax_dfs/` -There is an error in the naming convention - Y00_Y20 should be Y01 to reflect the infill data time period (although this does cover a breif period of 2000) - to be updated in future +There is an error in the naming convention - Y00_Y20 should be Y01 to reflect the infill data time period (although this does cover a breif period of 2000) - to be updated in future ```{r} @@ -65,7 +65,7 @@ dfL <- lapply(i, function(i){ fp <- fp[grepl(i, fp)] dfs <- lapply(fp, read.csv) n <- namesL[[paste0("names_",i)]] - names(dfs) <- n + names(dfs) <- n return(dfs) }) @@ -79,10 +79,10 @@ list2env(dfL, .GlobalEnv) ```{r} -Y <- rep(c(1981:2000), each=360) +Y <- rep(c(1981:2000), each=360) dfs_hist <- lapply(names_hist, function(i){ - df <- dfs_hist[[i]] + df <- dfs_hist[[i]] names(df) <- c("day", "mean", "sd") df$model <- i df$dn <- 1:nrow(df) @@ -90,26 +90,26 @@ dfs_hist <- lapply(names_hist, function(i){ return(df) }) -#Create a single df in long form of Runs for the historical period +#Create a single df in long form of Runs for the historical period historical_means <- dfs_hist %>% reduce(rbind) - + ``` ### **Time series - daily** ```{r fig.height=8} -ggplot(historical_means) + +ggplot(historical_means) + geom_line(aes(x=dn, y=mean, group=model, colour=model)) + - theme_bw() + xlab("Day (Historical 1980 - 2000)") + - ylab("Daily mean max temp (tasmax) oC") + + theme_bw() + xlab("Day (Historical 1980 - 2000)") + + ylab("Daily mean max temp (tasmax) oC") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), - legend.position = "none") + - facet_wrap(.~ model, ncol=3) + legend.position = "none") + + facet_wrap(.~ model, ncol=3) ``` @@ -118,7 +118,7 @@ ggplot(historical_means) + ### **boxplot - mean historical** ```{r} -#Create a pallete specific to the runs so when reordered maintain the same colours +#Create a pallete specific to the runs so when reordered maintain the same colours historical_means$model <- as.factor(historical_means$model) c <- brewer.pal(12, "Paired") my_colours <- setNames(c, levels(historical_means$model)) @@ -128,10 +128,10 @@ my_colours <- setNames(c, levels(historical_means$model)) ```{r} -historical_means %>% 
+historical_means %>% mutate(model = fct_reorder(model, mean, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -157,40 +157,40 @@ ggplot(historical_means, aes(sample=mean, colour=factor(model))) + ```{r message=FALSE} -#Aggregating to year for annual average +#Aggregating to year for annual average historical_means$Yf <- as.factor(historical_means$Y) -historical_means_y <- historical_means %>% +historical_means_y <- historical_means %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T)) ``` ```{r} -ggplot(historical_means_y) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, +ggplot(historical_means_y) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (Historical 1980 - 2000)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (Historical 1980 - 2000)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ```{r} # Plotting with SDs in geom_ribbon to see if anything wildely different -ggplot(historical_means_y) + - geom_ribbon(aes(as.numeric(Yf), y=mean.annual, +ggplot(historical_means_y) + + geom_ribbon(aes(as.numeric(Yf), y=mean.annual, ymin = mean.annual - sd.annual, ymax= mean.annual + sd.annual, - fill=model), alpha=0.4) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, + fill=model), alpha=0.4) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (Historical 1980 - 2000)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (Historical 1980 - 2000)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), @@ -202,10 +202,10 @@ ggplot(historical_means_y) + ```{r} -historical_means_y %>% +historical_means_y %>% mutate(model = fct_reorder(model, mean.annual, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max daily max temp oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -231,7 +231,7 @@ ggplot(historical_means_y, aes(sample=mean.annual, colour=factor(model))) + ```{r message=FALSE} -historical_max_y <- historical_means %>% +historical_max_y <- historical_means %>% group_by(Yf, model) %>% dplyr::summarise(max=max(mean, na.rm=T)) ``` @@ -240,14 +240,14 @@ historical_max_y <- historical_means %>% ```{r} ggplot(historical_max_y) + - geom_line(aes(x = as.numeric(Yf), y=max, + geom_line(aes(x = as.numeric(Yf), y=max, color=model)) + - theme_bw() + xlab("Year (Historical 1980 - 2000)") + - ylab("Annual max of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (Historical 1980 - 2000)") + + ylab("Annual max of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", 
name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` @@ -257,10 +257,10 @@ ggplot(historical_max_y) + ```{r} -historical_max_y %>% +historical_max_y %>% mutate(model = fct_reorder(model, max, .fun='median')) %>% - ggplot(aes(x=reorder(model, max), y=max, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, max), y=max, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max of mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -304,7 +304,7 @@ Max vals are different but based on means then selection would be Run 02 (2nd lo ```{r} -Y <- rep(c(2021:2040), each=360) +Y <- rep(c(2021:2040), each=360) dfs_Y21_Y40 <- lapply(names_Y21_Y40, function(i){ @@ -316,27 +316,27 @@ dfs_Y21_Y40 <- lapply(names_Y21_Y40, function(i){ return(df) }) -#Create a single df in long form of Runs for the Y21_Y40 period +#Create a single df in long form of Runs for the Y21_Y40 period Y21_Y40_means <- dfs_Y21_Y40 %>% reduce(rbind) - + ``` ### **Time series - daily** ```{r} -ggplot(Y21_Y40_means) + +ggplot(Y21_Y40_means) + geom_line(aes(x=dn, y=mean, group=model, colour=model)) + # Removing sd ribbon for ease of viewing - #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + - theme_bw() + xlab("Daily (1980 - 2000)") + - ylab("Daily mean max temp (tasmax) oC") + + #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + + theme_bw() + xlab("Daily (1980 - 2000)") + + ylab("Daily mean max temp (tasmax) oC") + #scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), - legend.position = "none") + - facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) + legend.position = "none") + + facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) ``` @@ -345,7 +345,7 @@ ggplot(Y21_Y40_means) + ### **boxplot - mean Y21_Y40** ```{r} -#Create a pallete specific to the runs so when reordered maintain the same colours +#Create a pallete specific to the runs so when reordered maintain the same colours Y21_Y40_means$model <- as.factor(Y21_Y40_means$model) c <- brewer.pal(12, "Paired") my_colours <- setNames(c, levels(Y21_Y40_means$model)) @@ -355,10 +355,10 @@ my_colours <- setNames(c, levels(Y21_Y40_means$model)) ```{r} -Y21_Y40_means %>% +Y21_Y40_means %>% mutate(model = fct_reorder(model, mean, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -384,40 +384,40 @@ ggplot(Y21_Y40_means, aes(sample=mean, colour=factor(model))) + ```{r message= FALSE, warning=FALSE} -#Aggregating to year for annual average +#Aggregating to year for annual average Y21_Y40_means$Yf <- as.factor(Y21_Y40_means$Y) -Y21_Y40_means_y <- Y21_Y40_means %>% +Y21_Y40_means_y <- Y21_Y40_means %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T)) ``` ```{r} -ggplot(Y21_Y40_means_y) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, +ggplot(Y21_Y40_means_y) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2021 - 2040)") 
+ - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2021 - 2040)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ```{r} # Plotting with SDs in geom_ribbon to see if anything wildely different -ggplot(Y21_Y40_means_y) + - geom_ribbon(aes(as.numeric(Yf), y=mean.annual, +ggplot(Y21_Y40_means_y) + + geom_ribbon(aes(as.numeric(Yf), y=mean.annual, ymin = mean.annual - sd.annual, ymax= mean.annual + sd.annual, - fill=model), alpha=0.4) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, + fill=model), alpha=0.4) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2021 - 2040)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2021 - 2040)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), @@ -429,10 +429,10 @@ ggplot(Y21_Y40_means_y) + ```{r} -Y21_Y40_means_y %>% +Y21_Y40_means_y %>% mutate(model = fct_reorder(model, mean.annual, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max daily max temp oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -458,7 +458,7 @@ ggplot(Y21_Y40_means_y, aes(sample=mean.annual, colour=factor(model))) + ```{r message=FALSE} -Y21_Y40_max_y <- Y21_Y40_means %>% +Y21_Y40_max_y <- Y21_Y40_means %>% group_by(Yf, model) %>% dplyr::summarise(max=max(mean, na.rm=T)) ``` @@ -467,14 +467,14 @@ Y21_Y40_max_y <- Y21_Y40_means %>% ```{r} ggplot(Y21_Y40_max_y) + - geom_line(aes(x = as.numeric(Yf), y=max, + geom_line(aes(x = as.numeric(Yf), y=max, color=model)) + - theme_bw() + xlab("Year (2021 - 2040)") + - ylab("Annual max of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2021 - 2040)") + + ylab("Annual max of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` @@ -484,10 +484,10 @@ ggplot(Y21_Y40_max_y) + ```{r} -Y21_Y40_max_y %>% +Y21_Y40_max_y %>% mutate(model = fct_reorder(model, max, .fun='median')) %>% - ggplot(aes(x=reorder(model, max), y=max, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, max), y=max, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max of mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -515,7 +515,7 @@ av2$coefficients[order(av2$coefficients)] ``` -Max of means +Max of means ```{r} @@ -535,7 +535,7 @@ Based on this period, the seelction would be: Run 05, Run 03, Run 08, Run 06 (so ```{r} -Y <- rep(c(2061:2080), each=360) +Y <- rep(c(2061:2080), each=360) dfs_Y61_Y80 <- lapply(names_Y61_Y80, function(i){ @@ -547,27 +547,27 @@ dfs_Y61_Y80 <- lapply(names_Y61_Y80, function(i){ return(df) }) -#Create a single df in long form of Runs for the Y61_Y80 period +#Create a single df in long form of Runs for the 
Y61_Y80 period Y61_Y80_means <- dfs_Y61_Y80 %>% reduce(rbind) - + ``` ### **Time series - daily** ```{r} -ggplot(Y61_Y80_means) + +ggplot(Y61_Y80_means) + geom_line(aes(x=dn, y=mean, group=model, colour=model)) + # Removing sd ribbon for ease of viewing - #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + - theme_bw() + xlab("Day (2060 - 2080)") + - ylab("Daily mean max temp (tasmax) oC") + + #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + + theme_bw() + xlab("Day (2060 - 2080)") + + ylab("Daily mean max temp (tasmax) oC") + #scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), - legend.position = "none") + - facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) + legend.position = "none") + + facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) ``` @@ -576,7 +576,7 @@ ggplot(Y61_Y80_means) + ### **boxplot - mean Y61_Y80** ```{r} -#Create a pallete specific to the runs so when reordered maintain the same colours +#Create a pallete specific to the runs so when reordered maintain the same colours Y61_Y80_means$model <- as.factor(Y61_Y80_means$model) c <- brewer.pal(12, "Paired") my_colours <- setNames(c, levels(Y61_Y80_means$model)) @@ -586,10 +586,10 @@ my_colours <- setNames(c, levels(Y61_Y80_means$model)) ```{r} -Y61_Y80_means %>% +Y61_Y80_means %>% mutate(model = fct_reorder(model, mean, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -615,40 +615,40 @@ ggplot(Y61_Y80_means, aes(sample=mean, colour=factor(model))) + ```{r message= FALSE, warning=FALSE} -#Aggregating to year for annual average +#Aggregating to year for annual average Y61_Y80_means$Yf <- as.factor(Y61_Y80_means$Y) -Y61_Y80_means_y <- Y61_Y80_means %>% +Y61_Y80_means_y <- Y61_Y80_means %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T)) ``` ```{r} -ggplot(Y61_Y80_means_y) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, +ggplot(Y61_Y80_means_y) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2061 - 2080)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2061 - 2080)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ```{r} # Plotting with SDs in geom_ribbon to see if anything wildely different -ggplot(Y61_Y80_means_y) + - geom_ribbon(aes(as.numeric(Yf), y=mean.annual, +ggplot(Y61_Y80_means_y) + + geom_ribbon(aes(as.numeric(Yf), y=mean.annual, ymin = mean.annual - sd.annual, ymax= mean.annual + sd.annual, - fill=model), alpha=0.4) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, + fill=model), alpha=0.4) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2061 - 2080)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2061 - 2080)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name 
= "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), @@ -660,10 +660,10 @@ ggplot(Y61_Y80_means_y) + ```{r} -Y61_Y80_means_y %>% +Y61_Y80_means_y %>% mutate(model = fct_reorder(model, mean.annual, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max daily max temp oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -689,7 +689,7 @@ ggplot(Y61_Y80_means_y, aes(sample=mean.annual, colour=factor(model))) + ```{r message=FALSE} -Y61_Y80_max_y <- Y61_Y80_means %>% +Y61_Y80_max_y <- Y61_Y80_means %>% group_by(Yf, model) %>% dplyr::summarise(max=max(mean, na.rm=T)) ``` @@ -698,14 +698,14 @@ Y61_Y80_max_y <- Y61_Y80_means %>% ```{r} ggplot(Y61_Y80_max_y) + - geom_line(aes(x = as.numeric(Yf), y=max, + geom_line(aes(x = as.numeric(Yf), y=max, color=model)) + - theme_bw() + xlab("Year (2061 - 2080)") + - ylab("Annual max of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2061 - 2080)") + + ylab("Annual max of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` @@ -715,10 +715,10 @@ ggplot(Y61_Y80_max_y) + ```{r} -Y61_Y80_max_y %>% +Y61_Y80_max_y %>% mutate(model = fct_reorder(model, max, .fun='median')) %>% - ggplot(aes(x=reorder(model, max), y=max, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, max), y=max, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max of mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -746,7 +746,7 @@ av2$coefficients[order(av2$coefficients)] ``` -Max of means +Max of means ```{r} @@ -757,7 +757,7 @@ av3$coefficients[order(av3$coefficients)] Runs suggested by this slice are Run 05, Run 09, Run 03 and Run 11 -Run 3 and 5 suggested above +Run 3 and 5 suggested above ## **3. 
Everything combined** @@ -765,14 +765,14 @@ The result per time slice suggest different runs, aside from run 5 ### Add in infill data -**Update 13.05.23** - Adding in the infill data, and taking the anova result across the whole time period +**Update 13.05.23** - Adding in the infill data, and taking the anova result across the whole time period ```{r infill means} -Y <- rep(c(2001:2020), each=360) +Y <- rep(c(2001:2020), each=360) dfs_Y00_Y20 <- lapply(names_Y00_Y20, function(i){ - df <- dfs_Y00_Y20[[i]] + df <- dfs_Y00_Y20[[i]] names(df) <- c("day", "mean", "sd") df$model <- i df$dn <- 1:nrow(df) @@ -782,10 +782,10 @@ dfs_Y00_Y20 <- lapply(names_Y00_Y20, function(i){ }) -Y <- rep(c(2041:2060), each=360) +Y <- rep(c(2041:2060), each=360) dfs_Y41_Y60 <- lapply(names_Y41_Y60, function(i){ - df <- dfs_Y41_Y60[[i]] + df <- dfs_Y41_Y60[[i]] names(df) <- c("day", "mean", "sd") df$model <- i df$dn <- 1:nrow(df) @@ -796,8 +796,8 @@ dfs_Y41_Y60 <- lapply(names_Y41_Y60, function(i){ #Create a single df in long form as above -Y00_Y20_means <- dfs_Y00_Y20 %>% reduce(rbind) -Y41_Y60_means <- dfs_Y41_Y60 %>% reduce(rbind) +Y00_Y20_means <- dfs_Y00_Y20 %>% reduce(rbind) +Y41_Y60_means <- dfs_Y41_Y60 %>% reduce(rbind) ``` Assessing what the combined times slices suggest via anova @@ -819,21 +819,21 @@ av1$coefficients[order(av1$coefficients)] #### Annual means: ```{r} -# As above, creating annual means +# As above, creating annual means infill.L <- list(Y00_Y20_means, Y41_Y60_means) infill.L_y <- lapply(infill.L, function(x){ - means_y <- x %>% + means_y <- x %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T))}) ``` ```{r} -all.means_y <- rbind(historical_means_y, - infill.L_y[[1]], - Y21_Y40_means_y, - infill.L_y[[2]], +all.means_y <- rbind(historical_means_y, + infill.L_y[[1]], + Y21_Y40_means_y, + infill.L_y[[2]], Y61_Y80_means_y) x <- as.character(all.means_y$model) @@ -847,5 +847,4 @@ av2$coefficients[order(av2$coefficients)] **Updated June 13th 2023 result** -Considering all together, suggests: Runs 05, Run07, Run08 and Run06 - +Considering all together, suggests: Runs 05, Run07, Run08 and Run06 diff --git a/R/misc/Identifying_Runs.md b/R/misc/Identifying_Runs.md index 0aad23b8..1f202636 100644 --- a/R/misc/Identifying_Runs.md +++ b/R/misc/Identifying_Runs.md @@ -70,7 +70,7 @@ dfL <- lapply(i, function(i){ fp <- fp[grepl(i, fp)] dfs <- lapply(fp, read.csv) n <- namesL[[paste0("names_",i)]] - names(dfs) <- n + names(dfs) <- n return(dfs) }) @@ -85,10 +85,10 @@ list2env(dfL, .GlobalEnv) ### **2a. 
Historical figures** ``` r -Y <- rep(c(1981:2000), each=360) +Y <- rep(c(1981:2000), each=360) dfs_hist <- lapply(names_hist, function(i){ - df <- dfs_hist[[i]] + df <- dfs_hist[[i]] names(df) <- c("day", "mean", "sd") df$model <- i df$dn <- 1:nrow(df) @@ -96,24 +96,24 @@ dfs_hist <- lapply(names_hist, function(i){ return(df) }) -#Create a single df in long form of Runs for the historical period +#Create a single df in long form of Runs for the historical period historical_means <- dfs_hist %>% reduce(rbind) ``` ### **Time series - daily** ``` r -ggplot(historical_means) + +ggplot(historical_means) + geom_line(aes(x=dn, y=mean, group=model, colour=model)) + - theme_bw() + xlab("Day (Historical 1980 - 2000)") + - ylab("Daily mean max temp (tasmax) oC") + + theme_bw() + xlab("Day (Historical 1980 - 2000)") + + ylab("Daily mean max temp (tasmax) oC") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), - legend.position = "none") + - facet_wrap(.~ model, ncol=3) + legend.position = "none") + + facet_wrap(.~ model, ncol=3) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-4-1.png) @@ -121,17 +121,17 @@ ggplot(historical_means) + ### **boxplot - mean historical** ``` r -#Create a pallete specific to the runs so when reordered maintain the same colours +#Create a pallete specific to the runs so when reordered maintain the same colours historical_means$model <- as.factor(historical_means$model) c <- brewer.pal(12, "Paired") my_colours <- setNames(c, levels(historical_means$model)) ``` ``` r -historical_means %>% +historical_means %>% mutate(model = fct_reorder(model, mean, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -156,40 +156,40 @@ ggplot(historical_means, aes(sample=mean, colour=factor(model))) + ### **Time series - annual mean** ``` r -#Aggregating to year for annual average +#Aggregating to year for annual average historical_means$Yf <- as.factor(historical_means$Y) -historical_means_y <- historical_means %>% +historical_means_y <- historical_means %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T)) ``` ``` r -ggplot(historical_means_y) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, +ggplot(historical_means_y) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (Historical 1980 - 2000)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (Historical 1980 - 2000)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-9-1.png) ``` r # Plotting with SDs in geom_ribbon to see if anything wildely different -ggplot(historical_means_y) + - geom_ribbon(aes(as.numeric(Yf), y=mean.annual, +ggplot(historical_means_y) + + geom_ribbon(aes(as.numeric(Yf), y=mean.annual, ymin = mean.annual - sd.annual, ymax= mean.annual + sd.annual, - fill=model), alpha=0.4) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, + fill=model), alpha=0.4) + + 
geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (Historical 1980 - 2000)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (Historical 1980 - 2000)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), @@ -201,10 +201,10 @@ ggplot(historical_means_y) + ### **boxplot - annual mean historical** ``` r -historical_means_y %>% +historical_means_y %>% mutate(model = fct_reorder(model, mean.annual, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max daily max temp oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -229,21 +229,21 @@ ggplot(historical_means_y, aes(sample=mean.annual, colour=factor(model))) + ### **Time series - annual max** ``` r -historical_max_y <- historical_means %>% +historical_max_y <- historical_means %>% group_by(Yf, model) %>% dplyr::summarise(max=max(mean, na.rm=T)) ``` ``` r ggplot(historical_max_y) + - geom_line(aes(x = as.numeric(Yf), y=max, + geom_line(aes(x = as.numeric(Yf), y=max, color=model)) + - theme_bw() + xlab("Year (Historical 1980 - 2000)") + - ylab("Annual max of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (Historical 1980 - 2000)") + + ylab("Annual max of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-14-1.png) @@ -251,10 +251,10 @@ ggplot(historical_max_y) + ### **boxplot - annual max** ``` r -historical_max_y %>% +historical_max_y %>% mutate(model = fct_reorder(model, max, .fun='median')) %>% - ggplot(aes(x=reorder(model, max), y=max, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, max), y=max, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max of mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -277,11 +277,11 @@ av1 <- aov(mean ~ model - 1, historical_means) av1$coefficients[order(av1$coefficients)] ``` - ## modelhist_Run10 modelhist_Run02 modelhist_Run05 modelhist_Run07 modelhist_Run09 - ## 9.89052 11.06863 11.21424 11.22048 11.27647 - ## modelhist_Run04 modelhist_Run03 modelhist_Run08 modelhist_Run01 modelhist_Run06 - ## 11.29057 11.35848 11.45257 11.45414 11.86451 - ## modelhist_Run11 modelhist_Run12 + ## modelhist_Run10 modelhist_Run02 modelhist_Run05 modelhist_Run07 modelhist_Run09 + ## 9.89052 11.06863 11.21424 11.22048 11.27647 + ## modelhist_Run04 modelhist_Run03 modelhist_Run08 modelhist_Run01 modelhist_Run06 + ## 11.29057 11.35848 11.45257 11.45414 11.86451 + ## modelhist_Run11 modelhist_Run12 ## 11.99148 12.31870 Annual means: @@ -291,11 +291,11 @@ av2 <- aov(mean.annual ~ model - 1, historical_means_y) av2$coefficients[order(av2$coefficients)] ``` - ## modelhist_Run10 modelhist_Run02 modelhist_Run05 modelhist_Run07 modelhist_Run09 - ## 9.89052 11.06863 11.21424 11.22048 11.27647 - ## modelhist_Run04 modelhist_Run03 modelhist_Run08 modelhist_Run01 modelhist_Run06 - ## 11.29057 11.35848 
11.45257 11.45414 11.86451 - ## modelhist_Run11 modelhist_Run12 + ## modelhist_Run10 modelhist_Run02 modelhist_Run05 modelhist_Run07 modelhist_Run09 + ## 9.89052 11.06863 11.21424 11.22048 11.27647 + ## modelhist_Run04 modelhist_Run03 modelhist_Run08 modelhist_Run01 modelhist_Run06 + ## 11.29057 11.35848 11.45257 11.45414 11.86451 + ## modelhist_Run11 modelhist_Run12 ## 11.99148 12.31870 Max of means: @@ -305,11 +305,11 @@ av3 <- aov(max ~ model - 1, historical_max_y) av3$coefficients[order(av3$coefficients)] ``` - ## modelhist_Run10 modelhist_Run04 modelhist_Run02 modelhist_Run05 modelhist_Run03 - ## 18.12329 18.81126 18.90054 19.01801 19.10454 - ## modelhist_Run09 modelhist_Run01 modelhist_Run08 modelhist_Run07 modelhist_Run11 - ## 19.23705 19.31541 19.44439 19.54981 19.57548 - ## modelhist_Run06 modelhist_Run12 + ## modelhist_Run10 modelhist_Run04 modelhist_Run02 modelhist_Run05 modelhist_Run03 + ## 18.12329 18.81126 18.90054 19.01801 19.10454 + ## modelhist_Run09 modelhist_Run01 modelhist_Run08 modelhist_Run07 modelhist_Run11 + ## 19.23705 19.31541 19.44439 19.54981 19.57548 + ## modelhist_Run06 modelhist_Run12 ## 19.88375 20.47650 Max vals are different but based on means then selection would be Run 02 @@ -318,7 +318,7 @@ Max vals are different but based on means then selection would be Run 02 ### **2b. Y2020 - Y2040** ``` r -Y <- rep(c(2021:2040), each=360) +Y <- rep(c(2021:2040), each=360) dfs_Y21_Y40 <- lapply(names_Y21_Y40, function(i){ @@ -330,25 +330,25 @@ dfs_Y21_Y40 <- lapply(names_Y21_Y40, function(i){ return(df) }) -#Create a single df in long form of Runs for the Y21_Y40 period +#Create a single df in long form of Runs for the Y21_Y40 period Y21_Y40_means <- dfs_Y21_Y40 %>% reduce(rbind) ``` ### **Time series - daily** ``` r -ggplot(Y21_Y40_means) + +ggplot(Y21_Y40_means) + geom_line(aes(x=dn, y=mean, group=model, colour=model)) + # Removing sd ribbon for ease of viewing - #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + - theme_bw() + xlab("Daily (1980 - 2000)") + - ylab("Daily mean max temp (tasmax) oC") + + #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + + theme_bw() + xlab("Daily (1980 - 2000)") + + ylab("Daily mean max temp (tasmax) oC") + #scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), - legend.position = "none") + - facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) + legend.position = "none") + + facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) ``` ## Warning: The `` argument of `guides()` cannot be `FALSE`. 
Use "none" instead as @@ -362,17 +362,17 @@ ggplot(Y21_Y40_means) + ### **boxplot - mean Y21_Y40** ``` r -#Create a pallete specific to the runs so when reordered maintain the same colours +#Create a pallete specific to the runs so when reordered maintain the same colours Y21_Y40_means$model <- as.factor(Y21_Y40_means$model) c <- brewer.pal(12, "Paired") my_colours <- setNames(c, levels(Y21_Y40_means$model)) ``` ``` r -Y21_Y40_means %>% +Y21_Y40_means %>% mutate(model = fct_reorder(model, mean, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -397,40 +397,40 @@ ggplot(Y21_Y40_means, aes(sample=mean, colour=factor(model))) + ### **Time series - annual mean** ``` r -#Aggregating to year for annual average +#Aggregating to year for annual average Y21_Y40_means$Yf <- as.factor(Y21_Y40_means$Y) -Y21_Y40_means_y <- Y21_Y40_means %>% +Y21_Y40_means_y <- Y21_Y40_means %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T)) ``` ``` r -ggplot(Y21_Y40_means_y) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, +ggplot(Y21_Y40_means_y) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2021 - 2040)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2021 - 2040)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-25-1.png) ``` r # Plotting with SDs in geom_ribbon to see if anything wildely different -ggplot(Y21_Y40_means_y) + - geom_ribbon(aes(as.numeric(Yf), y=mean.annual, +ggplot(Y21_Y40_means_y) + + geom_ribbon(aes(as.numeric(Yf), y=mean.annual, ymin = mean.annual - sd.annual, ymax= mean.annual + sd.annual, - fill=model), alpha=0.4) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, + fill=model), alpha=0.4) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2021 - 2040)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2021 - 2040)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), @@ -442,10 +442,10 @@ ggplot(Y21_Y40_means_y) + ### **boxplot - annual mean 2021 - 2040** ``` r -Y21_Y40_means_y %>% +Y21_Y40_means_y %>% mutate(model = fct_reorder(model, mean.annual, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max daily max temp oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -470,21 +470,21 @@ ggplot(Y21_Y40_means_y, aes(sample=mean.annual, colour=factor(model))) + ### **Time series - annual max** ``` r -Y21_Y40_max_y <- Y21_Y40_means %>% +Y21_Y40_max_y <- Y21_Y40_means %>% group_by(Yf, model) %>% dplyr::summarise(max=max(mean, 
na.rm=T)) ``` ``` r ggplot(Y21_Y40_max_y) + - geom_line(aes(x = as.numeric(Yf), y=max, + geom_line(aes(x = as.numeric(Yf), y=max, color=model)) + - theme_bw() + xlab("Year (2021 - 2040)") + - ylab("Annual max of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2021 - 2040)") + + ylab("Annual max of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-30-1.png) @@ -492,10 +492,10 @@ ggplot(Y21_Y40_max_y) + ### **boxplot - annual max** ``` r -Y21_Y40_max_y %>% +Y21_Y40_max_y %>% mutate(model = fct_reorder(model, max, .fun='median')) %>% - ggplot(aes(x=reorder(model, max), y=max, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, max), y=max, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max of mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -514,11 +514,11 @@ av1 <- aov(mean ~ model - 1, Y21_Y40_means) av1$coefficients[order(av1$coefficients)] ``` - ## modelY21_Y40_Run10 modelY21_Y40_Run05 modelY21_Y40_Run07 modelY21_Y40_Run02 - ## 10.93136 12.36223 12.64493 12.67791 - ## modelY21_Y40_Run09 modelY21_Y40_Run03 modelY21_Y40_Run08 modelY21_Y40_Run01 - ## 12.72584 12.85999 12.92934 13.03640 - ## modelY21_Y40_Run04 modelY21_Y40_Run12 modelY21_Y40_Run06 modelY21_Y40_Run11 + ## modelY21_Y40_Run10 modelY21_Y40_Run05 modelY21_Y40_Run07 modelY21_Y40_Run02 + ## 10.93136 12.36223 12.64493 12.67791 + ## modelY21_Y40_Run09 modelY21_Y40_Run03 modelY21_Y40_Run08 modelY21_Y40_Run01 + ## 12.72584 12.85999 12.92934 13.03640 + ## modelY21_Y40_Run04 modelY21_Y40_Run12 modelY21_Y40_Run06 modelY21_Y40_Run11 ## 13.07768 13.20011 13.38047 13.60076 Annual means: @@ -528,11 +528,11 @@ av2 <- aov(mean.annual ~ model - 1, Y21_Y40_means_y) av2$coefficients[order(av2$coefficients)] ``` - ## modelY21_Y40_Run10 modelY21_Y40_Run05 modelY21_Y40_Run07 modelY21_Y40_Run02 - ## 10.93136 12.36223 12.64493 12.67791 - ## modelY21_Y40_Run09 modelY21_Y40_Run03 modelY21_Y40_Run08 modelY21_Y40_Run01 - ## 12.72584 12.85999 12.92934 13.03640 - ## modelY21_Y40_Run04 modelY21_Y40_Run12 modelY21_Y40_Run06 modelY21_Y40_Run11 + ## modelY21_Y40_Run10 modelY21_Y40_Run05 modelY21_Y40_Run07 modelY21_Y40_Run02 + ## 10.93136 12.36223 12.64493 12.67791 + ## modelY21_Y40_Run09 modelY21_Y40_Run03 modelY21_Y40_Run08 modelY21_Y40_Run01 + ## 12.72584 12.85999 12.92934 13.03640 + ## modelY21_Y40_Run04 modelY21_Y40_Run12 modelY21_Y40_Run06 modelY21_Y40_Run11 ## 13.07768 13.20011 13.38047 13.60076 Max of means @@ -542,11 +542,11 @@ av3 <- aov(max ~ model - 1, Y21_Y40_max_y) av3$coefficients[order(av3$coefficients)] ``` - ## modelY21_Y40_Run10 modelY21_Y40_Run02 modelY21_Y40_Run09 modelY21_Y40_Run03 - ## 19.29044 20.69596 20.82538 21.05558 - ## modelY21_Y40_Run05 modelY21_Y40_Run07 modelY21_Y40_Run08 modelY21_Y40_Run01 - ## 21.09128 21.22942 21.33484 21.37443 - ## modelY21_Y40_Run04 modelY21_Y40_Run06 modelY21_Y40_Run12 modelY21_Y40_Run11 + ## modelY21_Y40_Run10 modelY21_Y40_Run02 modelY21_Y40_Run09 modelY21_Y40_Run03 + ## 19.29044 20.69596 20.82538 21.05558 + ## modelY21_Y40_Run05 modelY21_Y40_Run07 modelY21_Y40_Run08 modelY21_Y40_Run01 + ## 21.09128 21.22942 21.33484 21.37443 + ## modelY21_Y40_Run04 modelY21_Y40_Run06 modelY21_Y40_Run12 modelY21_Y40_Run11 ## 21.49363 21.98667 22.09476 
22.65178 Based on means then selection would be Run 02 (2nd lowest), Run 04 & Run @@ -558,7 +558,7 @@ Run 06 (so definetly Run 3 but others to be discussed) ### **2c. Y2061 - Y2080** ``` r -Y <- rep(c(2061:2080), each=360) +Y <- rep(c(2061:2080), each=360) dfs_Y61_Y80 <- lapply(names_Y61_Y80, function(i){ @@ -570,25 +570,25 @@ dfs_Y61_Y80 <- lapply(names_Y61_Y80, function(i){ return(df) }) -#Create a single df in long form of Runs for the Y61_Y80 period +#Create a single df in long form of Runs for the Y61_Y80 period Y61_Y80_means <- dfs_Y61_Y80 %>% reduce(rbind) ``` ### **Time series - daily** ``` r -ggplot(Y61_Y80_means) + +ggplot(Y61_Y80_means) + geom_line(aes(x=dn, y=mean, group=model, colour=model)) + # Removing sd ribbon for ease of viewing - #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + - theme_bw() + xlab("Day (2060 - 2080)") + - ylab("Daily mean max temp (tasmax) oC") + + #geom_ribbon(aes(x =dn, ymin = mean - sd, ymax= mean + sd), alpha=0.4) + + theme_bw() + xlab("Day (2060 - 2080)") + + ylab("Daily mean max temp (tasmax) oC") + #scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), - legend.position = "none") + - facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) + legend.position = "none") + + facet_wrap(.~ model, ncol=3) + guides(fill = FALSE) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-36-1.png) @@ -596,17 +596,17 @@ ggplot(Y61_Y80_means) + ### **boxplot - mean Y61_Y80** ``` r -#Create a pallete specific to the runs so when reordered maintain the same colours +#Create a pallete specific to the runs so when reordered maintain the same colours Y61_Y80_means$model <- as.factor(Y61_Y80_means$model) c <- brewer.pal(12, "Paired") my_colours <- setNames(c, levels(Y61_Y80_means$model)) ``` ``` r -Y61_Y80_means %>% +Y61_Y80_means %>% mutate(model = fct_reorder(model, mean, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean), y=mean, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -631,40 +631,40 @@ ggplot(Y61_Y80_means, aes(sample=mean, colour=factor(model))) + ### **Time series - annual mean** ``` r -#Aggregating to year for annual average +#Aggregating to year for annual average Y61_Y80_means$Yf <- as.factor(Y61_Y80_means$Y) -Y61_Y80_means_y <- Y61_Y80_means %>% +Y61_Y80_means_y <- Y61_Y80_means %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T)) ``` ``` r -ggplot(Y61_Y80_means_y) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, +ggplot(Y61_Y80_means_y) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2061 - 2080)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2061 - 2080)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-41-1.png) ``` r # Plotting with SDs in geom_ribbon to see if anything wildely different -ggplot(Y61_Y80_means_y) + - geom_ribbon(aes(as.numeric(Yf), y=mean.annual, 
+ggplot(Y61_Y80_means_y) + + geom_ribbon(aes(as.numeric(Yf), y=mean.annual, ymin = mean.annual - sd.annual, ymax= mean.annual + sd.annual, - fill=model), alpha=0.4) + - geom_line(aes(x = as.numeric(Yf), y=mean.annual, + fill=model), alpha=0.4) + + geom_line(aes(x = as.numeric(Yf), y=mean.annual, color=model)) + - theme_bw() + xlab("Year (2061 - 2080)") + - ylab("Annual mean of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2061 - 2080)") + + ylab("Annual mean of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), @@ -676,10 +676,10 @@ ggplot(Y61_Y80_means_y) + ### **boxplot - annual mean Y61_Y80** ``` r -Y61_Y80_means_y %>% +Y61_Y80_means_y %>% mutate(model = fct_reorder(model, mean.annual, .fun='median')) %>% - ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, mean.annual), y=mean.annual, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max daily max temp oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -704,21 +704,21 @@ ggplot(Y61_Y80_means_y, aes(sample=mean.annual, colour=factor(model))) + ### **Time series - annual max** ``` r -Y61_Y80_max_y <- Y61_Y80_means %>% +Y61_Y80_max_y <- Y61_Y80_means %>% group_by(Yf, model) %>% dplyr::summarise(max=max(mean, na.rm=T)) ``` ``` r ggplot(Y61_Y80_max_y) + - geom_line(aes(x = as.numeric(Yf), y=max, + geom_line(aes(x = as.numeric(Yf), y=max, color=model)) + - theme_bw() + xlab("Year (2061 - 2080)") + - ylab("Annual max of mean daily max temp (tasmax) oC") + + theme_bw() + xlab("Year (2061 - 2080)") + + ylab("Annual max of mean daily max temp (tasmax) oC") + scale_fill_brewer(palette = "Paired", name = "") + scale_colour_brewer(palette = "Paired", name = "") + theme(axis.text.x=element_blank(), - axis.ticks.x=element_blank()) + axis.ticks.x=element_blank()) ``` ![](Identifying_Runs_files/figure-gfm/unnamed-chunk-46-1.png) @@ -726,10 +726,10 @@ ggplot(Y61_Y80_max_y) + ### **boxplot - annual max** ``` r -Y61_Y80_max_y %>% +Y61_Y80_max_y %>% mutate(model = fct_reorder(model, max, .fun='median')) %>% - ggplot(aes(x=reorder(model, max), y=max, fill=model)) + - geom_boxplot() + theme_bw() + + ggplot(aes(x=reorder(model, max), y=max, fill=model)) + + geom_boxplot() + theme_bw() + ylab("Annual max of mean daily max temp (tasmax) oC") + xlab("model") + scale_fill_manual(values = my_colours) + theme(axis.text.x=element_blank(), @@ -748,11 +748,11 @@ av1 <- aov(mean ~ model - 1, Y61_Y80_means) av1$coefficients[order(av1$coefficients)] ``` - ## modelY61_Y80_Run10 modelY61_Y80_Run05 modelY61_Y80_Run01 modelY61_Y80_Run08 - ## 12.70342 13.87016 14.55815 14.65973 - ## modelY61_Y80_Run04 modelY61_Y80_Run09 modelY61_Y80_Run03 modelY61_Y80_Run12 - ## 14.69527 14.76917 14.79545 14.87939 - ## modelY61_Y80_Run07 modelY61_Y80_Run02 modelY61_Y80_Run11 modelY61_Y80_Run06 + ## modelY61_Y80_Run10 modelY61_Y80_Run05 modelY61_Y80_Run01 modelY61_Y80_Run08 + ## 12.70342 13.87016 14.55815 14.65973 + ## modelY61_Y80_Run04 modelY61_Y80_Run09 modelY61_Y80_Run03 modelY61_Y80_Run12 + ## 14.69527 14.76917 14.79545 14.87939 + ## modelY61_Y80_Run07 modelY61_Y80_Run02 modelY61_Y80_Run11 modelY61_Y80_Run06 ## 14.94320 15.01577 15.11392 15.11814 Annual means: @@ -762,11 +762,11 @@ av2 <- aov(mean.annual ~ model - 1, Y61_Y80_means_y) av2$coefficients[order(av2$coefficients)] ``` - ## modelY61_Y80_Run10 
modelY61_Y80_Run05 modelY61_Y80_Run01 modelY61_Y80_Run08 - ## 12.70342 13.87016 14.55815 14.65973 - ## modelY61_Y80_Run04 modelY61_Y80_Run09 modelY61_Y80_Run03 modelY61_Y80_Run12 - ## 14.69527 14.76917 14.79545 14.87939 - ## modelY61_Y80_Run07 modelY61_Y80_Run02 modelY61_Y80_Run11 modelY61_Y80_Run06 + ## modelY61_Y80_Run10 modelY61_Y80_Run05 modelY61_Y80_Run01 modelY61_Y80_Run08 + ## 12.70342 13.87016 14.55815 14.65973 + ## modelY61_Y80_Run04 modelY61_Y80_Run09 modelY61_Y80_Run03 modelY61_Y80_Run12 + ## 14.69527 14.76917 14.79545 14.87939 + ## modelY61_Y80_Run07 modelY61_Y80_Run02 modelY61_Y80_Run11 modelY61_Y80_Run06 ## 14.94320 15.01577 15.11392 15.11814 Max of means @@ -776,11 +776,11 @@ av3 <- aov(max ~ model - 1, Y61_Y80_max_y) av3$coefficients[order(av3$coefficients)] ``` - ## modelY61_Y80_Run10 modelY61_Y80_Run05 modelY61_Y80_Run03 modelY61_Y80_Run04 - ## 21.83290 23.32972 23.88512 23.98220 - ## modelY61_Y80_Run02 modelY61_Y80_Run01 modelY61_Y80_Run08 modelY61_Y80_Run06 - ## 23.98610 24.03094 24.13232 24.41824 - ## modelY61_Y80_Run12 modelY61_Y80_Run09 modelY61_Y80_Run07 modelY61_Y80_Run11 + ## modelY61_Y80_Run10 modelY61_Y80_Run05 modelY61_Y80_Run03 modelY61_Y80_Run04 + ## 21.83290 23.32972 23.88512 23.98220 + ## modelY61_Y80_Run02 modelY61_Y80_Run01 modelY61_Y80_Run08 modelY61_Y80_Run06 + ## 23.98610 24.03094 24.13232 24.41824 + ## modelY61_Y80_Run12 modelY61_Y80_Run09 modelY61_Y80_Run07 modelY61_Y80_Run11 ## 24.48810 24.53152 24.77651 25.09102 Runs suggested by this slice are Run 05, Run 09, Run 03 and Run 11 @@ -797,10 +797,10 @@ The result per time slice suggest different runs, aside from run 5 result across the whole time period ``` r -Y <- rep(c(2001:2020), each=360) +Y <- rep(c(2001:2020), each=360) dfs_Y00_Y20 <- lapply(names_Y00_Y20, function(i){ - df <- dfs_Y00_Y20[[i]] + df <- dfs_Y00_Y20[[i]] names(df) <- c("day", "mean", "sd") df$model <- i df$dn <- 1:nrow(df) @@ -810,10 +810,10 @@ dfs_Y00_Y20 <- lapply(names_Y00_Y20, function(i){ }) -Y <- rep(c(2041:2060), each=360) +Y <- rep(c(2041:2060), each=360) dfs_Y41_Y60 <- lapply(names_Y41_Y60, function(i){ - df <- dfs_Y41_Y60[[i]] + df <- dfs_Y41_Y60[[i]] names(df) <- c("day", "mean", "sd") df$model <- i df$dn <- 1:nrow(df) @@ -824,8 +824,8 @@ dfs_Y41_Y60 <- lapply(names_Y41_Y60, function(i){ #Create a single df in long form as above -Y00_Y20_means <- dfs_Y00_Y20 %>% reduce(rbind) -Y41_Y60_means <- dfs_Y41_Y60 %>% reduce(rbind) +Y00_Y20_means <- dfs_Y00_Y20 %>% reduce(rbind) +Y41_Y60_means <- dfs_Y41_Y60 %>% reduce(rbind) ``` Assessing what the combined times slices suggest via anova @@ -844,19 +844,19 @@ av1 <- aov(mean ~ model - 1, all.means) av1$coefficients[order(av1$coefficients)] ``` - ## modelRun10 modelRun05 modelRun09 modelRun04 modelRun03 modelRun07 modelRun08 - ## 11.12464 12.48165 12.79216 12.89910 12.91685 12.91894 12.95115 - ## modelRun02 modelRun01 modelRun12 modelRun06 modelRun11 + ## modelRun10 modelRun05 modelRun09 modelRun04 modelRun03 modelRun07 modelRun08 + ## 11.12464 12.48165 12.79216 12.89910 12.91685 12.91894 12.95115 + ## modelRun02 modelRun01 modelRun12 modelRun06 modelRun11 ## 12.95347 12.97947 13.38267 13.40644 13.61157 #### Annual means: ``` r -# As above, creating annual means +# As above, creating annual means infill.L <- list(Y00_Y20_means, Y41_Y60_means) infill.L_y <- lapply(infill.L, function(x){ - means_y <- x %>% + means_y <- x %>% group_by(Yf, model) %>% dplyr::summarise(mean.annual=mean(mean, na.rm=T), sd.annual=sd(mean, na.rm = T))}) ``` @@ -867,10 +867,10 @@ infill.L_y <- 
lapply(infill.L, function(x){ ## argument. ``` r -all.means_y <- rbind(historical_means_y, - infill.L_y[[1]], - Y21_Y40_means_y, - infill.L_y[[2]], +all.means_y <- rbind(historical_means_y, + infill.L_y[[1]], + Y21_Y40_means_y, + infill.L_y[[2]], Y61_Y80_means_y) x <- as.character(all.means_y$model) @@ -880,9 +880,9 @@ av2 <- aov(mean.annual ~ model - 1, all.means_y) av2$coefficients[order(av2$coefficients)] ``` - ## modelRun10 modelRun05 modelRun09 modelRun04 modelRun03 modelRun07 modelRun08 - ## 11.12464 12.48165 12.79216 12.89910 12.91685 12.91894 12.95115 - ## modelRun02 modelRun01 modelRun12 modelRun06 modelRun11 + ## modelRun10 modelRun05 modelRun09 modelRun04 modelRun03 modelRun07 modelRun08 + ## 11.12464 12.48165 12.79216 12.89910 12.91685 12.91894 12.95115 + ## modelRun02 modelRun01 modelRun12 modelRun06 modelRun11 ## 12.95347 12.97947 13.38267 13.40644 13.61157 **Updated June 13th 2023 result** diff --git a/R/misc/InfillDataRunIdentyfing.R b/R/misc/InfillDataRunIdentyfing.R index 516de7bb..3358b914 100644 --- a/R/misc/InfillDataRunIdentyfing.R +++ b/R/misc/InfillDataRunIdentyfing.R @@ -1,6 +1,6 @@ # Script for converting all UKCP CPM input data to dataframes # Updating to include infill data: -# This script edited from: 'ConvertingallCPMdatatodf.R' +# This script edited from: 'ConvertingallCPMdatatodf.R' ## As a note for future, I did try and run this to extract the means via terra but interestingly would have taken much longer! library(terra) @@ -25,9 +25,7 @@ file.paths <- lapply(Runs, function(i){ fp <- paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/", i, "/latest/") f <- list.files(fp) files <- f[!grepl(".aux.xml", f)] - # Data for infill only pulled out - if re-run use all + # Data for infill only pulled out - if re-run use all files <- files[grepl(infill.years, files)] files.p <- paste0(fp, files) }) - - \ No newline at end of file diff --git a/R/misc/calc.mean.sd.daily.infill.R b/R/misc/calc.mean.sd.daily.infill.R index 68fcbc83..86c5c0f8 100644 --- a/R/misc/calc.mean.sd.daily.infill.R +++ b/R/misc/calc.mean.sd.daily.infill.R @@ -28,17 +28,17 @@ for(i in 1:12){ df.means_hist <- as.data.frame(df.means_hist) df.sds_hist <- sapply(df[c(1:7200)], sd, na.rm=T) df.sds_hist <- as.data.frame(df.sds_hist) - df.avs_hist <- cbind(df.means_hist, df.sds_hist) + df.avs_hist <- cbind(df.means_hist, df.sds_hist) r <- Runs[[i]] fn <- paste0("df.avs_Y00_Y20_Run",i, ".csv") write.csv(df.avs_hist, fn) - + df.means_Y41_Y60 <- colMeans(df[c(7201:14400)], na.rm=T) df.means_Y41_Y60 <- as.data.frame(df.means_Y41_Y60) df.sds_Y41_Y60 <- sapply(df[c(7201:14400)], sd, na.rm=T) df.sds_Y41_Y60 <- as.data.frame(df.sds_Y41_Y60) - df.avs_Y41_Y60 <- cbind(df.means_Y41_Y60, df.sds_Y41_Y60) + df.avs_Y41_Y60 <- cbind(df.means_Y41_Y60, df.sds_Y41_Y60) fn <- paste0("df.Y41_Y60_Run",i, ".csv") write.csv(df.avs_Y41_Y60, fn) @@ -48,4 +48,3 @@ for(i in 1:12){ gc() } - diff --git a/R/misc/clim-recal-specific-functions.R b/R/misc/clim-recal-specific-functions.R index 0274557d..c51fac76 100644 --- a/R/misc/clim-recal-specific-functions.R +++ b/R/misc/clim-recal-specific-functions.R @@ -5,5 +5,3 @@ first.rast <- function(x){ rp <- paste0(fp,r) rast(rp) } - - diff --git a/R/misc/cropping-CPM-to-Scotland.R b/R/misc/cropping-CPM-to-Scotland.R index 3149385d..33e3575a 100644 --- a/R/misc/cropping-CPM-to-Scotland.R +++ b/R/misc/cropping-CPM-to-Scotland.R @@ -1,4 +1,4 @@ -#### Cropping 'Raw' (reprojected CPM) data to Scotland +#### Cropping 'Raw' (reprojected CPM) data to Scotland rm(list=ls()) #libs @@ -16,7 +16,7 @@ files <- 
list.files(p) raw.files <- files[!grepl("aux.xml", files)] raw.files.p <- paste0(p, raw.files) -raw.dat <- lapply(raw.files.p, rast) +raw.dat <- lapply(raw.files.p, rast) raw.dat.r <- rast(raw.dat) nlyr(raw.dat.r) @@ -36,4 +36,3 @@ lapply(n, function(i){ fn <- paste0(rd, i, ".tif") writeRaster(rast, filename=fn) }) - diff --git a/R/misc/read_crop.fn.R b/R/misc/read_crop.fn.R index a6a6e994..ee7391db 100644 --- a/R/misc/read_crop.fn.R +++ b/R/misc/read_crop.fn.R @@ -1,4 +1,4 @@ -#### FOR USE IN CROPPING RASTERS TO THE THREE CITIES +#### FOR USE IN CROPPING RASTERS TO THE THREE CITIES # A function to read in specific runs, vars and years, crop them to an area (optionally) and write vals to a georef'd df @@ -9,123 +9,123 @@ cpm_read_crop <- function(runs, #Character vector of selected runs as number onl rd, #path to results directory eg paste0(dd, "Cropped/three.cities/CPM/") crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work cropname){ #Character - name of crop to be assigned to the returned vect - - - runs <- runs + + + runs <- runs var <- var fp <- fp rd <- rd - + bbox <- crop.area cropname <- cropname - + for(i in runs){ for(v in var){ p <- paste0(fp, v, "/", i, "/latest/") files <- list.files(p) files <- files[!grepl("aux.xml", files)] - + files.p <- paste0(p, files) - - # Load and convert remaining to single col dfs + + # Load and convert remaining to single col dfs dfL <- lapply(1:length(files.p), function(n){ - f <- files.p[[n]] + f <- files.p[[n]] r <- rast(f) r_c <- crop(r, bbox, snap="out") - - #Write - f <- files[[n]]#filename as it was read in - fn <- paste0(rd, cropname, "/" , f) - - writeRaster(r_c, fn, overwrite=TRUE) - - }) - + + #Write + f <- files[[n]]#filename as it was read in + fn <- paste0(rd, cropname, "/" , f) + + writeRaster(r_c, fn, overwrite=TRUE) + + }) + gc() } } -} +} -# HADs function +# HADs function hads_read_crop <- function(var, #Character vector of selected variables - this might need changing fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") rd, #path to results directory eg paste0(dd, "Cropped/three.cities/CPM/") - file.date, #Character, Date of HADs file to crop from in YYYYMMDD + file.date, #Character, Date of HADs file to crop from in YYYYMMDD crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work cropname){ #Character - name of crop to be assigned to the returned vect - + var <- var fp <- fp bbox <- crop.area cropname <- cropname file.date <- file.date - + for(v in var){ - + HADs.files <- list.files(paste0(fp, v,"/day/")) files <- HADs.files[grepl(v, HADs.files)] file.i <- grep(file.date,files) files <- files[file.i:length(files)] files.p <- paste0(fp, v,"/day/",files) - - - # Load and convert remaining to single col dfs + + + # Load and convert remaining to single col dfs dfL <- lapply(1:length(files.p), function(n){ - f <- files.p[[n]] + f <- files.p[[n]] r <- rast(f) r_c <- crop(r, bbox, snap="out") - - #Write - f <- files[[n]]#filename as it was read in - fn <- paste0(rd, cropname, "/" , f) - - writeCDF(r_c, fn, overwrite=TRUE) + + #Write + f <- files[[n]]#filename as it was read in + fn <- paste0(rd, cropname, "/" , f) + + writeCDF(r_c, fn, overwrite=TRUE) }) gc() - } -} + } +} #Function to read and crop and convert to df, but not write it. 
- at some point should be merged with other R folder hads_read_crop_df <- function(var, #Character vector of selected variables - this might need changing fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/"). Include up until var folder - file.date1, #Character, Date of HADs file to crop from in YYYYMMDD - file.date2, #Character, Date of HADs file to crop to in YYYYMMDD + file.date1, #Character, Date of HADs file to crop from in YYYYMMDD + file.date2, #Character, Date of HADs file to crop to in YYYYMMDD crop.area){ #Polygon of area to crop to - any Spat obj accepted by terra::crop will work - + var <- var fp <- fp bbox <- crop.area file.date1 <- file.date1 file.date2 <- file.date2 - + for(v in var){ - + HADs.files <- list.files(paste0(fp, v,"/day/")) files <- HADs.files[grepl(v, HADs.files)] file.i <- grep(file.date1,files) file.ii <- grep(file.date2,files) files <- files[file.i:file.ii] files.p <- paste0(fp, v,"/day/",files) - - # Read in 1st runpath as df with xy coords to ensure overlay with CPM data - p <- files.p[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay with CPM data + p <- files.p[[1]] r <- rast(p) - rdf1 <- as.data.frame(r, xy=T) - + rdf1 <- as.data.frame(r, xy=T) + #To ensure subset dataframe has useful naming convention - this does not pull it through as such n <- substr(p, nchar(p)-20, nchar(p)) n <- gsub(".nc","", n) names(rdf1) <- gsub("_", paste0(n, "_"), names(rdf1)) - + # Load and convert remaining to single col dfs i <- 2:length(files.p) - + dfL <-lapply(i, function(i){ - p <- files.p[[i]] + p <- files.p[[i]] r <- rast(p) rdf <- as.data.frame(r) #To ensure subset dataframe has useful naming convention - this does not pull it through as such @@ -133,13 +133,11 @@ hads_read_crop_df <- function(var, #Character vector of selected variables - thi n <- gsub(".nc","", n) names(rdf) <- gsub("_", paste0(n, "_"), names(rdf)) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - - gc() - } -} - + gc() + } +} diff --git a/R/misc/read_crop_df_write.fn.R b/R/misc/read_crop_df_write.fn.R index 46afa9f1..92c898c7 100644 --- a/R/misc/read_crop_df_write.fn.R +++ b/R/misc/read_crop_df_write.fn.R @@ -7,7 +7,7 @@ write.csv.date <- function(x, y){ csvFileName <- paste(rd,"/",fn,".",date,".csv",sep="") write.csv(x, file=csvFileName, row.names = F)} -# A function to read in specific runs, vars and years +# A function to read in specific runs, vars and years cpm_read_crop_df_write <- function(runs, #Character vector of selected runs var, #Character vector of selected variables - this might need changing @@ -19,82 +19,82 @@ cpm_read_crop_df_write <- function(runs, #Character vector of selected runs crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work cropname, #Character - name of crop to be assigned to the returned df - usually the crop area rd){ # results directory for storing results - - runs <- runs + + runs <- runs var <- var years <- paste0(year1:year2, "1201", collapse="|") if(crop == T){ - + bbox <- crop.area - + for(i in runs){ for(v in var){ p <- paste0(fp, v, "/", i, "/latest/") files <- list.files(p) files <- files[!grepl("aux.xml", files)] - + files.y <- files[grepl(years, files)]# Historical timeslice 2 for calibration files.y.p <- paste0(p, files.y) - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.y.p[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.y.p[[1]] r <- rast(p1) - r_c <- crop(r, 
bbox, snap="out", mask=T) - rdf1 <- as.data.frame(r_c, xy=T) - - # Load and convert remaining to single col dfs + r_c <- crop(r, bbox, snap="out", mask=T) + rdf1 <- as.data.frame(r_c, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.y.p), function(i){ - p <- files.y.p[[i]] + p <- files.y.p[[i]] r <- rast(p) - r_c <- crop(r, bbox, snap="out", mask=T) - rdf <- as.data.frame(r_c) + r_c <- crop(r, bbox, snap="out", mask=T) + rdf <- as.data.frame(r_c) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + fn <- paste0(name1, "_", cropname, year1, "_", year2, v, "_Run", i) - + rd <- rd write.csv.date(df, fn) gc() } } } else { #for where no crop to be applied - + for(i in runs){ for(v in var){ p <- paste0(fp, v, "/", i, "/latest/") files <- list.files(p) files <- files[!grepl("aux.xml", files)] - + files.y <- files[grepl(years, files)]# Historical timeslice 2 for calibration files.y.p <- paste0(p, files.y) - - # Read in 1st runpath as df with xy coords to ensure overlay - p1 <- files.y.p[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay + p1 <- files.y.p[[1]] r <- rast(p1) - rdf1 <- as.data.frame(r_c, xy=T) - - # Load and convert remaining to single col dfs + rdf1 <- as.data.frame(r_c, xy=T) + + # Load and convert remaining to single col dfs dfL <- lapply(2:length(files.y.p), function(i){ - p <- files.y.p[[i]] + p <- files.y.p[[i]] r <- rast(p) - rdf <- as.data.frame(r_c) + rdf <- as.data.frame(r_c) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + rd <- rd - + fn <- paste0(name1, "_", cropname, year1, "_", year2, v, "_Run", i) - + write.csv.date(df, fn) - + gc() } } @@ -102,7 +102,7 @@ cpm_read_crop_df_write <- function(runs, #Character vector of selected runs } -# HADs function +# HADs function hads19802010_read_crop_df_write <- function(var, #Character vector of selected variables - this might need changing fp, #filepath of parent d of folders where files are - eg paste0(dd, "Reprojected_infill/UKCP2.2/") @@ -111,77 +111,77 @@ hads19802010_read_crop_df_write <- function(var, #Character vector of selected v crop.area, #Polygon of area to crop to - any Spat obj accepted by terra::crop will work cropname, #Character - name of crop to be assigned to the returned df - usually the crop area rd){ # results directory for storing results - + var <- var fp <- fp crop <- crop for(v in var){ - + HADs.files <- list.files(paste0(fp, v,"/day/")) files <- HADs.files[grepl(v, HADs.files)] Runpaths <- paste0(fp,v,"/day/",files[1:360]) #Subsetting to years 1980-2010 - if we download different data then this would need to be changed - + if(crop == TRUE){ - + bbox <- crop.area cropname <- cropname - - # Read in 1st runpath as df with xy coords to ensure overlay with CPM data - p <- Runpaths[[1]] + + # Read in 1st runpath as df with xy coords to ensure overlay with CPM data + p <- Runpaths[[1]] r <- rast(p) - r_c <- crop(r, bbox, snap="out", mask=T) - rdf1 <- as.data.frame(r_c, xy=T) - + r_c <- crop(r, bbox, snap="out", mask=T) + rdf1 <- as.data.frame(r_c, xy=T) + #To ensure subset dataframe has useful naming convention - this does not pull it through as such n <- substr(p, nchar(p)-20, nchar(p)) n <- gsub(".nc","", n) names(rdf1) <- gsub("_", paste0(n, "_"), names(rdf1)) - + # Load and convert remaining to single col dfs i <- 2:length(Runpaths) - + dfL <-lapply(i, function(i){ - p <- Runpaths[[i]] + p <- Runpaths[[i]] r <- rast(p) - r_c <- crop(r, bbox, snap="out", mask=T) + r_c <- crop(r, bbox, 
snap="out", mask=T) rdf <- as.data.frame(r_c) #To ensure subset dataframe has useful naming convention - this does not pull it through as such n <- substr(p, nchar(p)-20, nchar(p)) n <- gsub(".nc","", n) names(rdf) <- gsub("_", paste0(n, "_"), names(rdf)) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + rd <- rd - + fn <- paste0(name1,cropname,"1980_2010_", v) - + write.csv.date(df, fn) - + gc() - + } else { - - - # Read in 1st runpath as df with xy coords to ensure overlay with CPM data - p <- Runpaths[[1]] + + + # Read in 1st runpath as df with xy coords to ensure overlay with CPM data + p <- Runpaths[[1]] r <- rast(p) - rdf1 <- as.data.frame(r, xy=T) - + rdf1 <- as.data.frame(r, xy=T) + #To ensure subset dataframe has useful naming convention - this does not pull it through as such n <- substr(p, nchar(p)-20, nchar(p)) n <- gsub(".nc","", n) names(rdf1) <- gsub("_", paste0(n, "_"), names(rdf1)) - + # Load and convert remaining to single col dfs i <- 2:length(Runpaths) - + dfL <-lapply(i, function(i){ - p <- Runpaths[[i]] + p <- Runpaths[[i]] r <- rast(p) rdf <- as.data.frame(r) #To ensure subset dataframe has useful naming convention - this does not pull it through as such @@ -189,19 +189,19 @@ hads19802010_read_crop_df_write <- function(var, #Character vector of selected v n <- gsub(".nc","", n) names(rdf) <- gsub("_", paste0(n, "_"), names(rdf)) return(rdf) - }) - + }) + df <- dfL %>% reduce(cbind) df <- cbind(rdf1, df) - + rd <- rd - + fn <- paste0(name1,"1980_2010_", v) - + write.csv.date(df, fn) - + gc() - + } - } -} + } +} From 0de11c3214e45a6e9e621c307877f3113a5f0548 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Wed, 6 Dec 2023 10:07:19 +0000 Subject: [PATCH 37/83] feat: add `notebooks/Assessing_bc_Data/Assessing_BC-Data-workshop.RMD` --- .../Assessing_BC_Data-workshop.RMD | 887 ++++++++++++++++++ 1 file changed, 887 insertions(+) create mode 100644 notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD b/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD new file mode 100644 index 00000000..3750231d --- /dev/null +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD @@ -0,0 +1,887 @@ +--- +title: "Bias correction assessment" +author: "Ruth Bowyer" +date: "`r format(Sys.Date())`" +output: + html_document: + theme: cosmo + toc: TRUE + toc_float: TRUE + toc_depth: 4 + code_folding: hide + df_print: paged +params: + ask: false +--- + + +```{r libs and setup, message=FALSE, warning=F} +rm(list=ls()) + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") + + +library(ggplot2) +library(terra) +library(tmap) #pretty maps +library(RColorBrewer) +library(tidyverse) +library(kableExtra) + +if (!require(devtools)) install.packages("devtools") +library(devtools) +install_github("hzambran/hydroTSM") +install_github("hzambran/hydroGOF") +``` + + +## **0. About** + +This is an example notebook for the assessment of bias corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'. 
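+
+For orientation, the adjustment assessed here is quantile mapping as implemented in `qmap`. Below is a minimal sketch of that step (illustrative only - this is not the exact pipeline code, and the object names are placeholders for one grid cell's series):
+
+```{r qmap sketch, eval=FALSE}
+# obs.cal / cpm.cal.raw: observed and raw model values for the calibration period (placeholder vectors)
+qm.fit <- qmap::fitQmapQUANT(obs.cal, cpm.cal.raw, qstep = 0.01, wet.day = FALSE)
+# Apply the fitted transfer function to model data from a later period
+cpm.val.adj <- qmap::doQmapQUANT(cpm.val.raw, qm.fit)
+```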
+
+**Input data**
+
+This script requires the following data:
+
+- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction
+- 'obs.val' - as above, for the *validation* period
+- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period
+- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period
+- 'cpm.val.raw' - the raw (uncorrected) data for the *validation* period
+- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *validation* period
+- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional)
+- 'cpm.proj.adj' - the adjusted (bias-corrected) data for the *future/projected* period (optional)
+
+The data is required in both raster and dataframe formats.
+
+**Calibration vs Validation dates**
+
+The calibration period runs from 01-12-1980 to the day prior to 01-12-2010.
+The validation period runs from 01-12-2010 to the day prior to 01-12-2020.
+
+```{r data loading, include=FALSE}
+
+#This chunk attempts to apply the conversion to python output data to a form that this script will also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script?
+
+dd <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script
+
+input <- "RDS" #Either df or raster -- R outputs are a group of dfs in list form saved as an RDS, python input is a raster
+city <- "Glasgow"
+var <- "tasmax"
+runs <- c("05", "06", "07", "08")
+
+if(input=="raster"){
+
+####### PYTHON INPUTS HERE ######
+  # This script uses both raster data and the raw data
+  # This script uses Lists to group everything by runs
+  # Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run)
+  # .nc and
Therefore: + obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) + + obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020 + + cpm.cal.raw.df.L <- lapply(allruns, function(L){ + as.data.frame(t(L[["t.cal"]])) + }) + + #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) + cpm.val.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + cpm.val.raw.df <- proj[,1:val.end.date] + }) + + cpm.proj.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] + }) + + cpm.cal.adj.df.L <- lapply(allruns, function(L){ + adj <- as.data.frame(t(L[["qm1.hist"]])) + }) + + cpm.val.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + proj[,1:val.end.date] + }) + + cpm.proj.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + proj[,val.end.date:ncol(proj)] + }) + + ## Convert to rasters --requires creation of x and y cols from row names +## For the comparison, just converting the observation and cpm for the cal and val perios (ie not the projection datasets) + +obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){ + rn <- row.names(i) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, i) + r <- rast(df, type="xyz") + crs(r) <- crs + return(r) +}) + +names(obsrastL) <- c("obs.cal.rasts", "obs.val.rasts") +list2env(obsrastL, .GlobalEnv) +remove(obsrastL) + +list2rast <- list(cpm.cal.raw.df.L, cpm.cal.adj.df.L, cpm.val.raw.df.L, cpm.val.adj.df.L) + +rastsL <- lapply(list2rast, function(x){ + allruns <- x + df.rL <- lapply(runs, function(i){ + df <- allruns[[grep(i, names(allruns))]] #extract df based on run id + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df) + r <- rast(df, type="xyz") + crs(r) <- crs + return(r) + }) + names(df.rL) <- runs + return(df.rL) + }) + +names(rastsL) <- c("cpm.cal.raw.rasts.L", "cpm.cal.adj.rasts.L", "cpm.val.raw.rasts.L", "cpm.val.adj.rasts.L") + +list2env(rastsL, .GlobalEnv) + +remove(rastsL) +remove(list2rast) + +gc() + + } else { + print("Invalid input") +} + + + +``` + + +## **1. Bias Correction Assessment: trends** + +An visual comparison of trends across observation, raw and adjusted data for the same time period + +### **1a. 
Raster comparison** + +Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days + +Adding in the city shapeoutline for prettier maps + +```{r} + +shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city, ".shp"))) + +``` + + + +#### **Day 1 - 1980-12-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[1]]) + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[1]]) + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### **Day 2 - 2008-08-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, 
bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + + +#### **Day 3 - 2015-05-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.val.raw.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### {-} + +### **1b. 
Trend comparison** + +```{r} +#Lists of dfs to summarise the means of +dfL <- c(list(obs.cal.df), cpm.cal.raw.df.L, cpm.cal.adj.df.L) +names(dfL) <- c("obs.cal", paste0("cpm.cal.raw.", names(cpm.cal.raw.df.L)), + paste0("cpm.cal.adj.", names(cpm.cal.raw.df.L))) + +#Returns a list of dfs in handy format for graphing +dfg.daily.means <- lapply(dfL, function(i){ + x <- 1:ncol(i) #ignore cols 1 & 2 with x y + #Calc mean and sd + dfx <- lapply(x, function(x){ + y <- i[,x] + mean <- mean(y, na.rm=T) + sd <- sd(y, na.rm=T) + dfr <- data.frame(mean=mean, + sd.high=mean+sd, + sd.low=mean-sd) + dfr$day <- names(i)[x] + return(dfr) + }) + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + +names(dfg.daily.means) <- names(dfL) +``` + +*Note* : Can add a plot here for daily averages but it's quite visually confusing so omitting for now + +#### **Seasonal trends - Calibration period ** + + +```{r} + +#Annotate season based on month index - the dates have different formats depending on the input data (ie hads vs cpm) so am pulling out the necessary to adjust sep + +obs.cal.season.mean <- dfg.daily.means$obs.cal + +x <- dfg.daily.means$obs.cal$day + +obs.cal.season.mean$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#Note = the n days per season is not quite evenly split between the 4 seasons because of how the hads resamples across the year for 360 days + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + rem <- nchar(var) + 39 + year <- substr(x, rem, rem+3) + year <- as.numeric(substr(year, 1,4)) + obs.cal.season.mean$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, obs.cal.season.mean$season), + paste0(year, obs.cal.season.mean$season)) + # Mutate to a seasonal mean df + obs.cal.season.mean <- aggregate(obs.cal.season.mean[[1]], list(obs.cal.season.mean[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - sd(x))) + obs.cal.season.mean<- data.frame(season_year=obs.cal.season.mean$Group.1, + seasonal.mean=obs.cal.season.mean$x[,"seasonal.mean"], + sd.high.seasonal = obs.cal.season.mean$x[,"sd.high.seasonal"], + sd.low.seasonal = obs.cal.season.mean$x[,"sd.low.seasonal"]) + + + #Grouping variable for later vars + obs.cal.season.mean$model <- "obs" + +``` + +## Ruth to finish cleaning up this bit (it won't run at the moment) + +```{r eval=FALSE, include=FALSE} + dfg.seasonal.mean <- lapply(c("raw.cal.daymeans", "bc.b.cal.daymeans", + "bc.a.cal.daymeans"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, + paste0(year-1, df$season), + paste0(year, df$season)) + + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), function(x) c(seasonal.mean = mean(x), sd.high.seasonal = mean(x) + sd(x), sd.low.seasonal = mean(x) - 
sd(x))) + + df2 <- data.frame(season_year=df2$Group.1, + seasonal.mean=df2$x[,"seasonal.mean"], + sd.high.seasonal = df2$x[,"sd.high.seasonal"], + sd.low.seasonal = df2$x[,"sd.low.seasonal"]) + + df2$model <- gsub(".daymeans","",i) + + return(df2)}) + + dff <- c(list(obs.seasonal.mean.df), dfg.seasonal.mean) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + +names(seasonal.means) <- runs + +seasonal.means.df <- seasonal.means %>% reduce(rbind) + +``` + +#### Fig. Calibration period - seasonal mean + +```{r eval=FALSE, include=FALSE} + +ggplot(seasonal.means.df, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +##### *Summer only* + +```{r Raw seasonal winter, eval=FALSE, include=FALSE} + +dfg_sm<- subset(seasonal.means.df, grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, seasonal.mean, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +#### *Annual trends - seasonal max* + +For tasmax - grouping to season and calculating the seasonal maxima vals (i.e. rather than means above) + +```{r eval=FALSE, include=FALSE} + +#Convert to max, out put a df in easy fig format +dfg.max <- lapply(runs, function(r){ + L <- df.rL[[r]] + names(L)[1:3] <- c("obs", "cal", "proj") + dfg <- lapply(names(L), function(ii){ + dfi <- L[[ii]] + x <- 3:ncol(dfi) #ignore cols 1 & 2 with x y + #Calc maxima of the + dfx <- lapply(x, function(x){ + xx <- dfi[,x] + data.frame(max=max(xx, na.rm=T), day= names(dfi)[x]) + }) + + dfx_g <- dfx %>% purrr::reduce(rbind) + }) + + names(dfg) <- paste0(names(L), ".max") + return(dfg) +}) + +names(dfg.max) <- runs + +seasonal.max.cal <- lapply(runs, function(r){ + dfg <- dfg.max[[r]] + #Hads/obs df + df1 <- dfg$obs.max + x <- df1$day + df1$season <- ifelse(grepl("1231_|0131_|0228_|0229_", x), + "Winter", + ifelse(grepl("0331_|0430_|0531_", x), "Spring", + ifelse(grepl("0630_|0731_|0831_", x), "Summer", "Autumn"))) + +#Note: the seasons should each have 90 days but seemingly Winter and Autumn have 89 and Spring and Summer have 91 - this is due to how the manual aligning worked out and should be updated when the hads data is re-run + + #Create season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub("^[^_]*_", "", x) + year <- as.numeric(substr(year, 1,4)) + df1$season_year <- ifelse(grepl("0131_|0228_|0229_", x), + paste0(year-1, df1$season), + paste0(year, df1$season)) + # Mutate to a seasonal mean df + obs.seasonal.max.df <- aggregate(df1[[1]], list(df1[["season_year"]]), max) + #Grouping variable for later vars + obs.seasonal.max.df$model <- "obs" + + dfg.seasonal.max <- lapply(c("cal.max", "qm1.hist.a.max", + "qm1.hist.b.max"), function(i){ + df <- dfg[[i]] + x <- df$day + x <- gsub(".*_", "", x) + x <- as.numeric(x) + #The CPM days are consecutive 1 - 360 by year + df$season <- ifelse(x<91, "Winter", + ifelse(x<181, "Spring", + ifelse(x<271, "Summer", "Autumn"))) + + #Create 
season_year - All Winter months apart from Dec to be added to the previous year (ie Winter 2000) would be the Dec of 2000 to the Feb of 2001 + year <- gsub(".*day_", "", df$day) + year <- as.numeric(substr(year, 1,4)) + df$season_year <- ifelse(x>29&x<91, + paste0(year-1, df$season), + paste0(year, df$season)) + + # Mutate to a seasonal mean -- cant get this to run in tidyverse within loop as cant seem to get col indexing working so: + df2 <- aggregate(df[[1]], list(df[["season_year"]]), max) + + df2$model <- gsub(".max","",i) + + return(df2)}) + + dff <- c(list(obs.seasonal.max.df), dfg.seasonal.max) %>% reduce(rbind) + dff$Run <- r + return(dff) +}) + +names(seasonal.max.cal) <- runs + +seasonal.maxima.df <- seasonal.max.cal %>% reduce(rbind) +names(seasonal.maxima.df) <- c("season_year", "max", "model", "Run") +``` + +#### Fig. Calibration period - seasonal max + +```{r eval=FALSE, include=FALSE} + +ggplot(seasonal.maxima.df, aes(season_year, max, group=model, colour=model)) + + geom_line() + + facet_wrap(.~Run) + + theme_bw() + ylab("Max daily max temp oC") + + ggtitle("Tasmax Hisotric trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + +#### Fig. Calibration period - *Summer only* + +```{r eval=FALSE, include=FALSE} + +dfg_sm<- subset(seasonal.maxima.df, !grepl("qm1.hist.b", model)&grepl("Summer", season_year)) + +ggplot(dfg_sm, aes(season_year, max, group=model, colour=model)) + + geom_line(alpha=0.7) + + facet_wrap(.~Run) + + theme_bw() + ylab("Av daily max temp oC -Summer average") + + ggtitle("Tasmax Historic trends") + + scale_x_discrete(labels = NULL, breaks = NULL) + xlab("Seasonal Summer averages, 1980.12.01 - 2009.12.01") + + scale_color_brewer(palette="Set1", name="Model") + +``` + + +#### *Validation period - annual trends - seasonal mean* + +(To be added) + +#### *Validation period - annual trends - seasonal max* + +(To be added) + +## **2. Bias Correction Assessment: Metrics** + +Using the validation data set for this + + +```{r} + +val.dfs <- c(list(obs.val.df), cpm.val.raw.df.L, cpm.val.adj.df.L) + +#Convert dfs to a vector +val.dfs.v <- lapply(val.dfs, function(d){ + #Convert to single vector + unlist(as.vector(d))}) + +val.dfs.v.df <- as.data.frame(val.dfs.v) +names(val.dfs.v.df) <- c("obs.val", paste0("Run", rep(runs, 2), "_", var, ".",rep(c("raw", "adj", 4)))) # Names for easy reference + +``` + + +### **2a. Descriptive statistics** + +```{r descriptives validation} + +descriptives <- apply(val.dfs.v.df, 2, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) +}) + +descriptives <- descriptives %>% reduce(rbind) +row.names(descriptives) <- names(val.dfs.v.df) +d <- t(descriptives) + +d %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(d)), background = "lightgrey") + +``` + + +#### Fig.Density plot of validation period + +**Note** - need to add back in some facetting to this fig + +```{r warning=F, message=F} +m <- reshape2::melt(val.dfs.v.df) + +ggplot(m, aes(value, fill=variable, colour=variable)) + + geom_density(alpha = 0.3, position="identity") + + theme_minimal() + + scale_fill_brewer(palette = "Set1") + + scale_color_brewer(palette = "Set1") + +``` + +### **2b. 
Model fit statistics** + +Using the following to assess overall fit: + +- **R-squared (rsq)** +- **Root Square Mean Error (RMSE)** +- **Nash-Sutcliffe Efficiency (NSE):** Magnitude of residual variance compared to measured data variance, ranges -∞ to 1, 1 = perfect match to observations +- **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias. + +```{r rsq} +actual <- val.dfs.v.df$obs.val + +rsq <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + cor(actual, x)^2 +}) + +``` + +```{r rmse} + +rmse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + sqrt(mean((actual - x)^2)) +}) + +``` + +```{r pbias} + +pbias <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::pbias(x, actual) +}) + +``` + +```{r nse} +nse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::NSE(x, actual) +}) + +``` + +Highlighting the bias corrected statistics + +```{r pretty kable} + +k <- cbind(rsq, rmse, pbias, nse) +k %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(k)), background = "lightgrey") + +``` + + +## **3. Bias Correction Assessment: Metric specific - tasmax** + +### **3b Days above 30 degrees** + +(Not considered consecutively here) + +```{r eval=FALSE, include=FALSE} + +### Ruth to update + +val.dfs.v.df$year <- substr(row.names(val.dfs.v.df), 8,11) + +over30 <- lapply(names(val.dfs.v.df), function(i){ + x <- val.dfs.v.df[,i] + df <- aggregate(x, list(val.dfs.v.df$year), function(x){sum(x>=30)}) + names(df) <- c("year", paste0("Days.over.30.", i)) + return(df) +}) + +over30 %>% reduce(left_join, "year") +``` + + +### **Number of heatwaves per annum** + +(to be added) + +#### **For future work** + +The number of quantiles selected will effect the efficacy of the bias correction: lots of options therefore with this specific method + + From 4ddee599a4aef5a88ef0b6812e9e00304d0dbb9a Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Wed, 6 Dec 2023 15:18:43 +0000 Subject: [PATCH 38/83] feat(refactor): comment out `python` vs `R` loading options in `Assessing...-workshop.RMD` --- .../Assessing_BC_Data-workshop.RMD | 145 ++++++++++-------- 1 file changed, 77 insertions(+), 68 deletions(-) diff --git a/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD b/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD index 3750231d..e75c2cb5 100644 --- a/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD +++ b/notebooks/Assessing_bc_data/Assessing_BC_Data-workshop.RMD @@ -20,6 +20,7 @@ rm(list=ls()) knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") +# Add in the automatic installation process library(ggplot2) library(terra) @@ -27,6 +28,8 @@ library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) library(kableExtra) +install.package("ncdf4") +library(ncdf4) if (!require(devtools)) install.packages("devtools") library(devtools) @@ -70,73 +73,79 @@ city <- "Glasgow" var <- "tasmax" runs <- c("05", "06", "07", "08") -if(input=="raster"){ ####### PYTHON INPUTS HERE ###### - # This script uses both raster data and the raw data - # This script uses Lists to group everything by runs - # Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) - # .nc and 
.tif files can be read with rast("path/to/file.nc") - # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files - #dfs are assumed to be cells x time - -} else if (input=="RDS"){ - ### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved to a different script. - ## Load a source raster to extract the crs - r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1] - rast <- rast(r) - crs <- crs(rast) - - ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). - rd <- "Debiased/R/QuantileMapping/three.cities/" - - files <- list.files(paste0(dd,rd,city),full.names=T) - files.v <- files[grepl(var, files)] - - allruns <- lapply(files.v, readRDS) - - names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) - names(allruns) <- names - - #This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. Therefore: - obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) - - obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020 - - cpm.cal.raw.df.L <- lapply(allruns, function(L){ - as.data.frame(t(L[["t.cal"]])) - }) - - #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) - cpm.val.raw.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["t.proj"]])) - val.end.date <- min(grep("20201201-", names(proj)))-1 - cpm.val.raw.df <- proj[,1:val.end.date] - }) - - cpm.proj.raw.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["t.proj"]])) - val.end.date <- min(grep("20201201-", names(proj))) - cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] - }) - - cpm.cal.adj.df.L <- lapply(allruns, function(L){ - adj <- as.data.frame(t(L[["qm1.hist"]])) - }) - - cpm.val.adj.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["qm1.val.proj"]])) - val.end.date <- min(grep("20201201-", names(proj)))-1 - proj[,1:val.end.date] - }) - - cpm.proj.adj.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["qm1.val.proj"]])) - val.end.date <- min(grep("20201201-", names(proj))) - proj[,val.end.date:ncol(proj)] - }) - - ## Convert to rasters --requires creation of x and y cols from row names +# if(input=="raster"){ +# This script uses both raster data and the raw data +# This script uses Lists to group everything by runs +# Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) +# .nc and .tif files can be read with rast("path/to/file.nc") +# Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files +#dfs are assumed to be cells x time + +rd.python <- "Debiased/three.cities.cropped" +dd.python <- "/mnt/vmfileshare/ClimateData" #Data directory of all data used in this script +#ncfname <- paste(dd, rd, city, runs[1], var, ".nc", sep="") +r.python <- list.files(paste(dd.python, rd.python, city, runs[2], var, sep="/"), pattern="debiased*", full.names = T) +ncin <- nc_open(r[1]) + +# } else if (input=="RDS"){ +### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved 
to a different script. +## Load a source raster to extract the crs +r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1] +rast <- rast(r) +crs <- crs(rast) + +## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). +rd <- "Debiased/R/QuantileMapping/three.cities/" + +files <- list.files(paste0(dd,rd,city),full.names=T) +files.v <- files[grepl(var, files)] + +allruns <- lapply(files.v, readRDS) + +names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) +names(allruns) <- names + +#This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. Therefore: +obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) + +obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020 + +cpm.cal.raw.df.L <- lapply(allruns, function(L){ + as.data.frame(t(L[["t.cal"]])) + }) + +#In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) +cpm.val.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + cpm.val.raw.df <- proj[,1:val.end.date] +}) + +cpm.proj.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] +}) + +cpm.cal.adj.df.L <- lapply(allruns, function(L){ + adj <- as.data.frame(t(L[["qm1.hist"]])) +}) + + cpm.val.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + proj[,1:val.end.date] +}) + + cpm.proj.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj))) + proj[,val.end.date:ncol(proj)] +}) + +## Convert to rasters --requires creation of x and y cols from row names ## For the comparison, just converting the observation and cpm for the cal and val perios (ie not the projection datasets) obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){ @@ -182,9 +191,9 @@ remove(list2rast) gc() - } else { - print("Invalid input") -} +# } else { +# print("Invalid input") +#} From 4e8f1d0d2159128419bba241b306ad81c5acaa17 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 9 Dec 2023 12:45:46 +0000 Subject: [PATCH 39/83] refactor(debias): add `debias_wrapper.py` for potential new python results --- python/.pytest.ini | 13 +- python/conftest.py | 119 ++++- python/debiasing/debias_wrapper.py | 721 +++++++++++++++++++++++++++++ python/tests/test_debiasing.py | 685 +-------------------------- 4 files changed, 864 insertions(+), 674 deletions(-) create mode 100644 python/debiasing/debias_wrapper.py diff --git a/python/.pytest.ini b/python/.pytest.ini index 5e0f35be..bf6d8df2 100644 --- a/python/.pytest.ini +++ b/python/.pytest.ini @@ -3,16 +3,17 @@ [pytest] minversion = 6.0 addopts = -ra -q - --doctest-modules - --ignore=python/debiasing/python-cmethods - -m "not server" - --pdbcls=IPython.terminal.debugger:TerminalPdb - --cov=. - --cov-report=term:skip-covered + --doctest-modules + --ignore=python/debiasing/python-cmethods + -m "not server" + --pdbcls=IPython.terminal.debugger:TerminalPdb + --cov=. + --cov-report=term:skip-covered pythonpath = . 
testpaths = tests + debiasing/debias_wrapper.py utils.py markers = slow: runs slowly. diff --git a/python/conftest.py b/python/conftest.py index 237d6751..2aef2372 100644 --- a/python/conftest.py +++ b/python/conftest.py @@ -6,6 +6,23 @@ import pytest from coverage_badge.__main__ import main as gen_cov_badge +from debiasing.debias_wrapper import ( + CALIB_DATES_STR_DEFAULT, + CMETHODS_FILE_NAME, + CMETHODS_OUT_FOLDER_DEFAULT, + DATA_PATH_DEFAULT, + MOD_FOLDER_DEFAULT, + OBS_FOLDER_DEFAULT, + PREPROCESS_FILE_NAME, + PREPROCESS_OUT_FOLDER_DEFAULT, + PROCESSESORS_DEFAULT, + VALID_DATES_STR_DEFAULT, + CityOptions, + MethodOptions, + RunOptions, + VariableOptions, +) +from utils import iter_to_tuple_strs BADGE_PATH: Final[Path] = Path("docs") / "assets" / "coverage.svg" CLIMATE_DATA_MOUNT_PATH = Path("/mnt/vmfileshare/ClimateData") @@ -18,6 +35,72 @@ "load_data", ) +CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT: Final[tuple[str, ...]] = ( + "python", + PREPROCESS_FILE_NAME, + "--mod", + DATA_PATH_DEFAULT / MOD_FOLDER_DEFAULT / CityOptions.default(), + "--obs", + DATA_PATH_DEFAULT / OBS_FOLDER_DEFAULT / CityOptions.default(), + "-v", + VariableOptions.default(), + "-r", + RunOptions.default(), + "--out", + ( + DATA_PATH_DEFAULT + / PREPROCESS_OUT_FOLDER_DEFAULT + / CityOptions.default() + / RunOptions.default() + / VariableOptions.default() + ), + "--calib_dates", + CALIB_DATES_STR_DEFAULT, + "--valid_dates", + VALID_DATES_STR_DEFAULT, +) + +CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT: Final[ + tuple[str, ...] +] = iter_to_tuple_strs(CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT) + +CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT: Final[str] = " ".join( + CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT +) + +CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT: Final[tuple[str, ...]] = ( + "python", + CMETHODS_FILE_NAME, + "--input_data_folder", + CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT[11], + "--out", + ( + DATA_PATH_DEFAULT + / CMETHODS_OUT_FOLDER_DEFAULT + / CityOptions.default() + / RunOptions.default() + ).resolve(), + "--method", + MethodOptions.default(), + "-v", + VariableOptions.default(), + "-p", + PROCESSESORS_DEFAULT, +) + +CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT: Final[ + tuple[str, ...] 
+] = iter_to_tuple_strs(CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT) + +CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT: Final[str] = " ".join( + CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT +) + + +MOD_FOLDER_FILES_COUNT_CORRECT: Final[int] = 1478 +OBS_FOLDER_FILES_COUNT_CORRECT: Final[int] = MOD_FOLDER_FILES_COUNT_CORRECT +PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT: Final[int] = 4 + @pytest.fixture() def is_platform_darwin() -> bool: @@ -38,15 +121,47 @@ def ensure_python_path() -> None: raise ValueError( f"'clim-recal' python tests must be " f"run in 'clim-recal/{PYTHON_DIR_NAME}', " - f"not '{path.absolute()}'" + f"not '{TEST_PATH.absolute()}'" ) +@pytest.fixture +def preprocess_out_folder_files_count_correct() -> int: + """Return `PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT`.""" + return PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT + + @pytest.fixture(autouse=True) def doctest_auto_fixtures( - doctest_namespace: dict, is_platform_darwin: bool, is_climate_data_mounted: bool + doctest_namespace: dict, + is_platform_darwin: bool, + is_climate_data_mounted: bool, + preprocess_out_folder_files_count_correct: int, ) -> None: """Elements to add to default `doctest` namespace.""" + doctest_namespace[ + "CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT" + ] = CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT + doctest_namespace[ + "CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT" + ] = CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT + doctest_namespace[ + "CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT" + ] = CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT + doctest_namespace["MOD_FOLDER_FILES_COUNT_CORRECT"] = MOD_FOLDER_FILES_COUNT_CORRECT + doctest_namespace["OBS_FOLDER_FILES_COUNT_CORRECT"] = OBS_FOLDER_FILES_COUNT_CORRECT + doctest_namespace[ + "PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT" + ] = preprocess_out_folder_files_count_correct + doctest_namespace[ + "CLI_CMEHTODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT" + ] = CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT + doctest_namespace[ + "CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT" + ] = CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT + doctest_namespace[ + "CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT" + ] = CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT doctest_namespace["is_platform_darwin"] = is_platform_darwin doctest_namespace["is_climate_data_mounted"] = is_climate_data_mounted doctest_namespace["pprint"] = pprint diff --git a/python/debiasing/debias_wrapper.py b/python/debiasing/debias_wrapper.py new file mode 100644 index 00000000..4e8713af --- /dev/null +++ b/python/debiasing/debias_wrapper.py @@ -0,0 +1,721 @@ +"""Wrapper for running `preprocess_data.py` and `run_cmethods.py`""" +import sys +from dataclasses import dataclass +from datetime import date +from enum import auto +from os import PathLike +from pathlib import Path +from typing import Final, Generator, Optional, Union + +import pytest +from utils import ( + DATE_FORMAT_SPLIT_STR, + DATE_FORMAT_STR, + DateType, + date_range_to_str, + iter_to_tuple_strs, + path_iterdir, +) + +if sys.version_info >= (3, 11): + from enum import StrEnum +else: + from backports.strenum import StrEnum + +DATA_PATH_DEFAULT: Final[Path] = Path( + "/mnt/vmfileshare/ClimateData/Cropped/three.cities/" +) + +COMMAND_DIR_DEFAULT: Final[Path] = Path("debiasing").resolve() +PREPROCESS_FILE_NAME: Final[Path] = Path("preprocess_data.py") +CMETHODS_FILE_NAME: Final[Path] = Path("run_cmethods.py") + + +class VariableOptions(StrEnum): + """Supported options for variables""" + + TASMAX = auto() + RAINFALL = auto() + TASMIN = auto() + + @classmethod + def 
default(cls) -> str: + """Default option.""" + return cls.TASMAX.value + + +class RunOptions(StrEnum): + """Supported options for variables""" + + FIVE = "05" + SEVEN = "07" + EIGHT = "08" + SIX = "06" + + @classmethod + def default(cls) -> str: + """Default option.""" + return cls.FIVE.value + + +class CityOptions(StrEnum): + """Supported options for variables.""" + + GLASGOW = "Glasgow" + MANCHESTER = "Manchester" + LONDON = "London" + + @classmethod + def default(cls) -> str: + """Default option.""" + return cls.MANCHESTER.value + + +class MethodOptions(StrEnum): + """Supported options for methods.""" + + QUANTILE_DELTA_MAPPING = auto() + QUANTILE_MAPPING = auto() + VARIANCE_SCALING = auto() + DELTA_METHOD = auto() + + @classmethod + def default(cls) -> str: + """Default method option.""" + return cls.QUANTILE_DELTA_MAPPING.value + + +PROCESSESORS_DEFAULT: Final[int] = 2 +RUN_PREFIX_DEFAULT: Final[str] = "python" + +MOD_FOLDER_DEFAULT: Final[Path] = Path("CPM") +OBS_FOLDER_DEFAULT: Final[Path] = Path("Hads.updated360") +PREPROCESS_OUT_FOLDER_DEFAULT: Final[Path] = Path("Preprocessed") +CMETHODS_OUT_FOLDER_DEFAULT: Final[Path] = Path("../../Debiased/three.cities.cropped") + +CALIB_DATE_START_DEFAULT: DateType = date(1981, 1, 1) +CALIB_DATE_END_DEFAULT: DateType = date(1981, 12, 30) + +VALID_DATE_START_DEFAULT: DateType = date(2010, 1, 1) +VALID_DATE_END_DEFAULT: DateType = date(2010, 12, 30) + +CALIB_DATES_STR_DEFAULT: Final[str] = date_range_to_str( + CALIB_DATE_START_DEFAULT, CALIB_DATE_END_DEFAULT +) +VALID_DATES_STR_DEFAULT: Final[str] = date_range_to_str( + VALID_DATE_START_DEFAULT, VALID_DATE_END_DEFAULT +) + + +@dataclass +class RunConfig: + + """Manage creating command line scripts to run `debiasing` `cli`.""" + + command_dir: Path = COMMAND_DIR_DEFAULT + variable: str = VariableOptions.default() + run: str = RunOptions.default() + city: str = CityOptions.default() + method: str = MethodOptions.default() + run_prefix: str = RUN_PREFIX_DEFAULT + preprocess_data_file: PathLike = PREPROCESS_FILE_NAME + run_cmethods_file: PathLike = CMETHODS_FILE_NAME + + data_path: Path = DATA_PATH_DEFAULT + mod_folder: PathLike = MOD_FOLDER_DEFAULT + obs_folder: PathLike = OBS_FOLDER_DEFAULT + preprocess_out_folder: PathLike = PREPROCESS_OUT_FOLDER_DEFAULT + cmethods_out_folder: PathLike = CMETHODS_OUT_FOLDER_DEFAULT + + calib_date_start: DateType = CALIB_DATE_START_DEFAULT + calib_date_end: DateType = CALIB_DATE_END_DEFAULT + + valid_date_start: DateType = VALID_DATE_START_DEFAULT + valid_date_end: DateType = VALID_DATE_END_DEFAULT + + processors: int = PROCESSESORS_DEFAULT + + date_format_str: str = DATE_FORMAT_STR + date_split_str: str = DATE_FORMAT_SPLIT_STR + + def calib_dates_to_str( + self, + start_date: DateType, + end_date: DateType, + in_format_str: Optional[str] = None, + out_format_str: Optional[str] = None, + split_str: Optional[str] = None, + ) -> str: + """Return date range as `str` from `calib_date_start` to `calib_date_end`. 
+ + Example + ------- + >>> config: RunConfig = RunConfig() + >>> config.calib_dates_to_str('20100101', '20100330') + '20100101-20100330' + >>> config.calib_dates_to_str(date(2010, 1, 1), '20100330') + '20100101-20100330' + >>> config.calib_dates_to_str(date(2010, 1, 1), '20100330', split_str="_") + '20100101_20100330' + """ + start_date = start_date if start_date else self.calib_date_start + end_date = end_date if end_date else self.calib_date_end + return self._date_range_to_str( + start_date, end_date, in_format_str, out_format_str, split_str + ) + + def valid_dates_to_str( + self, + start_date: DateType, + end_date: DateType, + in_format_str: Optional[str] = None, + out_format_str: Optional[str] = None, + split_str: Optional[str] = None, + ) -> str: + """Return date range as `str` from `valid_date_start` to `valid_date_end`. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> config.valid_dates_to_str('20100101', '20100330') + '20100101-20100330' + >>> config.valid_dates_to_str(date(2010, 1, 1), '20100330') + '20100101-20100330' + >>> config.valid_dates_to_str(date(2010, 1, 1), '20100330', split_str="_") + '20100101_20100330' + """ + start_date = start_date if start_date else self.valid_date_start + end_date = end_date if end_date else self.valid_date_end + return self._date_range_to_str( + start_date, end_date, in_format_str, out_format_str, split_str + ) + + def _date_range_to_str( + self, + start_date: DateType, + end_date: DateType, + in_format_str: Optional[str] = None, + out_format_str: Optional[str] = None, + split_str: Optional[str] = None, + ) -> str: + """Return date range as `str` from `calib_date_start` to `calib_date_end`. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> config._date_range_to_str('20100101', '20100330') + '20100101-20100330' + >>> config._date_range_to_str(date(2010, 1, 1), '20100330') + '20100101-20100330' + >>> config._date_range_to_str(date(2010, 1, 1), '20100330', split_str="_") + '20100101_20100330' + """ + in_format_str = in_format_str if in_format_str else self.date_format_str + out_format_str = out_format_str if out_format_str else self.date_format_str + split_str = split_str if split_str else self.date_split_str + return date_range_to_str( + start_date=start_date, + end_date=end_date, + in_format_str=in_format_str, + out_format_str=out_format_str, + split_str=split_str, + ) + + def mod_path(self, city: Optional[str] = None) -> Path: + """Return city estimates path. + + Example + ------- + >>> if not is_climate_data_mounted: + ... pytest.skip('requires linux server mount paths') + >>> config: RunConfig = RunConfig() + >>> config.mod_path() + PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/Manchester') + >>> config.mod_path('Glasgow') + PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/Glasgow') + """ + city = city if city else self.city + return self.data_path / self.mod_folder / city + + def obs_path(self, city: Optional[str] = None) -> Path: + """Return city observations path. + + Example + ------- + >>> if not is_climate_data_mounted: + ... 
pytest.skip('requires linux server mount paths') + >>> config: RunConfig = RunConfig() + >>> config.obs_path() + PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/Manchester') + >>> config.obs_path('Glasgow') + PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/Glasgow') + """ + city = city if city else self.city + return self.data_path / self.obs_folder / city + + def preprocess_out_path( + self, + city: Optional[str] = None, + run: Optional[str] = None, + variable: Optional[str] = None, + ) -> Path: + """Return path to save results. + + Example + ------- + >>> if not is_climate_data_mounted: + ... pytest.skip('requires linux server mount paths') + >>> config: RunConfig = RunConfig() + >>> config.preprocess_out_path() + PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/Manchester/05/tasmax') + >>> config.preprocess_out_path(city='Glasgow', run='07') + PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/Glasgow/07/tasmax') + """ + city = city if city else self.city + run = run if run else self.run + variable = variable if variable else self.variable + return ( + self.data_path / self.preprocess_out_folder / city / run / variable + ).resolve() + + def cmethods_out_path( + self, + city: Optional[str] = None, + run: Optional[str] = None, + ) -> Path: + """Return path to save cmethods results. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> config.cmethods_out_path() + PosixPath('/mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/Manchester/05') + >>> config.cmethods_out_path(city='Glasgow', run='07') + PosixPath('/mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/Glasgow/07') + """ + city = city if city else self.city + run = run if run else self.run + return (self.data_path / self.cmethods_out_folder / city / run).resolve() + + @property + def run_prefix_tuple(self) -> tuple[str, ...]: + """Split `self.run_prefix` by ' ' to a `tuple`. + + Example + ------- + >>> config: RunConfig = RunConfig(run_prefix='python -m') + >>> config.run_prefix_tuple + ('python', '-m') + """ + return tuple(self.run_prefix.split(" ")) + + def to_cli_preprocess_tuple( + self, + variable: Optional[str] = None, + run: Optional[str] = None, + city: Optional[str] = None, + calib_start: Optional[DateType] = None, + calib_end: Optional[DateType] = None, + valid_start: Optional[DateType] = None, + valid_end: Optional[DateType] = None, + ) -> tuple[Union[str, PathLike], ...]: + """Generate a `tuple` of `str` for a command line command. + + Note + ---- + This will leave `Path` objects uncoverted. See + `self.to_cli_preprocess_tuple_strs` for passing to a terminal. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> command_str_tuple: tuple[str, ...] 
= config.to_cli_preprocess_tuple() + >>> assert command_str_tuple == CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT + """ + city = city if city else self.city + variable = variable if variable else self.variable + run = run if run else self.run + + mod_path: Path = self.mod_path(city=city) + obs_path: Path = self.obs_path(city=city) + preprocess_out_path: Path = self.preprocess_out_path( + city=city, run=run, variable=variable + ) + calib_dates_str: str = self.calib_dates_to_str( + start_date=calib_start, end_date=calib_end + ) + valid_dates_str: str = self.valid_dates_to_str( + start_date=valid_start, end_date=valid_end + ) + + return ( + *self.run_prefix_tuple, + self.preprocess_data_file, + "--mod", + mod_path, + "--obs", + obs_path, + "-v", + variable, + "-r", + run, + "--out", + preprocess_out_path, + "--calib_dates", + calib_dates_str, + "--valid_dates", + valid_dates_str, + ) + + def to_cli_preprocess_tuple_strs( + self, + variable: Optional[str] = None, + run: Optional[str] = None, + city: Optional[str] = None, + calib_start: Optional[DateType] = None, + calib_end: Optional[DateType] = None, + valid_start: Optional[DateType] = None, + valid_end: Optional[DateType] = None, + ) -> tuple[str, ...]: + """Generate a command line interface `str` `tuple` a test example. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> command_str_tuple: tuple[str, ...] = config.to_cli_preprocess_tuple_strs() + >>> assert command_str_tuple == CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT + """ + return iter_to_tuple_strs( + self.to_cli_preprocess_tuple( + variable=variable, + run=run, + city=city, + calib_start=calib_start, + calib_end=calib_end, + valid_start=valid_start, + valid_end=valid_end, + ) + ) + + def to_cli_preprocess_str( + self, + variable: Optional[str] = None, + run: Optional[str] = None, + city: Optional[str] = None, + calib_start: Optional[DateType] = None, + calib_end: Optional[DateType] = None, + valid_start: Optional[DateType] = None, + valid_end: Optional[DateType] = None, + ) -> str: + """Generate a command line interface str as a test example. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> config.to_cli_preprocess_str() == CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT + True + >>> CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT[:96] #doctest: +ELLIPSIS + 'python preprocess_data.py --mod /.../CPM/Manchester' + """ + return " ".join( + self.to_cli_preprocess_tuple_strs( + variable=variable, + run=run, + city=city, + calib_start=calib_start, + calib_end=calib_end, + valid_start=valid_start, + valid_end=valid_end, + ) + ) + + def yield_mod_folder( + self, city: Optional[str] = None + ) -> Generator[Path, None, None]: + """`Iterable` of all `Path`s in `self.mod_folder`. + + Example + ------- + >>> if not is_climate_data_mounted: + ... pytest.skip('requires linux server mount paths') + >>> config: RunConfig = RunConfig() + >>> len(tuple(config.yield_mod_folder())) == MOD_FOLDER_FILES_COUNT_CORRECT + True + """ + city = city if city else self.city + return path_iterdir(self.obs_path(city=city)) + + def yield_obs_folder( + self, city: Optional[str] = None + ) -> Generator[Path, None, None]: + """`Iterable` of all `Path`s in `self.obs_folder`. + + Example + ------- + >>> if not is_climate_data_mounted: + ... 
pytest.skip('requires linux server mount paths') + >>> config: RunConfig = RunConfig() + >>> len(tuple(config.yield_obs_folder())) == OBS_FOLDER_FILES_COUNT_CORRECT + True + """ + city = city if city else self.city + return path_iterdir(self.obs_path(city=city)) + + def yield_preprocess_out_folder( + self, + city: Optional[str] = None, + run: Optional[str] = None, + variable: Optional[str] = None, + ) -> Generator[Path, None, None]: + """`Iterable` of all `Path`s in `self.preprocess_out_folder`. + + Example + ------- + >>> if not is_climate_data_mounted: + ... pytest.skip('requires linux server mount paths') + >>> config: RunConfig = RunConfig() + >>> (len(tuple(config.yield_preprocess_out_folder())) == + ... PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT) + True + """ + city = city if city else self.city + run = run if run else self.run + variable = variable if variable else self.variable + return path_iterdir( + self.preprocess_out_path(city=city, run=run, variable=variable) + ) + + @property + def command_path(self) -> Path: + """Return command path relative to running tests.""" + return (Path() / self.command_dir).absolute() + + def to_cli_run_cmethods_tuple( + self, + city: Optional[str] = None, + run: Optional[str] = None, + variable: Optional[str] = None, + method: Optional[str] = None, + input_data_path: Optional[PathLike] = None, + cmethods_out_path: Optional[PathLike] = None, + processors: Optional[int] = None, + ) -> tuple[Union[str, PathLike], ...]: + """Generate a `tuple` of `str` for a command line command. + + Note + ---- + This will leave `Path` objects uncoverted. See + `self.to_cli_run_cmethods_tuple_strs` for passing to a terminal. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> command_str_tuple: tuple[str, ...] = config.to_cli_run_cmethods_tuple() + >>> assert command_str_tuple == CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT + """ + city = city if city else self.city + variable = variable if variable else self.variable + run = run if run else self.run + method = method if method else self.method + processors = processors if processors else self.processors + + input_data_path = ( + input_data_path + if input_data_path + else self.preprocess_out_path(city=city, run=run, variable=variable) + ) + + cmethods_out_path = ( + cmethods_out_path + if cmethods_out_path + else self.cmethods_out_path(city=city, run=run) + ) + + return ( + *self.run_prefix_tuple, + self.run_cmethods_file, + "--input_data_folder", + input_data_path, + "--out", + cmethods_out_path, + "--method", + method, + "-v", + variable, + "-p", + processors, + ) + + def to_cli_run_cmethods_tuple_strs( + self, + city: Optional[str] = None, + run: Optional[str] = None, + variable: Optional[str] = None, + method: Optional[str] = None, + input_data_path: Optional[PathLike] = None, + cmethods_out_path: Optional[PathLike] = None, + processors: Optional[int] = None, + ) -> tuple[str, ...]: + """Generate a command line interface `str` `tuple` a test example. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> command_str_tuple: tuple[str, ...] 
= config.to_cli_run_cmethods_tuple_strs() + >>> assert command_str_tuple == CLI_CMEHTODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT + """ + return iter_to_tuple_strs( + self.to_cli_run_cmethods_tuple( + city=city, + run=run, + variable=variable, + method=method, + input_data_path=input_data_path, + cmethods_out_path=cmethods_out_path, + processors=processors, + ) + ) + + def to_cli_run_cmethods_str( + self, + city: Optional[str] = None, + run: Optional[str] = None, + variable: Optional[str] = None, + method: Optional[str] = None, + input_data_path: Optional[PathLike] = None, + cmethods_out_path: Optional[PathLike] = None, + processors: Optional[int] = None, + ) -> str: + """Generate a command line interface str as a test example. + + Example + ------- + >>> config: RunConfig = RunConfig() + >>> config.to_cli_run_cmethods_str() == CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT + True + >>> CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT #doctest: +ELLIPSIS + 'python run_cmethods.py...--method quantile_delta_mapping...' + """ + return " ".join( + self.to_cli_run_cmethods_tuple_strs( + city=city, + run=run, + variable=variable, + method=method, + input_data_path=input_data_path, + cmethods_out_path=cmethods_out_path, + processors=processors, + ) + ) + + +# @pytest.mark.server +# @pytest.mark.slow +# @pytest.mark.parametrize( +# +# "city, variable, run, method, calib_start, calib_end, valid_start, valid_end", +# ( +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.FIVE, +# MethodOptions.QUANTILE_DELTA_MAPPING, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.FIVE, +# MethodOptions.DELTA_METHOD, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.FIVE, +# MethodOptions.QUANTILE_MAPPING, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.FIVE, +# MethodOptions.VARIANCE_SCALING, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.SIX, +# MethodOptions.QUANTILE_DELTA_MAPPING, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.SIX, +# MethodOptions.DELTA_METHOD, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.SIX, +# MethodOptions.QUANTILE_MAPPING, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ( +# CityOptions.GLASGOW, +# VariableOptions.TASMAX, +# RunOptions.SIX, +# MethodOptions.VARIANCE_SCALING, +# date(1980, 12, 1), +# date(2010, 11, 29), +# date(2010, 11, 30), +# date(2020, 11, 30), +# ), +# ), +# ) +# def test_run_workshop(run_config, city, variable, run, method, calib_start, calib_end, valid_start, valid_end) -> None: +# """Test running generated command script via a subprocess.""" +# initial_folder: Path = Path().resolve() +# chdir(run_config.command_path) +# assert PREPROCESS_FILE_NAME in tuple(Path().iterdir()) +# preprocess_run: subprocess.CompletedProcess = subprocess.run( +# run_config.to_cli_preprocess_tuple_strs(city=city, variable=variable, 
run=run, calib_start=calib_start, calib_end=calib_end, valid_start=valid_start, valid_end=valid_end), +# capture_output=True, +# text=True, +# ) +# assert preprocess_run.returncode == 0 +# cmethods_run: subprocess.CompletedProcess = subprocess.run( +# run_config.to_cli_run_cmethods_tuple_strs( +# city=city, run=run, variable=variable, method=method +# ), +# capture_output=True, +# text=True, +# ) +# assert cmethods_run.returncode == 0 +# +# chdir(initial_folder) +# assert False diff --git a/python/tests/test_debiasing.py b/python/tests/test_debiasing.py index 5cf8c1d4..16d8ed5f 100644 --- a/python/tests/test_debiasing.py +++ b/python/tests/test_debiasing.py @@ -3,681 +3,34 @@ """ import subprocess -import sys -from dataclasses import dataclass -from datetime import date -from enum import auto -from os import PathLike, chdir +from os import chdir from pathlib import Path -from typing import Final, Generator, Optional, Union import pytest -from utils import ( - DATE_FORMAT_SPLIT_STR, - DATE_FORMAT_STR, - DateType, - date_range_to_str, - iter_to_tuple_strs, - path_iterdir, +from conftest import ( + CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT, + MOD_FOLDER_FILES_COUNT_CORRECT, + OBS_FOLDER_FILES_COUNT_CORRECT, + PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT, ) - -if sys.version_info >= (3, 11): - from enum import StrEnum -else: - from backports.strenum import StrEnum - -DATA_PATH_DEFAULT: Final[Path] = Path( - "/mnt/vmfileshare/ClimateData/Cropped/three.cities/" -) - -COMMAND_DIR_DEFAULT: Final[Path] = Path("debiasing").resolve() -PREPROCESS_FILE_NAME: Final[Path] = Path("preprocess_data.py") -CMETHODS_FILE_NAME: Final[Path] = Path("run_cmethods.py") - - -class VariableOptions(StrEnum): - """Supported options for variables""" - - TASMAX = auto() - RAINFALL = auto() - TASMIN = auto() - - @classmethod - def default(cls) -> str: - """Default option.""" - return cls.TASMAX.value - - -class RunOptions(StrEnum): - """Supported options for variables""" - - FIVE = "05" - SEVEN = "07" - EIGHT = "08" - SIX = "06" - - @classmethod - def default(cls) -> str: - """Default option.""" - return cls.FIVE.value - - -class CityOptions(StrEnum): - """Supported options for variables.""" - - GLASGOW = "Glasgow" - MANCHESTER = "Manchester" - LONDON = "London" - - @classmethod - def default(cls) -> str: - """Default option.""" - return cls.MANCHESTER.value - - -class MethodOptions(StrEnum): - """Supported options for methods.""" - - QUANTILE_DELTA_MAPPING = auto() - QUANTILE_MAPPING = auto() - VARIANCE_SCALING = auto() - DELTA_METHOD = auto() - - @classmethod - def default(cls) -> str: - """Default method option.""" - return cls.QUANTILE_DELTA_MAPPING.value - - -PROCESSESORS_DEFAULT: Final[int] = 2 -RUN_PREFIX_DEFAULT: Final[str] = "python" - -MOD_FOLDER_DEFAULT: Final[Path] = Path("CPM") -OBS_FOLDER_DEFAULT: Final[Path] = Path("Hads.updated360") -PREPROCESS_OUT_FOLDER_DEFAULT: Final[Path] = Path("Preprocessed") -CMETHODS_OUT_FOLDER_DEFAULT: Final[Path] = Path("../../Debiased/three.cities.cropped") - -CALIB_DATE_START_DEFAULT: DateType = date(1981, 1, 1) -CALIB_DATE_END_DEFAULT: DateType = date(1981, 12, 30) - -VALID_DATE_START_DEFAULT: DateType = date(2010, 1, 1) -VALID_DATE_END_DEFAULT: DateType = date(2010, 12, 30) - -CALIB_DATES_STR_DEFAULT: Final[str] = date_range_to_str( - CALIB_DATE_START_DEFAULT, CALIB_DATE_END_DEFAULT -) -VALID_DATES_STR_DEFAULT: Final[str] = date_range_to_str( - VALID_DATE_START_DEFAULT, VALID_DATE_END_DEFAULT -) - - -CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT: Final[tuple[str, ...]] = ( - 
"python", - PREPROCESS_FILE_NAME, - "--mod", - DATA_PATH_DEFAULT / MOD_FOLDER_DEFAULT / CityOptions.default(), - "--obs", - DATA_PATH_DEFAULT / OBS_FOLDER_DEFAULT / CityOptions.default(), - "-v", - VariableOptions.default(), - "-r", - RunOptions.default(), - "--out", - ( - DATA_PATH_DEFAULT - / PREPROCESS_OUT_FOLDER_DEFAULT - / CityOptions.default() - / RunOptions.default() - / VariableOptions.default() - ), - "--calib_dates", +from debiasing.debias_wrapper import ( CALIB_DATES_STR_DEFAULT, - "--valid_dates", - VALID_DATES_STR_DEFAULT, -) - -CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT: Final[ - tuple[str, ...] -] = iter_to_tuple_strs(CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT) - -CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT: Final[str] = " ".join( - CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT -) - -CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT: Final[tuple[str, ...]] = ( - "python", CMETHODS_FILE_NAME, - "--input_data_folder", - CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT[11], - "--out", - ( - DATA_PATH_DEFAULT - / CMETHODS_OUT_FOLDER_DEFAULT - / CityOptions.default() - / RunOptions.default() - ).resolve(), - "--method", - MethodOptions.default(), - "-v", - VariableOptions.default(), - "-p", + CMETHODS_OUT_FOLDER_DEFAULT, + DATA_PATH_DEFAULT, + MOD_FOLDER_DEFAULT, + OBS_FOLDER_DEFAULT, + PREPROCESS_FILE_NAME, + PREPROCESS_OUT_FOLDER_DEFAULT, PROCESSESORS_DEFAULT, + VALID_DATES_STR_DEFAULT, + CityOptions, + MethodOptions, + RunConfig, + RunOptions, + VariableOptions, ) -CLI_CMEHTODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT: Final[ - tuple[str, ...] -] = iter_to_tuple_strs(CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT) -CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT: Final[str] = " ".join( - CLI_CMEHTODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT -) - - -MOD_FOLDER_FILES_COUNT_CORRECT: Final[int] = 1478 -OBS_FOLDER_FILES_COUNT_CORRECT: Final[int] = MOD_FOLDER_FILES_COUNT_CORRECT -PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT: Final[int] = 4 - - -@dataclass -class RunConfig: - - """Manage creating command line scripts to run `debiasing` `cli`.""" - - command_dir: Path = COMMAND_DIR_DEFAULT - variable: str = VariableOptions.default() - run: str = RunOptions.default() - city: str = CityOptions.default() - method: str = MethodOptions.default() - run_prefix: str = RUN_PREFIX_DEFAULT - preprocess_data_file: PathLike = PREPROCESS_FILE_NAME - run_cmethods_file: PathLike = CMETHODS_FILE_NAME - - data_path: Path = DATA_PATH_DEFAULT - mod_folder: PathLike = MOD_FOLDER_DEFAULT - obs_folder: PathLike = OBS_FOLDER_DEFAULT - preprocess_out_folder: PathLike = PREPROCESS_OUT_FOLDER_DEFAULT - cmethods_out_folder: PathLike = CMETHODS_OUT_FOLDER_DEFAULT - - calib_date_start: DateType = CALIB_DATE_START_DEFAULT - calib_date_end: DateType = CALIB_DATE_END_DEFAULT - - valid_date_start: DateType = VALID_DATE_START_DEFAULT - valid_date_end: DateType = VALID_DATE_END_DEFAULT - - processors: int = PROCESSESORS_DEFAULT - - date_format_str: str = DATE_FORMAT_STR - date_split_str: str = DATE_FORMAT_SPLIT_STR - - def calib_dates_to_str( - self, - start_date: DateType, - end_date: DateType, - in_format_str: Optional[str] = None, - out_format_str: Optional[str] = None, - split_str: Optional[str] = None, - ) -> str: - """Return date range as `str` from `calib_date_start` to `calib_date_end`. 
- - Example - ------- - >>> config: RunConfig = RunConfig() - >>> config.calib_dates_to_str('20100101', '20100330') - '20100101-20100330' - >>> config.calib_dates_to_str(date(2010, 1, 1), '20100330') - '20100101-20100330' - >>> config.calib_dates_to_str(date(2010, 1, 1), '20100330', split_str="_") - '20100101_20100330' - """ - start_date = start_date if start_date else self.calib_date_start - end_date = end_date if end_date else self.calib_date_end - return self._date_range_to_str( - start_date, end_date, in_format_str, out_format_str, split_str - ) - - def valid_dates_to_str( - self, - start_date: DateType, - end_date: DateType, - in_format_str: Optional[str] = None, - out_format_str: Optional[str] = None, - split_str: Optional[str] = None, - ) -> str: - """Return date range as `str` from `valid_date_start` to `valid_date_end`. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> config.valid_dates_to_str('20100101', '20100330') - '20100101-20100330' - >>> config.valid_dates_to_str(date(2010, 1, 1), '20100330') - '20100101-20100330' - >>> config.valid_dates_to_str(date(2010, 1, 1), '20100330', split_str="_") - '20100101_20100330' - """ - start_date = start_date if start_date else self.valid_date_start - end_date = end_date if end_date else self.valid_date_end - return self._date_range_to_str( - start_date, end_date, in_format_str, out_format_str, split_str - ) - - def _date_range_to_str( - self, - start_date: DateType, - end_date: DateType, - in_format_str: Optional[str] = None, - out_format_str: Optional[str] = None, - split_str: Optional[str] = None, - ) -> str: - """Return date range as `str` from `calib_date_start` to `calib_date_end`. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> config._date_range_to_str('20100101', '20100330') - '20100101-20100330' - >>> config._date_range_to_str(date(2010, 1, 1), '20100330') - '20100101-20100330' - >>> config._date_range_to_str(date(2010, 1, 1), '20100330', split_str="_") - '20100101_20100330' - """ - in_format_str = in_format_str if in_format_str else self.date_format_str - out_format_str = out_format_str if out_format_str else self.date_format_str - split_str = split_str if split_str else self.date_split_str - return date_range_to_str( - start_date=start_date, - end_date=end_date, - in_format_str=in_format_str, - out_format_str=out_format_str, - split_str=split_str, - ) - - def mod_path(self, city: Optional[str] = None) -> Path: - """Return city estimates path. - - Example - ------- - >>> if not is_climate_data_mounted: - ... pytest.skip('requires linux server mount paths') - >>> config: RunConfig = RunConfig() - >>> config.mod_path() - PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/Manchester') - >>> config.mod_path('Glasgow') - PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/Glasgow') - """ - city = city if city else self.city - return self.data_path / self.mod_folder / city - - def obs_path(self, city: Optional[str] = None) -> Path: - """Return city observations path. - - Example - ------- - >>> if not is_climate_data_mounted: - ... 
pytest.skip('requires linux server mount paths') - >>> config: RunConfig = RunConfig() - >>> config.obs_path() - PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/Manchester') - >>> config.obs_path('Glasgow') - PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/Glasgow') - """ - city = city if city else self.city - return self.data_path / self.obs_folder / city - - def preprocess_out_path( - self, - city: Optional[str] = None, - run: Optional[str] = None, - variable: Optional[str] = None, - ) -> Path: - """Return path to save results. - - Example - ------- - >>> if not is_climate_data_mounted: - ... pytest.skip('requires linux server mount paths') - >>> config: RunConfig = RunConfig() - >>> config.preprocess_out_path() - PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/Manchester/05/tasmax') - >>> config.preprocess_out_path(city='Glasgow', run='07') - PosixPath('/mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/Glasgow/07/tasmax') - """ - city = city if city else self.city - run = run if run else self.run - variable = variable if variable else self.variable - return ( - self.data_path / self.preprocess_out_folder / city / run / variable - ).resolve() - - def cmethods_out_path( - self, - city: Optional[str] = None, - run: Optional[str] = None, - ) -> Path: - """Return path to save cmethods results. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> config.cmethods_out_path() - PosixPath('/mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/Manchester/05') - >>> config.cmethods_out_path(city='Glasgow', run='07') - PosixPath('/mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/Glasgow/07') - """ - city = city if city else self.city - run = run if run else self.run - return (self.data_path / self.cmethods_out_folder / city / run).resolve() - - @property - def run_prefix_tuple(self) -> tuple[str, ...]: - """Split `self.run_prefix` by ' ' to a `tuple`. - - Example - ------- - >>> config: RunConfig = RunConfig(run_prefix='python -m') - >>> config.run_prefix_tuple - ('python', '-m') - """ - return tuple(self.run_prefix.split(" ")) - - def to_cli_preprocess_tuple( - self, - variable: Optional[str] = None, - run: Optional[str] = None, - city: Optional[str] = None, - calib_start: Optional[DateType] = None, - calib_end: Optional[DateType] = None, - valid_start: Optional[DateType] = None, - valid_end: Optional[DateType] = None, - ) -> tuple[Union[str, PathLike], ...]: - """Generate a `tuple` of `str` for a command line command. - - Note - ---- - This will leave `Path` objects uncoverted. See - `self.to_cli_preprocess_tuple_strs` for passing to a terminal. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> command_str_tuple: tuple[str, ...] 
= config.to_cli_preprocess_tuple() - >>> assert command_str_tuple == CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_CORRECT - """ - city = city if city else self.city - variable = variable if variable else self.variable - run = run if run else self.run - - mod_path: Path = self.mod_path(city=city) - obs_path: Path = self.obs_path(city=city) - preprocess_out_path: Path = self.preprocess_out_path( - city=city, run=run, variable=variable - ) - calib_dates_str: str = self.calib_dates_to_str( - start_date=calib_start, end_date=calib_end - ) - valid_dates_str: str = self.valid_dates_to_str( - start_date=valid_start, end_date=valid_end - ) - - return ( - *self.run_prefix_tuple, - self.preprocess_data_file, - "--mod", - mod_path, - "--obs", - obs_path, - "-v", - variable, - "-r", - run, - "--out", - preprocess_out_path, - "--calib_dates", - calib_dates_str, - "--valid_dates", - valid_dates_str, - ) - - def to_cli_preprocess_tuple_strs( - self, - variable: Optional[str] = None, - run: Optional[str] = None, - city: Optional[str] = None, - calib_start: Optional[DateType] = None, - calib_end: Optional[DateType] = None, - valid_start: Optional[DateType] = None, - valid_end: Optional[DateType] = None, - ) -> tuple[str, ...]: - """Generate a command line interface `str` `tuple` a test example. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> command_str_tuple: tuple[str, ...] = config.to_cli_preprocess_tuple_strs() - >>> assert command_str_tuple == CLI_PREPROCESS_DEFAULT_COMMAND_TUPLE_STR_CORRECT - """ - return iter_to_tuple_strs( - self.to_cli_preprocess_tuple( - variable=variable, - run=run, - city=city, - calib_start=calib_start, - calib_end=calib_end, - valid_start=valid_start, - valid_end=valid_end, - ) - ) - - def to_cli_preprocess_str( - self, - variable: Optional[str] = None, - run: Optional[str] = None, - city: Optional[str] = None, - calib_start: Optional[DateType] = None, - calib_end: Optional[DateType] = None, - valid_start: Optional[DateType] = None, - valid_end: Optional[DateType] = None, - ) -> str: - """Generate a command line interface str as a test example. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> config.to_cli_preprocess_str() == CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT - True - >>> CLI_PREPROCESS_DEFAULT_COMMAND_STR_CORRECT[:96] #doctest: +ELLIPSIS - 'python preprocess_data.py --mod /.../CPM/Manchester' - """ - return " ".join( - self.to_cli_preprocess_tuple_strs( - variable=variable, - run=run, - city=city, - calib_start=calib_start, - calib_end=calib_end, - valid_start=valid_start, - valid_end=valid_end, - ) - ) - - def yield_mod_folder( - self, city: Optional[str] = None - ) -> Generator[Path, None, None]: - """`Iterable` of all `Path`s in `self.mod_folder`. - - Example - ------- - >>> if not is_climate_data_mounted: - ... pytest.skip('requires linux server mount paths') - >>> config: RunConfig = RunConfig() - >>> len(tuple(config.yield_mod_folder())) == MOD_FOLDER_FILES_COUNT_CORRECT - True - """ - city = city if city else self.city - return path_iterdir(self.obs_path(city=city)) - - def yield_obs_folder( - self, city: Optional[str] = None - ) -> Generator[Path, None, None]: - """`Iterable` of all `Path`s in `self.obs_folder`. - - Example - ------- - >>> if not is_climate_data_mounted: - ... 
pytest.skip('requires linux server mount paths') - >>> config: RunConfig = RunConfig() - >>> len(tuple(config.yield_obs_folder())) == OBS_FOLDER_FILES_COUNT_CORRECT - True - """ - city = city if city else self.city - return path_iterdir(self.obs_path(city=city)) - - def yield_preprocess_out_folder( - self, - city: Optional[str] = None, - run: Optional[str] = None, - variable: Optional[str] = None, - ) -> Generator[Path, None, None]: - """`Iterable` of all `Path`s in `self.preprocess_out_folder`. - - Example - ------- - >>> if not is_climate_data_mounted: - ... pytest.skip('requires linux server mount paths') - >>> config: RunConfig = RunConfig() - >>> (len(tuple(config.yield_preprocess_out_folder())) == - ... PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT) - True - """ - city = city if city else self.city - run = run if run else self.run - variable = variable if variable else self.variable - return path_iterdir( - self.preprocess_out_path(city=city, run=run, variable=variable) - ) - - @property - def command_path(self) -> Path: - """Return command path relative to running tests.""" - return (Path() / self.command_dir).absolute() - - def to_cli_run_cmethods_tuple( - self, - city: Optional[str] = None, - run: Optional[str] = None, - variable: Optional[str] = None, - method: Optional[str] = None, - input_data_path: Optional[PathLike] = None, - cmethods_out_path: Optional[PathLike] = None, - processors: Optional[int] = None, - ) -> tuple[Union[str, PathLike], ...]: - """Generate a `tuple` of `str` for a command line command. - - Note - ---- - This will leave `Path` objects uncoverted. See - `self.to_cli_run_cmethods_tuple_strs` for passing to a terminal. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> command_str_tuple: tuple[str, ...] = config.to_cli_run_cmethods_tuple() - >>> assert command_str_tuple == CLI_CMETHODS_DEFAULT_COMMAND_TUPLE_CORRECT - """ - city = city if city else self.city - variable = variable if variable else self.variable - run = run if run else self.run - method = method if method else self.method - processors = processors if processors else self.processors - - input_data_path = ( - input_data_path - if input_data_path - else self.preprocess_out_path(city=city, run=run, variable=variable) - ) - - cmethods_out_path = ( - cmethods_out_path - if cmethods_out_path - else self.cmethods_out_path(city=city, run=run) - ) - - return ( - *self.run_prefix_tuple, - self.run_cmethods_file, - "--input_data_folder", - input_data_path, - "--out", - cmethods_out_path, - "--method", - method, - "-v", - variable, - "-p", - processors, - ) - - def to_cli_run_cmethods_tuple_strs( - self, - city: Optional[str] = None, - run: Optional[str] = None, - variable: Optional[str] = None, - method: Optional[str] = None, - input_data_path: Optional[PathLike] = None, - cmethods_out_path: Optional[PathLike] = None, - processors: Optional[int] = None, - ) -> tuple[str, ...]: - """Generate a command line interface `str` `tuple` a test example. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> command_str_tuple: tuple[str, ...] 
= config.to_cli_run_cmethods_tuple_strs() - >>> assert command_str_tuple == CLI_CMEHTODS_DEFAULT_COMMAND_TUPLE_STR_CORRECT - """ - return iter_to_tuple_strs( - self.to_cli_run_cmethods_tuple( - city=city, - run=run, - variable=variable, - method=method, - input_data_path=input_data_path, - cmethods_out_path=cmethods_out_path, - processors=processors, - ) - ) - - def to_cli_run_cmethods_str( - self, - city: Optional[str] = None, - run: Optional[str] = None, - variable: Optional[str] = None, - method: Optional[str] = None, - input_data_path: Optional[PathLike] = None, - cmethods_out_path: Optional[PathLike] = None, - processors: Optional[int] = None, - ) -> str: - """Generate a command line interface str as a test example. - - Example - ------- - >>> config: RunConfig = RunConfig() - >>> config.to_cli_run_cmethods_str() == CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT - True - >>> CLI_CMETHODS_DEFAULT_COMMAND_STR_CORRECT #doctest: +ELLIPSIS - 'python run_cmethods.py...--method quantile_delta_mapping...' - """ - return " ".join( - self.to_cli_run_cmethods_tuple_strs( - city=city, - run=run, - variable=variable, - method=method, - input_data_path=input_data_path, - cmethods_out_path=cmethods_out_path, - processors=processors, - ) - ) - @pytest.fixture def run_config(tmp_path: Path) -> RunConfig: From 7bd77872ae6051ba0426c02a818f75b350a40d8a Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 9 Dec 2023 12:50:21 +0000 Subject: [PATCH 40/83] feat: add `RuthUpdates...Rmd` notebook --- ...es_Bias_Assessment_correction_markdown.Rmd | 849 ++++++++++++++++++ 1 file changed, 849 insertions(+) create mode 100644 notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd diff --git a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd new file mode 100644 index 00000000..8519512c --- /dev/null +++ b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd @@ -0,0 +1,849 @@ +--- +title: "Bias correction assessment" +author: "Ruth Bowyer" +date: "`r format(Sys.Date())`" +output: + html_document: + theme: cosmo + toc: TRUE + toc_float: TRUE + toc_depth: 4 + code_folding: hide + df_print: paged +params: + ask: false +--- + + +```{r libs and setup, message=FALSE, warning=F} +rm(list=ls()) + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") + +# Add in the automatic installation process + +library(ggplot2) +library(terra) +library(tmap) #pretty maps +library(RColorBrewer) +library(tidyverse) +library(kableExtra) +install.packages("ncdf4") +library(ncdf4) + +if (!require(devtools)) install.packages("devtools") + +library(devtools) +install_github("hzambran/hydroTSM") +install_github("hzambran/hydroGOF") +``` + + +## **0. About** + +This is an example notebook for the assessment of bias corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'. 
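+
+As a minimal sketch of the kind of input handled below (assuming a debiased `.nc` file produced by the python pipeline; the file picked here is simply whichever is listed first), each bias-corrected output is read as a raster and also kept as a dataframe:
+
+```{r input sketch, eval=FALSE}
+library(terra)
+#Illustrative only - the data loading chunk below builds these objects for all runs and methods
+example.file <- list.files("/mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/Glasgow/05/tasmax",
+                           pattern="debiased*", full.names=T)[1]
+r.example <- rast(example.file)
+df.example <- as.data.frame(r.example, xy=T) #cells x time, keeping the x/y coordinates
+```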
+
+**Input data**
+
+This script requires the following data:
+
+- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction
+- 'obs.val' - as above, for the *validation* period
+- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period
+- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period
+- 'cpm.val.raw' - the raw (uncorrected) data for the *validation* period
+- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *validation* period
+- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional)
+- 'cpm.proj.adj' - the adjusted (bias-corrected) data for the *future/projected* period (optional)
+
+The data is required in both raster and dataframe formats.
+
+**Calibration vs Validation dates**
+
+The calibration period runs from 01-12-1980 to the day prior to 01-12-2010.
+The validation period runs from 01-12-2010 to the day prior to 01-12-2020.
+
+```{r data loading, include=FALSE}
+
+#This chunk converts the python output data to the form this script uses. It could (and probably should) be moved to a source script -- the R pre-processing should probably also be moved to the bias correction script.
+
+dd <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script
+
+city <- "Glasgow"
+var <- "tasmax"
+runs <- c("05", "06", "07", "08")
+
+
+## Load a source raster to extract the crs
+r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1]
+rast <- rast(r)
+crs <- crs(rast)
+
+####### PYTHON INPUTS HERE ######
+# if(input=="raster"){
+# This script uses both the raster data and the dataframe data
+# This script uses lists to group everything by run
+# Therefore what is required here is to create a list object for each of the datasets listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run)
+# .nc and .tif files can be read with rast("path/to/file.nc")
+# Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is to loop over the files with lapply
+#dfs are assumed to be cells x time
+
+rd.python <- "Debiased/three.cities.cropped/"
+dd.python <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script
+#ncfname <- paste(dd, rd, city, runs[1], var, ".nc", sep="")
+
+r.python <- lapply(runs, function(i){
+  fp <- paste0(dd.python, rd.python, "/Glasgow/", i, "/tasmax")
+  list.files(fp, pattern="debiased*", full.names = T)})
+
+
+val_py <- lapply(1:4, function(x){
+  xx <- r.python[[x]]
+  L <- lapply(1:4, function(i){
+    rp <- xx[[i]]
+    r <- rast(rp)
+    df <- as.data.frame(r, xy=T)
+    r <- rast(df, type="xyz")
+    crs(r) <- crs
+    dfr <- list(df, r)
+    names(dfr) <- c("df", "rast")
+    return(dfr)
+  })
+  names(L) <- c("delta_method", "quantile_delta", "quantile", "var_scaling")
+  return(L)
+})
+
+names(val_py) <- paste0("python_runs", runs)
+
+# } else if (input=="RDS"){
+### This R bit is a bit convoluted because of the format output from the bias correction - at some point it should be cleaned up and moved to a different script.
+
+
+## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed).
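+# For orientation (a sketch, not run here): a single run's RDS can be read and inspected before looping;
+# its element names should include t.obs, val.df, t.cal, t.proj, qm1.hist and qm1.val.proj, as used below
+# one.run <- readRDS(list.files(paste0(dd, "Debiased/R/QuantileMapping/three.cities/", city), full.names=T)[1])
+# names(one.run)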
+rd <- "Debiased/R/QuantileMapping/three.cities/"
+
+files <- list.files(paste0(dd,rd,city),full.names=T)
+files.v <- files[grepl(var, files)]
+
+allruns <- lapply(files.v, readRDS)
+
+names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v)
+names(allruns) <- names
+
+#This was returned for ease where multiple runs have been looped to apply this particular function, but actually we don't need a copy for each run, nor this data in a list. Therefore:
+obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs))
+
+obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020
+
+cpm.cal.raw.df.L <- lapply(allruns, function(L){
+  as.data.frame(t(L[["t.cal"]]))
+  })
+
+#In the R script, the validation period is corrected together with the projected data - so it needs to be separated out (and transposed)
+cpm.val.raw.df.L <- lapply(allruns, function(L){
+  proj <- as.data.frame(t(L[["t.proj"]]))
+  val.end.date <- min(grep("20201201-", names(proj)))-1
+  cpm.val.raw.df <- proj[,1:val.end.date]
+})
+
+cpm.proj.raw.df.L <- lapply(allruns, function(L){
+  proj <- as.data.frame(t(L[["t.proj"]]))
+  val.end.date <- min(grep("20201201-", names(proj)))
+  cpm.proj.raw.df <- proj[,val.end.date:ncol(proj)]
+})
+
+cpm.cal.adj.df.L <- lapply(allruns, function(L){
+  adj <- as.data.frame(t(L[["qm1.hist"]]))
+})
+
+cpm.val.adj.df.L <- lapply(allruns, function(L){
+  proj <- as.data.frame(t(L[["qm1.val.proj"]]))
+  val.end.date <- min(grep("20201201-", names(proj)))-1
+  proj[,1:val.end.date]
+})
+
+cpm.proj.adj.df.L <- lapply(allruns, function(L){
+  proj <- as.data.frame(t(L[["qm1.val.proj"]]))
+  val.end.date <- min(grep("20201201-", names(proj)))
+  proj[,val.end.date:ncol(proj)]
+})
+
+## Convert to rasters -- requires creation of x and y cols from row names
+## For the comparison, just converting the observation and cpm data for the cal and val periods (ie not the projection datasets)
+
+obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){
+  rn <- row.names(i) #The rownames were saved as x_y coordinates
+  xi <- gsub("_.*", "", rn)
+  yi <- gsub(".*_", "", rn)
+  xy <- data.frame(x = xi, y = yi)
+  df <- cbind(xy, i)
+  r <- rast(df, type="xyz")
+  crs(r) <- crs
+  return(r)
+})
+
+names(obsrastL) <- c("obs.cal.rasts", "obs.val.rasts")
+list2env(obsrastL, .GlobalEnv)
+remove(obsrastL)
+
+list2rast <- list(cpm.cal.raw.df.L, cpm.cal.adj.df.L, cpm.val.raw.df.L, cpm.val.adj.df.L)
+
+rastsL <- lapply(list2rast, function(x){
+  allruns <- x
+  df.rL <- lapply(runs, function(i){
+    df <- allruns[[grep(i, names(allruns))]] #extract df based on run id
+    rn <- row.names(df) #The rownames were saved as x_y coordinates
+    xi <- gsub("_.*", "", rn)
+    yi <- gsub(".*_", "", rn)
+    xy <- data.frame(x = xi, y = yi)
+    df <- cbind(xy, df)
+    r <- rast(df, type="xyz")
+    crs(r) <- crs
+    return(r)
+  })
+  names(df.rL) <- runs
+  return(df.rL)
+})
+
+names(rastsL) <- c("cpm.cal.raw.rasts.L", "cpm.cal.adj.rasts.L", "cpm.val.raw.rasts.L", "cpm.val.adj.rasts.L")
+
+list2env(rastsL, .GlobalEnv)
+
+remove(rastsL)
+remove(list2rast)
+
+gc()
+
+# } else {
+# print("Invalid input")
+#}
+
+```
+
+
+## **1. Bias Correction Assessment: trends**
+
+A visual comparison of trends across the observation, raw and adjusted data for the same time period.
+
+### **1a. 
Raster comparison** + +Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days + +Adding in the city shapeoutline for prettier maps + +```{r} + +shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city, ".shp"))) + +``` + + + +#### **Day 1 - 1980-12-01 - calibration period ** {.tabset} + +##### Run05 + +```{r} + +tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") +``` + + + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[1]]) + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`07`[[1]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[1]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[1]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[1]]) + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### **Day 2 - 2008-08-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.cal.raw.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`05`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + 
+tm_shape(cpm.cal.raw.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`06`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`07`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.cal.rasts[[7081]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.raw.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.cal.adj.rasts.L$`08`[[7081]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + + +#### **Day 3 - 2015-05-01 - calibration period ** {.tabset} + +##### Run05 + +```{r, fig.show="hold", out.width="33%"} + + tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + + +tm_shape(cpm.val.raw.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`05`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + + +##### Run06 + +```{r, fig.show="hold", out.width="33%"} +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, Raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`06`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run07 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`07`[[1590]]) + + tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") +``` + + +##### Run08 + +```{r, fig.show="hold", out.width="33%"} + +tm_shape(obs.val.rasts[[1590]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.raw.rasts.L$`08`[[1590]]) + + tm_raster(title="CPM, raw") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +tm_shape(cpm.val.adj.rasts.L$`08`[[1590]]) + + 
tm_raster(title="CPM, bias-corrected") + + tm_layout(legend.outside = T) + + tm_shape(shape) + tm_borders(col="black") + +``` + +#### {-} + + +## **2. Bias Correction Assessment: Metrics** + +Using the validation data set for this + +#NOTE TO GRIFF - if the data has ran we can delete this bit and add in the new stuff + +```{r} + +hads <- obs.val.df[c(32:2551)] +raw.cpm_05 <- cpm.val.raw.df.L$Run05_tasmax[c(32:2551)] +R_quantile_05 <- cpm.val.adj.df.L$Run05_tasmax[c(32:2551)] + +py_quantile_05 <- val_py$python_runs05$quantile$df[c(361:2880)] +py_quantile_delta_05 <- val_py$python_runs05$quantile_delta$df[c(361:2880)] +py_delta_05 <- val_py$python_runs05$delta_method$df[c(361:2880)] +py_varscaling <- val_py$python_runs05$var_scaling$df[c(361:2880)] + +val.dfs <- list(hads, raw.cpm_05, R_quantile_05, py_quantile_05, py_quantile_delta_05, py_delta_05, py_varscaling) + +#Convert dfs to a vector +val.dfs.v <- lapply(val.dfs, function(d){ + #Convert to single vector + unlist(as.vector(d))}) + +val.dfs.v.df <- as.data.frame(val.dfs.v) +names(val.dfs.v.df) <- c("hads","cpm.raw_Run05", "R.quantile_Run05", "py.quantile_Run05", + "py.quantile_delta_Run05", "py.delta_Run05", "py.var_scaling_Run05") +``` + + +### **2a. Descriptive statistics** + +```{r descriptives validation} + +descriptives <- apply(val.dfs.v.df, 2, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) +}) + +descriptives <- descriptives %>% reduce(rbind) +row.names(descriptives) <- names(val.dfs.v.df) +d <- t(descriptives) + +d %>% + kable(booktabs = T) %>% + kable_styling() + +``` + + +#### Fig.Density plot of validation period + +**Note** - need to add back in some facetting to this fig + +```{r warning=F, message=F} +m <- reshape2::melt(val.dfs.v.df) + +ggplot(m, aes(value, fill=variable, colour=variable)) + + geom_density(alpha = 0.3, position="identity") + + theme_minimal() + + scale_fill_brewer(palette = "Set1") + + scale_color_brewer(palette = "Set1") + + facet_wrap(.~variable) + +``` + +#### Seasonal +#Note here: we can change the the 1:7 back to 1:9 if we have 10 years of data as a result of data rerun + +```{r} + +# Using the 360 year structure, derive row indexes for each of the seasons, assuming the first calendar date represents Dec 1st + +winter <- c(1:90) +for(i in 1:6){ + x <-1 + i*360 + y <-1 + i*360 + 90 #60 days is 3 months + winter <- c(winter, x:y) +} + +spring <- c(91:180) +for(i in 1:6){ + x <-91 + (i*360) + y <-91 + (i*360) + 90 #90 days is 3 months + sping <- c(spring, x:y) +} + +summer <- c(181:270) +for(i in 1:6){ + x <- 181 + (i*360) + y <- 181 + i*360 + 60 #60 days is 3 months + summer <- c(summer, x:y) +} + +autumn <- c(271:360) +for(i in 1:6){ + x <- 181 + (i*360) + y <- 181 + i*360 + 60 #60 days is 3 months + autumn <- c(autumn, x:y) +} + +seasons <- list(winter, spring, summer, autumn) + + +``` + + +```{r seasonal descriptives} + +seasonal.descriptives <- lapply(seasons, function(s){ + +#Convert dfs to a vector +df<- lapply(val.dfs, function(d){ + #Convert to single vector with just the seasonally defined columns + d <- d[,s] + unlist(as.vector(d))}) + +df <- as.data.frame(df) +names(df) <- c("hads","cpm.raw_Run05", "R.quantile_Run05", "py.quantile_Run05", + "py.quantile_delta_Run05", "py.delta_Run05", "py.var_scaling_Run05") + + + descriptives <- apply(df, 2, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), min = min(x), 
per10th=per$X10.,per90th=per$X90., max = max(x)) + }) + + descriptives <- descriptives %>% reduce(rbind) + row.names(descriptives) <- names(df) + d <- t(descriptives) +}) + + +``` + +#### Winter + +```{r} +seasonal.descriptives[[1]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + +#### Spring + +```{r} +seasonal.descriptives[[2]] %>% + kable(booktabs = T) %>% + kable_styling() + + +#### Summer + +```{r} +seasonal.descriptives[[3]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + +#### Autumn + +```{r} +seasonal.descriptives[[4]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + + +### **2b. Model fit statistics** + +Using the following to assess overall fit: + +- **R-squared (rsq)** +- **Root Square Mean Error (RMSE)** +- **Nash-Sutcliffe Efficiency (NSE):** Magnitude of residual variance compared to measured data variance, ranges -∞ to 1, 1 = perfect match to observations +- **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias. + +```{r rsq} +actual <- val.dfs.v.df$hads + +rsq <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + cor(actual, x)^2 +}) + +``` + +```{r rmse} + +rmse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + sqrt(mean((actual - x)^2)) +}) + +``` + +```{r pbias} + +pbias <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::pbias(x, actual) +}) + +``` + +```{r nse} +nse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::NSE(x, actual) +}) + +``` + +Highlighting the bias corrected statistics + +```{r} + +k <- cbind(rsq, rmse, pbias, nse) +k %>% + kable(booktabs = T) %>% + kable_styling() %>% + row_spec(grep(".bc.",row.names(k)), background = "lightgrey") + +``` +#### Seasonal + +```{r rsq} + + +seasonal.model.stats <- lapply(seasons, function(s){ + + #Convert dfs to a vector + df<- lapply(val.dfs, function(d){ + + #Convert to single vector with just the seasonally defined columns + d <- d[,s] + unlist(as.vector(d))}) + + df <- as.data.frame(df) + names(df) <- c("hads","cpm.raw_Run05", "R.quantile_Run05", "py.quantile_Run05", + "py.quantile_delta_Run05", "py.delta_Run05", "py.var_scaling_Run05") + + actual <- df$hads + + rsq <- sapply(df[c(2:ncol(df))], function(x){ + cor(actual, x)^2 + }) + + + rmse <- sapply(df[c(2:ncol(df))], function(x){ + sqrt(mean((actual - x)^2)) + }) + + + pbias <- sapply(df[c(2:ncol(df))], function(x){ + hydroGOF::pbias(x, actual) + }) + + + nse <- sapply(df[c(2:ncol(df))], function(x){ + hydroGOF::NSE(x, actual) + }) + + k <- cbind(rsq, rmse, pbias, nse)}) + +``` + +Highlighting the bias corrected statistics + +#### Winter + +```{r} + +seasonal.model.stats[[1]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + +#### Spring + +```{r} + +seasonal.model.stats[[2]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + +#### Summer + +```{r} + +seasonal.model.stats[[3]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + + +#### Autumn + +```{r} + +seasonal.model.stats[[4]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + +## **3. 
Bias Correction Assessment: Metric specific - tasmax** + +### **3b Days above 30 degrees** + +(Not considered consecutively here) + +```{r eval=FALSE, include=FALSE} + +### Ruth to update + +val.dfs.v.df$year <- substr(row.names(val.dfs.v.df), 8,11) + +over30 <- lapply(names(val.dfs.v.df), function(i){ + x <- val.dfs.v.df[,i] + df <- aggregate(x, list(val.dfs.v.df$year), function(x){sum(x>=30)}) + names(df) <- c("year", paste0("Days.over.30.", i)) + return(df) +}) + +over30 %>% reduce(left_join, "year") +``` + + + From 55ecdf8bf23355fc4130651e81127d89bf4e1577 Mon Sep 17 00:00:00 2001 From: Grigorios Mingas Date: Sun, 10 Dec 2023 20:01:46 +0000 Subject: [PATCH 41/83] Automate installation of packages --- ...uthUpdates_Bias_Assessment_correction_markdown.Rmd | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd index 8519512c..db60ff8a 100644 --- a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd +++ b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd @@ -18,21 +18,24 @@ params: ```{r libs and setup, message=FALSE, warning=F} rm(list=ls()) -knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") +# install packages +list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", "kableExtra", "ncdf4", "knitr") +new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages) -# Add in the automatic installation process +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") +# import packages library(ggplot2) library(terra) library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) library(kableExtra) -install.packages("ncdf4") library(ncdf4) +library(devtools) if (!require(devtools)) install.packages("devtools") - library(devtools) install_github("hzambran/hydroTSM") install_github("hzambran/hydroGOF") From eec0e56c2f2bab5b8ee086cc600c824092fcdb92 Mon Sep 17 00:00:00 2001 From: Grigorios Mingas Date: Sun, 10 Dec 2023 20:11:40 +0000 Subject: [PATCH 42/83] Minor modifications in install and import order --- .../RuthUpdates_Bias_Assessment_correction_markdown.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd index db60ff8a..62f1f373 100644 --- a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd +++ b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd @@ -22,6 +22,10 @@ rm(list=ls()) list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", "kableExtra", "ncdf4", "knitr") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) +if (!require(devtools)) install.packages("devtools") +library(devtools) +install_github("hzambran/hydroTSM") +install_github("hzambran/hydroGOF") knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") @@ -35,10 +39,6 @@ library(kableExtra) library(ncdf4) library(devtools) -if (!require(devtools)) install.packages("devtools") -library(devtools) -install_github("hzambran/hydroTSM") -install_github("hzambran/hydroGOF") ``` From 
ad405cf78c95eb208f834aa64d993c6a51d14cea Mon Sep 17 00:00:00 2001 From: Grigorios Mingas Date: Sun, 10 Dec 2023 20:14:02 +0000 Subject: [PATCH 43/83] Remove duplicate library call --- .../RuthUpdates_Bias_Assessment_correction_markdown.Rmd | 1 - 1 file changed, 1 deletion(-) diff --git a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd index 62f1f373..2fd160da 100644 --- a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd +++ b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd @@ -23,7 +23,6 @@ list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", " new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] if(length(new.packages)) install.packages(new.packages) if (!require(devtools)) install.packages("devtools") -library(devtools) install_github("hzambran/hydroTSM") install_github("hzambran/hydroGOF") From 98f03cb78e6c918b3a3a022e109a612c374e695f Mon Sep 17 00:00:00 2001 From: Ruth Bowyer <105492883+RuthBowyer@users.noreply.github.com> Date: Sun, 10 Dec 2023 20:42:20 +0000 Subject: [PATCH 44/83] Adding in the index to look up Testing for conflicts --- ...es_Bias_Assessment_correction_markdown.Rmd | 169 +++++++++--------- 1 file changed, 85 insertions(+), 84 deletions(-) diff --git a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd index 2fd160da..479d2d29 100644 --- a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd +++ b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd @@ -18,53 +18,45 @@ params: ```{r libs and setup, message=FALSE, warning=F} rm(list=ls()) -# install packages -list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", "kableExtra", "ncdf4", "knitr") -new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] -if(length(new.packages)) install.packages(new.packages) -if (!require(devtools)) install.packages("devtools") -install_github("hzambran/hydroTSM") -install_github("hzambran/hydroGOF") - -knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") +# Add in the automatic installation process -# import packages library(ggplot2) library(terra) library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) library(kableExtra) +install.packages("ncdf4") library(ncdf4) -library(devtools) +if (!require(devtools)) install.packages("devtools") + +library(devtools) +install_github("hzambran/hydroTSM") +install_github("hzambran/hydroGOF") ``` ## **0. About** -This is an example notebook for the assessment of bias corrected data, using output from the R 'qmap' package for the city of Glasgow and the variable 'tasmax'. - -**Input data** - -This script requires the following data: +This is an example notebook for the assessment of bias corrected data, comparing it with our differing outputs from different methods across R and python. 
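The package-handling chunks in the last few commits alternate between plain `library()` calls and a conditional install of missing packages. A minimal consolidated sketch of that install-on-demand pattern is below; the helper name `install_if_missing` is illustrative only, and GitHub-only packages such as `hydroTSM`/`hydroGOF` would still be installed via `devtools::install_github()` as in the chunk above.

```r
# Sketch only - not part of the committed notebook.
# Installs any missing CRAN packages, then attaches all of them.
install_if_missing <- function(pkgs) {
  missing <- pkgs[!pkgs %in% installed.packages()[, "Package"]]
  if (length(missing) > 0) install.packages(missing)
  invisible(lapply(pkgs, library, character.only = TRUE))
}

# Example usage with the packages this notebook loads:
# install_if_missing(c("ggplot2", "terra", "tmap", "RColorBrewer",
#                      "tidyverse", "kableExtra", "ncdf4", "knitr"))
```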
-- 'obs.cal' - observation (HADs data) for the *calibration* period - the dataset used as the reference dataset in the bias correction -- 'obs.val' - as above, for the *validation* period -- 'cpm.cal.raw' - the raw (uncorrected) data for the *calibration* period -- 'cpm.cal.adj' - the adjusted (bias-corrected) data for the *calibration* period -- 'cpm.val.raw' - the raw (uncorrected) data for the *valibration* period -- 'cpm.val.adj' - the adjusted (bias-corrected) data for the *valibration* period -- 'cpm.proj.raw' - the raw (uncorrected) data for the *future/projected* period (optional) -- 'cpm.proj.radj' - the adjusted (bias-corrected) data for the *future/projected* period (optional) - -The data is required in raster format and dataframe formats **Calibration vs Validation dates** +We have used split sample testing of 30 years : 10 years for calibration and validation. + The calibration period runs between 01-12-1980 to the day prior to 01-12-2010 The validation period runs between 01-12-2010 to the day prior to 01-12-2020 + +**Data used in this script:** + +Here we load data from **validation** period + +The dates have been aligned manually and represent xx dys whilst we fix a bug + + ```{r data loading, include=FALSE} #This chunk attempts to apply the conversion to python output data to a form that this script will also use. This could (and probably should) be moved to a source script -- also the R pre-processing should probably be moved to the bias correction script? @@ -76,26 +68,25 @@ var <- "tasmax" runs <- c("05", "06", "07", "08") -## Load a source raster to extract the crs +## Load a source raster to extract the crs as this sometimes fails between python and R r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1] rast <- rast(r) crs <- crs(rast) -####### PYTHON INPUTS HERE ###### -# if(input=="raster"){ +####### PYTHON INPUTS ###### # This script uses both raster data and the raw data # This script uses Lists to group everything by runs # Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) # .nc and .tif files can be read with rast("path/to/file.nc") # Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files #dfs are assumed to be cells x time +#/vmfileshare/ClimateData/Debiased/three.cities.cropped/workshop/ -rd.python <- "Debiased/three.cities.cropped/" +rd.python <- "Debiased/three.cities.cropped/workshop/" dd.python <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script -#ncfname <- paste(dd, rd, city, runs[1], var, ".nc", sep="") r.python <- lapply(runs, function(i){ - fp = paste0(dd.python, rd.python, "/Glasgow/", i, "/tasmax") + fp = paste0(dd.python, rd.python, city,"/", i, "/", var) list.files(fp, pattern="debiased*", full.names = T)}) @@ -111,15 +102,15 @@ val_py <- lapply(1:4, function(x){ names(dfr) <- c("df", "rast") return(dfr) }) - names(L) <- c("delta_method", "quantile_delta", "quantile", "var_scaling") + names(L) <- c("py.delta_method", "py.quantile_delta", "py.quantile", "py.var_scaling") return(L) }) names(val_py) <- paste0("python_runs", runs) -# } else if (input=="RDS"){ -### This R bit is a bit crazy because of the format output from the bias correction - at some point to be cleaned up and moved to a different script. 
+``` +```{r} ## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). rd <- "Debiased/R/QuantileMapping/three.cities/" @@ -133,30 +124,20 @@ names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) names(allruns) <- names #This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. Therefore: -obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) - -obs.val.df <- allruns[[1]]$val.df[c(1:3601)] #To run until 30th Nov 2020 - -cpm.cal.raw.df.L <- lapply(allruns, function(L){ - as.data.frame(t(L[["t.cal"]])) - }) +#obs.cal.df <- as.data.frame(t(allruns[[1]]$t.obs)) +obs.val.df <- allruns[[1]]$val.df #To run between 1st Dec 2010 and 30th Nov 2020 +obs.val.df <- obs.val.df[c(1:3600)] +obs.val.df <- obs.val.df[,-removethisindex] #In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) cpm.val.raw.df.L <- lapply(allruns, function(L){ proj <- as.data.frame(t(L[["t.proj"]])) - val.end.date <- min(grep("20201201-", names(proj)))-1 + val.end.date <- min(grep("20201201-", names(proj)))-1 cpm.val.raw.df <- proj[,1:val.end.date] + cpm.val.raw.df <- cpm.val.raw.df[,!removethisindex] }) -cpm.proj.raw.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["t.proj"]])) - val.end.date <- min(grep("20201201-", names(proj))) - cpm.val.raw.df <- proj[,val.end.date:ncol(proj)] -}) -cpm.cal.adj.df.L <- lapply(allruns, function(L){ - adj <- as.data.frame(t(L[["qm1.hist"]])) -}) cpm.val.adj.df.L <- lapply(allruns, function(L){ proj <- as.data.frame(t(L[["qm1.val.proj"]])) @@ -164,16 +145,16 @@ cpm.cal.adj.df.L <- lapply(allruns, function(L){ proj[,1:val.end.date] }) - cpm.proj.adj.df.L <- lapply(allruns, function(L){ - proj <- as.data.frame(t(L[["qm1.val.proj"]])) - val.end.date <- min(grep("20201201-", names(proj))) - proj[,val.end.date:ncol(proj)] -}) +# cpm.proj.adj.df.L <- lapply(allruns, function(L){ + # proj <- as.data.frame(t(L[["qm1.val.proj"]])) +# val.end.date <- min(grep("20201201-", names(proj))) +# proj[,val.end.date:ncol(proj)] +#}) ## Convert to rasters --requires creation of x and y cols from row names ## For the comparison, just converting the observation and cpm for the cal and val perios (ie not the projection datasets) -obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){ +i <- obs.val.df rn <- row.names(i) #The rownames were saves as x_y coordinates xi <- gsub("_.*", "", rn) yi <- gsub(".*_", "", rn) @@ -181,14 +162,11 @@ obsrastL <- lapply(list(obs.cal.df, obs.val.df), function(i){ df <- cbind(xy, i) r <- rast(df, type="xyz") crs(r) <- crs - return(r) -}) +obs.val.rasts <- r + -names(obsrastL) <- c("obs.cal.rasts", "obs.val.rasts") -list2env(obsrastL, .GlobalEnv) -remove(obsrastL) -list2rast <- list(cpm.cal.raw.df.L, cpm.cal.adj.df.L, cpm.val.raw.df.L, cpm.val.adj.df.L) +list2rast <- list(cpm.val.raw.df.L, cpm.val.adj.df.L) rastsL <- lapply(list2rast, function(x){ allruns <- x @@ -216,10 +194,6 @@ remove(list2rast) gc() -# } else { -# print("Invalid input") -#} - ``` @@ -227,11 +201,11 @@ gc() ## **1. Bias Correction Assessment: trends** -An visual comparison of trends across observation, raw and adjusted data for the same time period +A visual comparison of trends across observation, raw and adjusted data for the same time period ### **1a. 
Raster comparison** -Random selection of 3 days of the observation, calibration and two adjusted cals, for three historic days +A visualisation across different runs and methods Adding in the city shapeoutline for prettier maps @@ -243,17 +217,52 @@ shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city -#### **Day 1 - 1980-12-01 - calibration period ** {.tabset} +#### **Compare across the same day** -##### Run05 +Here we take a day and visualise the differences between the methods and runs + +The example below is set up to compare Run 05, but Run 06, 07 or 08 can all be compared ```{r} -tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + - tm_raster(title="CPM, bias-corrected") + +t1 <- tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + + +t2 <- tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="CPM, raw (unadjusted)") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t3 <- tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="CPM, delta method, python") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t4 <- tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="CPM, quantile mapping, python") + tm_layout(legend.outside = T) + tm_shape(shape) + tm_borders(col="black") + + +t5 <- tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="CPM, quantile delta mapping, python") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t5 <- tm_shape(val_py$python_runs05$delta_method$rast$tasmax_3000) + + tm_raster(title="CPM, quantile delta mapping") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + ``` @@ -339,6 +348,8 @@ tm_shape(cpm.cal.adj.rasts.L$`08`[[1]]) + tm_raster(title="CPM, bias-corrected") ``` +#### {-} + #### **Day 2 - 2008-08-01 - calibration period ** {.tabset} ##### Run05 @@ -522,19 +533,9 @@ tm_shape(cpm.val.adj.rasts.L$`08`[[1590]]) + Using the validation data set for this -#NOTE TO GRIFF - if the data has ran we can delete this bit and add in the new stuff ```{r} -hads <- obs.val.df[c(32:2551)] -raw.cpm_05 <- cpm.val.raw.df.L$Run05_tasmax[c(32:2551)] -R_quantile_05 <- cpm.val.adj.df.L$Run05_tasmax[c(32:2551)] - -py_quantile_05 <- val_py$python_runs05$quantile$df[c(361:2880)] -py_quantile_delta_05 <- val_py$python_runs05$quantile_delta$df[c(361:2880)] -py_delta_05 <- val_py$python_runs05$delta_method$df[c(361:2880)] -py_varscaling <- val_py$python_runs05$var_scaling$df[c(361:2880)] - val.dfs <- list(hads, raw.cpm_05, R_quantile_05, py_quantile_05, py_quantile_delta_05, py_delta_05, py_varscaling) #Convert dfs to a vector @@ -668,7 +669,7 @@ seasonal.descriptives[[2]] %>% kable(booktabs = T) %>% kable_styling() - +``` #### Summer ```{r} From f7a280301834730cfce4c4ee82c2fa747bb61fd6 Mon Sep 17 00:00:00 2001 From: Grigorios Mingas Date: Sun, 10 Dec 2023 20:44:31 +0000 Subject: [PATCH 45/83] Recommit installation cell that was overwritten --- ...es_Bias_Assessment_correction_markdown.Rmd | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd index 479d2d29..89fe414d 100644 --- 
a/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd +++ b/notebooks/Assessing_bc_data/RuthUpdates_Bias_Assessment_correction_markdown.Rmd @@ -16,24 +16,29 @@ params: ```{r libs and setup, message=FALSE, warning=F} + rm(list=ls()) -# Add in the automatic installation process +# install packages +list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", "kableExtra", "ncdf4", "knitr") +new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages) +if (!require(devtools)) install.packages("devtools") +install_github("hzambran/hydroTSM") +install_github("hzambran/hydroGOF") + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") +# import packages library(ggplot2) library(terra) library(tmap) #pretty maps library(RColorBrewer) library(tidyverse) library(kableExtra) -install.packages("ncdf4") library(ncdf4) - -if (!require(devtools)) install.packages("devtools") - library(devtools) -install_github("hzambran/hydroTSM") -install_github("hzambran/hydroGOF") + ``` From 3b141c7102b05666f89990b72996885be40fb7ff Mon Sep 17 00:00:00 2001 From: Ruth Bowyer <105492883+RuthBowyer@users.noreply.github.com> Date: Sun, 10 Dec 2023 22:22:15 +0000 Subject: [PATCH 46/83] Final notebook that knits --- .../MethodsAssessment_DecWorkshop.Rmd | 617 ++++++++++++++++++ 1 file changed, 617 insertions(+) create mode 100644 notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd diff --git a/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd b/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd new file mode 100644 index 00000000..e72f3bc3 --- /dev/null +++ b/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd @@ -0,0 +1,617 @@ +--- +title: "Bias correction assessment" +author: "Ruth Bowyer" +date: "`r format(Sys.Date())`" +output: + html_document: + theme: cosmo + toc: TRUE + toc_float: TRUE + toc_depth: 4 + code_folding: hide + df_print: paged +params: + ask: false +--- + + +```{r libs and setup, message=FALSE, warning=F} + +rm(list=ls()) + +# install packages +list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", "kableExtra", "ncdf4", "knitr") +new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] +if(length(new.packages)) install.packages(new.packages) +if (!require(devtools)) install.packages("devtools") +install_github("hzambran/hydroTSM") +install_github("hzambran/hydroGOF") + +knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/") + +# import packages +library(ggplot2) +library(terra) +library(tmap) #pretty maps +library(RColorBrewer) +library(tidyverse) +library(kableExtra) +library(ncdf4) +library(devtools) + + +``` + + +## **0. About** + +This is an example notebook for the assessment of bias corrected data, comparing it with our differing outputs from different methods across R and python. + + +**Calibration vs Validation dates** + +We have used split sample testing of 30 years : 10 years for calibration and validation. + +The calibration period runs between 01-12-1980 to the day prior to 01-12-2010 +The validation period runs between 01-12-2010 to the day prior to 01-12-2020 + + +**Data used in this script:** + +Here we load data from **validation** period. 
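As a quick orientation (an illustrative check, not part of the committed notebook): the CPM runs are on a 360-day model calendar, so the 10-year validation window from 01-12-2010 to 30-11-2020 should contain 10 × 360 daily layers, which is where the `1:3600` column subset used further down comes from.

```r
# Illustrative only: expected length of the validation series on a 360-day calendar
n_years <- 10
n_expected_days <- n_years * 360
n_expected_days  # 3600, matching the obs.val.df[c(1:3600)] subset below
```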
+ +Unfortunetly, we have a bug in some of code that we are working to fix, meaning data bias corrected using python is missing 45 days over the 10 year validation period (0) +The dates have been aligned manually and represent 3555 days over the 10 year period whilst we fix the bug + + +```{r data loading, include=FALSE} + +dd <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script + +city <- "Glasgow" +var <- "tasmax" +runs <- c("05", "06", "07", "08") + + +## Load a source raster to extract the crs as this sometimes fails between python and R +r <- list.files(paste0(dd, "Reprojected_infill/UKCP2.2/tasmax/05/latest/"), full.names = T)[1] +rast <- rast(r) +crs <- crs(rast) + +####### PYTHON INPUTS ###### +# This script uses both raster data and the raw data +# This script uses Lists to group everything by runs +# Therefore what is require from this here is to create a list object for each of the sets of the data as listed above, where the list items are the rasters or dataframes by run (ie each level of the list is a run) +# .nc and .tif files can be read with rast("path/to/file.nc") +# Conversion to df is just as.data.frame(raster, xy=T) - easiest thing is just to loop using lapply the files +#dfs are assumed to be cells x time +#/vmfileshare/ClimateData/Debiased/three.cities.cropped/workshop/ + +rd.python <- "Debiased/three.cities.cropped/workshop/" +dd.python <- "/mnt/vmfileshare/ClimateData/" #Data directory of all data used in this script + +r.python <- lapply(runs, function(i){ + fp = paste0(dd.python, rd.python, city,"/", i, "/", var) + list.files(fp, pattern="debiased*", full.names = T)}) + + +val_py <- lapply(1:4, function(x){ + xx <- r.python[[x]] + L <- lapply(1:4, function(i){ + rp <- xx[[i]] + r <- rast(rp) + df <- as.data.frame(r, xy=T) + r <- rast(df, type="xyz") + df <- df[,3:ncol(df)] + crs(r) <- crs + dfr <- list(df,r) + names(dfr) <- c("df", "rast") + return(dfr) + }) + names(L) <- c("py.delta_method", "py.quantile_delta", "py.quantile", "py.var_scaling") + return(L) +}) + +names(val_py) <- paste0("python_runs", runs) + +``` + +```{r} + +## We've found a bug which we're working to fix in the python code, that means some days are dropped and some are duplicated. The below creates an index to drop those days in the R outputs and the observational data + +xx <- r.python[[1]] +rp <- xx[[1]] + +ncin <- nc_open(rp) +time <- ncvar_get(ncin, "time") + +missing <- c(0:3600) +tmissing <- missing %in% time + +removethisindex <- missing[tmissing==F] + 1 + +removethisindex2 <- removethisindex[-c(1, 4, 8, 12, 52, 20, 24, 29, 34, 38, 44, 48, 56)] + + +## The output created from the R bias correction framework is a list of dataframes containing all the data we need for this doc (although some are transposed). +rd <- "Debiased/R/QuantileMapping/three.cities/" + +files <- list.files(paste0(dd,rd,city),full.names=T) +files.v <- files[grepl(var, files)] + +allruns <- lapply(files.v, readRDS) + +names <- gsub(paste0(dd,rd,city,"|/|.RDS"),"",files.v) +names(allruns) <- names + +#This was returned for ease where multiple runs have been looped to apply this paritcular function, but actually we don't need a cope for each nor this data in a list. 
Therefore: + +obs.val.df <- allruns[[1]]$val.df #To run between 1st Dec 2010 and 30th Nov 2020 +obs.val.df <- obs.val.df[c(1:3600)] +obs.val.df <- obs.val.df[,-removethisindex2] + +#In the R scirpt, the validation is corrected with the projected data as well - so needs to be seperated out (and transposed) +cpm.val.raw.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["t.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + cpm.val.raw.df <- proj[,1:(val.end.date-1)] + cpm.val.raw.df <- cpm.val.raw.df[,-removethisindex2] +}) + + + +cpm.val.adj.df.L <- lapply(allruns, function(L){ + proj <- as.data.frame(t(L[["qm1.val.proj"]])) + val.end.date <- min(grep("20201201-", names(proj)))-1 + proj[,1:val.end.date] + cpm.val.adj.df <- proj[,1:(val.end.date-1)] + cpm.val.adj.df <- cpm.val.adj.df[,-removethisindex2] +}) + + +i <- obs.val.df + rn <- row.names(i) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, i) + r <- rast(df, type="xyz") + crs(r) <- crs +obs.val.rasts <- r + + + +list2rast <- list(cpm.val.raw.df.L, cpm.val.adj.df.L) + +rastsL <- lapply(list2rast, function(x){ + allruns <- x + df.rL <- lapply(runs, function(i){ + df <- allruns[[grep(i, names(allruns))]] #extract df based on run id + rn <- row.names(df) #The rownames were saves as x_y coordinates + xi <- gsub("_.*", "", rn) + yi <- gsub(".*_", "", rn) + xy <- data.frame(x = xi, y = yi) + df <- cbind(xy, df) + r <- rast(df, type="xyz") + crs(r) <- crs + return(r) + }) + names(df.rL) <- runs + return(df.rL) + }) + +names(rastsL) <- c("cpm.val.raw.rasts.L", "cpm.val.adj.rasts.L") + +list2env(rastsL, .GlobalEnv) + +remove(rastsL) +remove(list2rast) +remove(allruns) + +obsL <- list(obs.val.df, obs.val.rasts) +names(obsL) <- c("observation.df", "observation.rasts") + +cpm.raw <- lapply(1:4, function(i){ + rast1 <- cpm.val.raw.rasts.L[[i]] + df1 <- cpm.val.raw.df.L[[i]] + l <- list(rast1, df1) + names(l) <- c("rast", "df") + return(l) + }) + +names(cpm.raw) <- paste0("cpm_raw_Run", names(cpm.val.raw.rasts.L)) + +cpm.adj <- lapply(1:4, function(i){ + rast1 <- cpm.val.adj.rasts.L[[i]] + df1 <- cpm.val.adj.df.L[[i]] + l <- list(rast1, df1) + names(l) <- c("rast", "df") + return(l) + }) + +names(cpm.adj) <- paste0("cpm_R_quantilemapping_Run", names(cpm.val.adj.rasts.L)) +``` + +```{r} + +names(cpm.adj) <- paste0("r.quantile_Run", names(cpm.val.adj.rasts.L)) + +dataL <- c(obsL, cpm.raw, val_py, cpm.adj) +``` + +## **1. Bias Correction Assessment: trends** + +A visual comparison of trends across observation, raw and adjusted data for the same time period + +### **1a. 
Raster comparison** + +A visualisation across different runs and methods + +Adding in the city shapeoutline for prettier maps + +```{r} + +shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city, ".shp"))) + + +``` + + + +## **Compare across the same day** + +Here we take a day and visualise the differences between the methods and runs + +The example below is set up to compare Run 05, but Run 06, 07 or 08 can all be compared + +Choose a day between 1 - 3555 +You can change the Run in the code below + +```{r fig.height= 10} +day <- 1777 + +t1 <- tm_shape(dataL$observation.rasts[[day]]) + + tm_raster(title="Observation") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + + +t2 <- tm_shape(dataL$cpm_raw_Run05$rast[[day]]) + + tm_raster(title="Raw (unadjusted), Run 05") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t3 <- tm_shape(dataL$python_runs05$py.delta_method$rast[[day]]) + + tm_raster(title="Delta method, cmethods, Run05") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t4 <- tm_shape(dataL$python_runs05$py.quantile_delta$rast[[day]]) + + tm_raster(title="QDM, cmethods, Run05") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t5 <- tm_shape(dataL$python_runs05$py.var_scaling$rast[[day]]) + + tm_raster(title="Variance scaling, cmethods, Run05") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + +t6 <- tm_shape(dataL$python_runs05$py.quantile$rast[[day]]) + + tm_raster(title="Quantile mapping, cmethods, Run05") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + + +t7 <- tm_shape(dataL$r.quantile_Run05$rast[[day]]) + + tm_raster(title="NP quantile mapping, qmap, Run05") + + tm_layout(legend.outside = T) + + tm_shape(shape) + + tm_borders(col="black") + + +tmap_arrange(t1, t2, t3, t4, t5, t6, t7, nrow = 4) + +``` + + + +## **2. Bias Correction Assessment: Metrics** + +We use Run 05 again here, but you could change the run in the code below to see how it effects the outcome + + +```{r} + +val.dfs <- list(dataL$observation.df, + dataL$cpm_raw_Run05$df, + dataL$python_runs05$py.delta_method$df, + dataL$python_runs05$py.quantile_delta$df, + dataL$python_runs05$py.quantile$df, + dataL$python_runs05$py.var_scaling$df, + dataL$r.quantile_Run05$df) + +#Convert dfs to a vector +val.dfs.v <- lapply(val.dfs, function(d){ + #Convert to single vector + unlist(as.vector(d))}) + +val.dfs.v.df <- as.data.frame(val.dfs.v) + +names <- c("Obs","Raw_Run05", "Delta_mapping_cmethods_Run05", "QDM_cmethods_Run05", + "QM_cmethods_Run05","Var_scaling_cmethods_Run05", "QM_qmap_Run05") +names(val.dfs.v.df) <- names +``` + + +### **2a. 
Descriptive statistics** + +```{r descriptives validation} + +descriptives <- apply(val.dfs.v.df, 2, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) +}) + +descriptives <- descriptives %>% reduce(rbind) +row.names(descriptives) <- names(val.dfs.v.df) +d <- t(descriptives) + +d %>% + kable(booktabs = T) %>% + kable_styling() + +``` + + +#### **Fig.Density plot of validation period** + +**Note** - need to add back in some facetting to this fig + +```{r warning=F, message=F, fig.height=8} +m <- reshape2::melt(val.dfs.v.df) + +ggplot(m, aes(value, fill=variable, colour=variable)) + + geom_density(alpha = 0.3, position="identity") + + theme_minimal() + + scale_fill_brewer(palette = "Set1") + + scale_color_brewer(palette = "Set1") + + facet_wrap(.~variable) + +``` + +#### **Seasonal** + +```{r} + +# Using the 360 year structure, derive row indexes for each of the seasons, assuming the first calendar date represents Dec 1st + +winter <- c(1:90) +for(i in 1:8){ + x <-1 + i*360 + y <-1 + i*360 + 90 #60 days is 3 months + winter <- c(winter, x:y) +} + +winter <- winter[!winter%in%removethisindex2] + +spring <- c(91:180) +for(i in 1:8){ + x <-91 + (i*360) + y <-91 + (i*360) + 90 #90 days is 3 months + sping <- c(spring, x:y) +} +spring <- spring[!spring%in%removethisindex2] + +summer <- c(181:270) +for(i in 1:8){ + x <- 181 + (i*360) + y <- 181 + i*360 + 60 #60 days is 3 months + summer <- c(summer, x:y) +} + +summer <- summer[!summer%in%removethisindex2] + +autumn <- c(271:360) +for(i in 1:8){ + x <- 181 + (i*360) + y <- 181 + i*360 + 60 #60 days is 3 months + autumn <- c(autumn, x:y) +} + +autumn <- autumn[!autumn%in%removethisindex2] + +seasons <- list(winter, spring, summer, autumn) + + +``` + + +```{r seasonal descriptives} + +seasonal.descriptives <- lapply(seasons, function(s){ + +#Convert dfs to a vector +df<- lapply(val.dfs, function(d){ + #Convert to single vector with just the seasonally defined columns + d <- d[,s] + unlist(as.vector(d))}) + +df <- as.data.frame(df) + + names(df) <- names + + descriptives <- apply(df, 2, function(x){ + per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9)))) + data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x)) + }) + + descriptives <- descriptives %>% reduce(rbind) + row.names(descriptives) <- names(df) + d <- t(descriptives) +}) + + +``` + +##### **Winter** + +```{r} +seasonal.descriptives[[1]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + +##### **Spring** + +```{r} +seasonal.descriptives[[2]] %>% + kable(booktabs = T) %>% + kable_styling() + +``` +##### **Summer** + +```{r} +seasonal.descriptives[[3]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + +##### **Autumn** + +```{r} +seasonal.descriptives[[4]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + + +### **2b. Model fit statistics** + +Using the following to assess overall fit: + +- **R-squared (rsq)** +- **Root Square Mean Error (RMSE)** +- **Nash-Sutcliffe Efficiency (NSE):** Magnitude of residual variance compared to measured data variance, ranges -∞ to 1, 1 = perfect match to observations +- **Percent bias (PBIAS):** The optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation. Positive values indicate overestimation bias, whereas negative values indicate model underestimation bias. 
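For reference, these metrics are computed in the next chunk with `cor()` and the `hydroGOF` package; the formulas assumed here are the standard definitions, with $S_i$ the CPM series being assessed, $O_i$ the HADs observations and $n$ the number of cell-days:

$$
R^2 = \operatorname{cor}(S, O)^2, \qquad
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(S_i - O_i\right)^2}
$$

$$
\mathrm{NSE} = 1 - \frac{\sum_{i=1}^{n}\left(O_i - S_i\right)^2}{\sum_{i=1}^{n}\left(O_i - \bar{O}\right)^2}, \qquad
\mathrm{PBIAS} = 100 \times \frac{\sum_{i=1}^{n}\left(S_i - O_i\right)}{\sum_{i=1}^{n} O_i}
$$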
+ +```{r} +actual <- val.dfs.v.df$Obs + +rsq <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + cor(actual, x)^2 +}) + +rmse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + sqrt(mean((actual - x)^2)) +}) + + +pbias <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::pbias(x, actual) +}) + + +nse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){ + hydroGOF::NSE(x, actual) +}) + + +k <- cbind(rsq, rmse, pbias, nse) +k %>% + kable(booktabs = T) %>% + kable_styling() + +``` +#### **Seasonal** + +```{r} + + +seasonal.model.stats <- lapply(seasons, function(s){ + + #Convert dfs to a vector + df<- lapply(val.dfs, function(d){ + + #Convert to single vector with just the seasonally defined columns + d <- d[,s] + unlist(as.vector(d))}) + + df <- as.data.frame(df) + names(df) <- names + + actual <- df$Obs + + rsq <- sapply(df[c(2:ncol(df))], function(x){ + cor(actual, x)^2 + }) + + + rmse <- sapply(df[c(2:ncol(df))], function(x){ + sqrt(mean((actual - x)^2)) + }) + + + pbias <- sapply(df[c(2:ncol(df))], function(x){ + hydroGOF::pbias(x, actual) + }) + + + nse <- sapply(df[c(2:ncol(df))], function(x){ + hydroGOF::NSE(x, actual) + }) + + k <- cbind(rsq, rmse, pbias, nse)}) + +``` + +Highlighting the bias corrected statistics + +##### **Winter** + +```{r} + +seasonal.model.stats[[1]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + +##### **Spring** + +```{r} + +seasonal.model.stats[[2]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + +##### **Summer** + +```{r} + +seasonal.model.stats[[3]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + + + +##### **Autumn** + +```{r} + +seasonal.model.stats[[4]] %>% + kable(booktabs = T) %>% + kable_styling() +``` + From 76048fd7fdb481764f39859a06766b511c501a26 Mon Sep 17 00:00:00 2001 From: Ruth Bowyer <105492883+RuthBowyer@users.noreply.github.com> Date: Sun, 10 Dec 2023 22:27:03 +0000 Subject: [PATCH 47/83] adding html rendered notebook --- .../Bias correction assessment.html | 5458 +++++++++++++++++ .../MathJax.js | 19 + .../MethodsAssessment_DecWorkshop.Rmd | 1 - 3 files changed, 5477 insertions(+), 1 deletion(-) create mode 100644 notebooks/Assessing_bc_data/Bias correction assessment.html create mode 100644 notebooks/Assessing_bc_data/Bias correction assessment_files/MathJax.js diff --git a/notebooks/Assessing_bc_data/Bias correction assessment.html b/notebooks/Assessing_bc_data/Bias correction assessment.html new file mode 100644 index 00000000..2847aa0c --- /dev/null +++ b/notebooks/Assessing_bc_data/Bias correction assessment.html @@ -0,0 +1,5458 @@ + + + + + + + + + + + + + +Bias correction assessment + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
  • 0. About
  • 1. Bias Correction Assessment: trends
  • Compare across the same day
  • 2. Bias Correction Assessment: Metrics
+
+ +
+ + + + + + + +
rm(list=ls())
+
+# install packages
+list.of.packages <- c("ggplot2", "terra", "tmap", "RColorBrewer", "tidyverse", "kableExtra", "ncdf4", "knitr")
+new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
+if(length(new.packages)) install.packages(new.packages)
+if (!require(devtools)) install.packages("devtools")
+install_github("hzambran/hydroTSM")
+install_github("hzambran/hydroGOF")
+
+knitr::opts_knit$set(root.dir="/mnt/vmfileshare/ClimateData/")
+
+# import packages
+library(ggplot2)
+library(terra)
+library(tmap) #pretty maps
+library(RColorBrewer)
+library(tidyverse)
+library(kableExtra)
+library(ncdf4)
+library(devtools)
+
+

0. About

+

This is an example notebook for the assessment of bias corrected +data, comparing it with our differing outputs from different methods +across R and python.

+

Calibration vs Validation dates

+

We have used split sample testing of 30 years : 10 years for +calibration and validation.

+

The calibration period runs between 01-12-1980 to the day prior to +01-12-2010 The validation period runs between 01-12-2010 to the day +prior to 01-12-2020

+

Data used in this script:

+

Here we load data from validation period

+

The dates have been aligned manually and represent 3555 days whilst +we fix a bug

+
xx <- r.python[[1]]
+rp <- xx[[1]]
+
+ncin <- nc_open(rp)
+time <- ncvar_get(ncin, "time")
+
+missing <- c(0:3600)
+tmissing <- missing %in% time
+
+removethisindex <- missing[tmissing==F] + 1 
+
+removethisindex2 <- removethisindex[-c(1, 4, 8, 12, 52, 20, 24, 29, 34, 38, 44, 48, 56)]
+
+
## <environment: R_GlobalEnv>
+
remove(rastsL) 
+remove(list2rast)
+
+gc()
+
##            used (Mb) gc trigger  (Mb) max used  (Mb)
+## Ncells  3574880  191    6358087 339.6  6358087 339.6
+## Vcells 42063927  321   73809384 563.2 61441141 468.8
+
obsL <- list(obs.val.df, obs.val.rasts)
+names(obsL) <- c("observation.df", "observation.rasts")
+
+cpm.raw <- lapply(1:4, function(i){
+  rast1 <- cpm.val.raw.rasts.L[[i]]
+  df1 <- cpm.val.raw.df.L[[i]]
+  l <- list(rast1, df1)
+  names(l) <- c("rast", "df")
+  return(l)
+  })
+
+names(cpm.raw) <- paste0("cpm_raw_Run", names(cpm.val.raw.rasts.L))
+
+cpm.adj <- lapply(1:4, function(i){
+  rast1 <- cpm.val.adj.rasts.L[[i]]
+  df1 <- cpm.val.adj.df.L[[i]]
+  l <- list(rast1, df1)
+  names(l) <- c("rast", "df")
+  return(l)
+  })
+
+names(cpm.adj) <- paste0("cpm_R_quantilemapping_Run", names(cpm.val.adj.rasts.L))
+
names(cpm.adj) <- paste0("r.quantile_Run", names(cpm.val.adj.rasts.L))
+
+dataL <- c(obsL, cpm.raw, val_py, cpm.adj)
+
+ +
+

Compare across the same day

+

Here we take a day and visualise the differences between the methods +and runs

+

The example below is set up to compare Run 05, but Run 06, 07 or 08 +can all be compared

+

Choose a day between 1 - 3555 You can change the Run in the code +below

+
day <- 1777
+
+t1 <- tm_shape(dataL$observation.rasts[[day]]) + 
+  tm_raster(title="Observation") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+
+t2 <- tm_shape(dataL$cpm_raw_Run05$rast[[day]]) + 
+  tm_raster(title="Raw (unadjusted), Run 05") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+t3 <- tm_shape(dataL$python_runs05$py.delta_method$rast[[day]]) + 
+  tm_raster(title="Delta method, cmethods, Run05") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+t4 <- tm_shape(dataL$python_runs05$py.quantile_delta$rast[[day]]) + 
+  tm_raster(title="QDM, cmethods, Run05") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+t5 <- tm_shape(dataL$python_runs05$py.var_scaling$rast[[day]]) + 
+  tm_raster(title="Variance scaling, cmethods, Run05") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+t6 <- tm_shape(dataL$python_runs05$py.quantile$rast[[day]]) + 
+  tm_raster(title="Quantile mapping, cmethods, Run05") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+
+t7 <- tm_shape(dataL$r.quantile_Run05$rast[[day]]) + 
+  tm_raster(title="NP quantile mapping, qmap, Run05") + 
+  tm_layout(legend.outside = T) + 
+  tm_shape(shape) + 
+  tm_borders(col="black")
+
+
+tmap_arrange(t1, t2, t3, t4, t5, t6, t7,  nrow = 4)
+

+
+
+

2. Bias Correction Assessment: Metrics

+

We use Run 05 again here, but you could change the run in the code +below to see how it effects the outcome

+
val.dfs <- list(dataL$observation.df, 
+                dataL$cpm_raw_Run05$df,
+                dataL$python_runs05$py.delta_method$df,
+                dataL$python_runs05$py.quantile_delta$df,
+                dataL$python_runs05$py.quantile$df,
+                dataL$python_runs05$py.var_scaling$df,
+                dataL$r.quantile_Run05$df)
+
+#Convert dfs to a vector
+val.dfs.v <- lapply(val.dfs, function(d){
+  #Convert to single vector
+  unlist(as.vector(d))})
+
+val.dfs.v.df <- as.data.frame(val.dfs.v)
+
+names <-  c("Obs","Raw_Run05", "Delta_mapping_cmethods_Run05", "QDM_cmethods_Run05",
+                         "QM_cmethods_Run05","Var_scaling_cmethods_Run05", "QM_qmap_Run05")
+names(val.dfs.v.df) <- names
+
+

2a. Descriptive statistics

+
descriptives <- apply(val.dfs.v.df, 2, function(x){ 
+  per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9))))
+  data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x))
+})
+
+descriptives <- descriptives %>% reduce(rbind)
+row.names(descriptives) <- names(val.dfs.v.df)
+d <- t(descriptives)
+
+d %>% 
+  kable(booktabs = T) %>%
+  kable_styling()
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +Obs + +Raw_Run05 + +Delta_mapping_cmethods_Run05 + +QDM_cmethods_Run05 + +QM_cmethods_Run05 + +Var_scaling_cmethods_Run05 + +QM_qmap_Run05 +
+mean + +12.987187 + +12.454529 + +13.206697 + +14.354030 + +12.446912 + +13.6338065 + +13.546393 +
+sd + +5.379666 + +5.985046 + +5.580365 + +5.682610 + +5.983804 + +5.4667575 + +5.555303 +
+min + +-5.222516 + +-3.289649 + +-6.927810 + +-3.942330 + +-3.289649 + +-0.0068817 + +-2.261268 +
+per10th + +6.004220 + +4.400269 + +5.983167 + +6.595541 + +4.407593 + +6.4332728 + +6.496371 +
+per90th + +19.933914 + +20.196924 + +20.189136 + +21.603798 + +20.196924 + +20.7896622 + +20.831031 +
+max + +31.605967 + +32.398094 + +32.394508 + +33.386124 + +32.398094 + +34.7932587 + +34.132040 +
+
+

Fig.Density plot of validation period

+

Note - need to add back in some facetting to this +fig

+
m <- reshape2::melt(val.dfs.v.df)
+
+ggplot(m, aes(value, fill=variable, colour=variable)) + 
+  geom_density(alpha = 0.3, position="identity") + 
+  theme_minimal() +
+  scale_fill_brewer(palette = "Set1") +
+  scale_color_brewer(palette = "Set1") +
+  facet_wrap(.~variable)
+

+
+
+

Seasonal

+
# Using the 360-day year structure, derive row indexes for each of the seasons, assuming the first calendar date represents Dec 1st
+
+winter <- c(1:90)
+for(i in 1:8){
+  x <- 1 + i*360
+  y <- 90 + i*360 #each season is 90 days (3 months) in the 360-day calendar
+  winter <- c(winter, x:y)
+}
+
+winter <- winter[!winter%in%removethisindex2]
+
+spring <- c(91:180)
+for(i in 1:8){
+  x <- 91 + (i*360)
+  y <- 180 + (i*360)
+  spring <- c(spring, x:y)
+}
+spring <- spring[!spring%in%removethisindex2]
+
+summer <- c(181:270)
+for(i in 1:8){
+  x <- 181 + (i*360)
+  y <- 270 + (i*360)
+  summer <- c(summer, x:y)
+}
+
+summer <- summer[!summer%in%removethisindex2]
+
+autumn <- c(271:360)
+for(i in 1:8){
+  x <- 271 + (i*360)
+  y <- 360 + (i*360)
+  autumn <- c(autumn, x:y)
+}
+
+autumn <- autumn[!autumn%in%removethisindex2]
+
+seasons <- list(winter, spring, summer, autumn)
+
seasonal.descriptives <- lapply(seasons, function(s){
+  
+#Convert dfs to a vector
+df<- lapply(val.dfs, function(d){
+    #Convert to single vector with just the seasonally defined columns
+  d <- d[,s]
+  unlist(as.vector(d))})
+
+df <- as.data.frame(df)
+
+  names(df) <- names
+  
+  descriptives <- apply(df, 2, function(x){ 
+    per <- data.frame(as.list(quantile(x, probs=c(0.1, 0.9))))
+    data.frame(mean=mean(x), sd=sd(x), min = min(x), per10th=per$X10.,per90th=per$X90., max = max(x))
+  })
+
+  descriptives <- descriptives %>% reduce(rbind)
+  row.names(descriptives) <- names(df)
+  d <- t(descriptives)
+})
+
+
+

Winter

+
seasonal.descriptives[[1]] %>% 
+    kable(booktabs = T) %>%
+    kable_styling() 
|         | Obs | Raw_Run05 | Delta_mapping_cmethods_Run05 | QDM_cmethods_Run05 | QM_cmethods_Run05 | Var_scaling_cmethods_Run05 | QM_qmap_Run05 |
|---------|-----|-----------|------------------------------|--------------------|-------------------|----------------------------|---------------|
| mean    | 7.545045 | 6.531616 | 7.477307 | 8.740652 | 6.540570 | 8.3213606 | 8.140226 |
| sd      | 3.161591 | 3.510415 | 3.251183 | 3.544761 | 3.512938 | 3.0433078 | 3.162931 |
| min     | -5.222516 | -3.289649 | -6.927810 | -3.942330 | -3.289649 | -0.0068817 | -2.261268 |
| per10th | 3.395401 | 1.345557 | 3.180538 | 3.511754 | 1.329981 | 3.8597124 | 3.401121 |
| per90th | 11.369810 | 10.873218 | 11.344628 | 12.974035 | 10.862696 | 12.0620594 | 11.981927 |
| max     | 18.378738 | 14.865137 | 19.443403 | 16.748768 | 14.865137 | 16.5707417 | 15.496591 |
+
+
+

Spring

+
seasonal.descriptives[[2]] %>% 
+    kable(booktabs = T) %>%
+    kable_styling() 
|         | Obs | Raw_Run05 | Delta_mapping_cmethods_Run05 | QDM_cmethods_Run05 | QM_cmethods_Run05 | Var_scaling_cmethods_Run05 | QM_qmap_Run05 |
|---------|-----|-----------|------------------------------|--------------------|-------------------|----------------------------|---------------|
| mean    | 13.409216 | 11.0399991 | 13.767192 | 13.033342 | 10.9968695 | 12.956441 | 12.192291 |
| sd      | 3.698367 | 4.3338851 | 3.748335 | 4.149504 | 4.3766697 | 4.330499 | 3.934073 |
| min     | 2.120107 | 0.1720215 | 3.540063 | 2.777159 | 0.1720215 | 2.550656 | 1.889193 |
| per10th | 8.211545 | 5.6897948 | 8.923459 | 7.695422 | 5.4196288 | 7.495068 | 7.579992 |
| per90th | 18.027275 | 17.0698734 | 18.580704 | 18.692587 | 17.0698734 | 18.972151 | 17.462591 |
| max     | 21.960329 | 25.3600101 | 21.897438 | 26.726339 | 25.3600101 | 26.338200 | 26.581937 |
+
+
+

Summer

+
seasonal.descriptives[[3]] %>% 
+    kable(booktabs = T) %>%
+    kable_styling()
|         | Obs | Raw_Run05 | Delta_mapping_cmethods_Run05 | QDM_cmethods_Run05 | QM_cmethods_Run05 | Var_scaling_cmethods_Run05 | QM_qmap_Run05 |
|---------|-----|-----------|------------------------------|--------------------|-------------------|----------------------------|---------------|
| mean    | 18.885843 | 19.668851 | 19.414031 | 21.138834 | 19.673040 | 20.249657 | 20.356901 |
| sd      | 2.844915 | 2.860375 | 3.154909 | 2.730936 | 2.862330 | 3.078232 | 3.133506 |
| min     | 9.794850 | 9.882959 | 11.033133 | 12.156556 | 9.882959 | 9.117639 | 11.233149 |
| per10th | 15.730152 | 16.361451 | 15.923339 | 17.986867 | 16.331885 | 16.637978 | 16.800074 |
| per90th | 22.770690 | 23.189868 | 24.174515 | 24.504973 | 23.189868 | 24.042363 | 24.335204 |
| max     | 31.013290 | 32.398094 | 30.361607 | 33.386124 | 32.398094 | 34.793259 | 34.132040 |
+
+
+

Autumn

+
seasonal.descriptives[[4]] %>% 
+    kable(booktabs = T) %>%
+    kable_styling() 
|         | Obs | Raw_Run05 | Delta_mapping_cmethods_Run05 | QDM_cmethods_Run05 | QM_cmethods_Run05 | Var_scaling_cmethods_Run05 | QM_qmap_Run05 |
|---------|-----|-----------|------------------------------|--------------------|-------------------|----------------------------|---------------|
| mean    | 18.196291 | 18.681650 | 18.391090 | 20.213214 | 18.679398 | 19.268436 | 19.423338 |
| sd      | 3.564522 | 4.108216 | 4.271161 | 3.895637 | 4.131500 | 4.192679 | 4.152337 |
| min     | 1.543686 | 2.619775 | 2.361020 | 4.513280 | 2.619775 | 4.586860 | 4.605356 |
| per10th | 14.272142 | 14.116602 | 12.865782 | 15.899373 | 14.074073 | 14.308384 | 14.893791 |
| per90th | 22.662147 | 23.006763 | 24.122185 | 24.306695 | 23.006763 | 23.855662 | 24.109881 |
| max     | 31.013290 | 32.398094 | 30.361607 | 33.386124 | 32.398094 | 34.793259 | 34.132040 |
+
+
+
+

2b. Model fit statistics

+

Using the following to assess overall fit:

+
- R-squared (rsq)
- Root Mean Square Error (RMSE)
- Nash-Sutcliffe Efficiency (NSE): magnitude of the residual variance compared to the measured data variance; ranges from -∞ to 1, where 1 is a perfect match to observations
- Percent bias (PBIAS): the optimal value of PBIAS is 0.0, with low-magnitude values indicating accurate model simulation; positive values indicate overestimation bias, whereas negative values indicate model underestimation bias (NSE and PBIAS are sketched as formulas below)
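For reference, a minimal sketch of the NSE and PBIAS formulas assumed here (the `*_manual` helper names are illustrative and not part of the original analysis). They should agree with the `hydroGOF::NSE()` and `hydroGOF::pbias()` calls in the next chunk, except that `pbias()` also rounds its output (one decimal place in the tables below).

```r
# Hand-rolled equivalents of the hydroGOF metrics used below (sketch only);
# sim = raw or bias-corrected model series, obs = observations.
nse_manual <- function(sim, obs) {
  1 - sum((obs - sim)^2) / sum((obs - mean(obs))^2)
}

pbias_manual <- function(sim, obs) {
  100 * sum(sim - obs) / sum(obs)
}
```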
+
actual <- val.dfs.v.df$Obs
+
+rsq <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){
+  cor(actual, x)^2
+})
+
+rmse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){
+  sqrt(mean((actual - x)^2))
+})
+
+
+pbias <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){
+  hydroGOF::pbias(x, actual)
+})
+
+
+nse <- sapply(val.dfs.v.df[c(2:ncol(val.dfs.v.df))], function(x){
+  hydroGOF::NSE(x, actual)
+})
+
+
+k <- cbind(rsq, rmse, pbias, nse)
+k %>% 
+  kable(booktabs = T) %>%
+  kable_styling() 
|                              | rsq | rmse | pbias | nse |
|------------------------------|-----------|----------|-------|------------|
| Raw_Run05                    | 0.5547920 | 4.132913 | -4.1  | 0.4097943 |
| Delta_mapping_cmethods_Run05 | 0.5206012 | 4.099781 | 1.7   | 0.4192192 |
| QDM_cmethods_Run05           | 0.5431828 | 4.247295 | 10.5  | 0.3766733 |
| QM_cmethods_Run05            | 0.5499737 | 4.158476 | -4.2  | 0.4024705 |
| Var_scaling_cmethods_Run05   | 0.5406815 | 3.999291 | 5.0   | 0.4473414 |
| QM_qmap_Run05                | 0.5457176 | 3.995015 | 4.3   | 0.4485227 |
+
+

Seasonal

+
seasonal.model.stats <- lapply(seasons, function(s){
+  
+  #Convert dfs to a vector
+  df<- lapply(val.dfs, function(d){
+  
+  #Convert to single vector with just the seasonally defined columns
+  d <- d[,s]
+  unlist(as.vector(d))})
+
+  df <- as.data.frame(df)
+  names(df) <- names
+
+  actual <- df$Obs
+
+  rsq <- sapply(df[c(2:ncol(df))], function(x){
+    cor(actual, x)^2
+  })
+
+
+  rmse <- sapply(df[c(2:ncol(df))], function(x){
+    sqrt(mean((actual - x)^2))
+  })
+
+
+  pbias <- sapply(df[c(2:ncol(df))], function(x){
+    hydroGOF::pbias(x, actual)
+  })
+
+
+  nse <- sapply(df[c(2:ncol(df))], function(x){
+    hydroGOF::NSE(x, actual)
+  })
+
+  k <- cbind(rsq, rmse, pbias, nse)})
+

Highlighting the bias-corrected statistics
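One way to make that highlighting explicit in the rendered tables (not applied below) would be `kableExtra::row_spec()`; a sketch, assuming `kableExtra` is already loaded (as it is for `kable_styling()`) and that the bias-corrected methods sit in rows 2–6 of each seasonal table:

```r
# Shade every row except the raw run so the bias-corrected results stand out
seasonal.model.stats[[1]] %>%
  kable(booktabs = T) %>%
  kable_styling() %>%
  row_spec(2:6, background = "#F2F2F2")
```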

+
+
+

Winter

+
seasonal.model.stats[[1]]  %>% 
+  kable(booktabs = T) %>%
+  kable_styling() 
|                              | rsq | rmse | pbias | nse |
|------------------------------|-----------|----------|-------|------------|
| Raw_Run05                    | 0.0170801 | 4.521556 | -13.4 | -1.0453669 |
| Delta_mapping_cmethods_Run05 | 0.0007238 | 4.474042 | -0.9  | -1.0026056 |
| QDM_cmethods_Run05           | 0.0178275 | 4.582295 | 15.8  | -1.1006881 |
| QM_cmethods_Run05            | 0.0177739 | 4.514827 | -13.3 | -1.0392835 |
| Var_scaling_cmethods_Run05   | 0.0190308 | 4.147905 | 10.3  | -0.7212862 |
| QM_qmap_Run05                | 0.0180254 | 4.203405 | 7.9   | -0.7676566 |
+
+
+

Spring

+
seasonal.model.stats[[2]]  %>% 
+  kable(booktabs = T) %>%
+  kable_styling() 
|                              | rsq | rmse | pbias | nse |
|------------------------------|-----------|----------|-------|------------|
| Raw_Run05                    | 0.0597691 | 5.498464 | -17.7 | -1.2106695 |
| Delta_mapping_cmethods_Run05 | 0.0579244 | 4.602206 | 2.7   | -0.5487213 |
| QDM_cmethods_Run05           | 0.0590227 | 4.855678 | -2.8  | -0.7240140 |
| QM_cmethods_Run05            | 0.0581359 | 5.553689 | -18.0 | -1.2552995 |
| Var_scaling_cmethods_Run05   | 0.0634245 | 4.956393 | -3.4  | -0.7962737 |
| QM_qmap_Run05                | 0.0608302 | 4.843107 | -9.1  | -0.7150988 |
+
+
+

Summer

+
seasonal.model.stats[[3]]  %>% 
+  kable(booktabs = T) %>%
+  kable_styling() 
|                              | rsq | rmse | pbias | nse |
|------------------------------|-----------|----------|-------|------------|
| Raw_Run05                    | 0.0000367 | 4.097490 | 4.1   | -1.0744674 |
| Delta_mapping_cmethods_Run05 | 0.0642809 | 3.711397 | 2.8   | -0.7019451 |
| QDM_cmethods_Run05           | 0.0008483 | 4.591269 | 11.9  | -1.6045713 |
| QM_cmethods_Run05            | 0.0007039 | 4.163879 | 4.2   | -1.1422337 |
| Var_scaling_cmethods_Run05   | 0.0001434 | 4.431523 | 7.2   | -1.4264800 |
| QM_qmap_Run05                | 0.0000007 | 4.482345 | 7.8   | -1.4824546 |
+
+
+

Autumn

+
seasonal.model.stats[[4]]  %>% 
+  kable(booktabs = T) %>%
+  kable_styling() 
|                              | rsq | rmse | pbias | nse |
|------------------------------|-----------|----------|-------|------------|
| Raw_Run05                    | 0.1869855 | 4.141732 | 2.7   | -0.3501154 |
| Delta_mapping_cmethods_Run05 | 0.2960919 | 3.797036 | 1.1   | -0.1347399 |
| QDM_cmethods_Run05           | 0.1724539 | 4.518442 | 11.1  | -0.6068830 |
| QM_cmethods_Run05            | 0.1770343 | 4.197067 | 2.7   | -0.3864324 |
| Var_scaling_cmethods_Run05   | 0.1650665 | 4.392008 | 5.9   | -0.5182142 |
| QM_qmap_Run05                | 0.1603198 | 4.427219 | 6.7   | -0.5426548 |
+
+
+
+ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/notebooks/Assessing_bc_data/Bias correction assessment_files/MathJax.js b/notebooks/Assessing_bc_data/Bias correction assessment_files/MathJax.js new file mode 100644 index 00000000..792f7e45 --- /dev/null +++ b/notebooks/Assessing_bc_data/Bias correction assessment_files/MathJax.js @@ -0,0 +1,19 @@ +/* + * /MathJax.js + * + * Copyright (c) 2009-2017 The MathJax Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +if(document.getElementById&&document.childNodes&&document.createElement){if(!(window.MathJax&&MathJax.Hub)){if(window.MathJax){window.MathJax={AuthorConfig:window.MathJax}}else{window.MathJax={}}MathJax.isPacked=true;MathJax.version="2.7.2";MathJax.fileversion="2.7.2";MathJax.cdnVersion="2.7.2";MathJax.cdnFileVersions={};(function(d){var b=window[d];if(!b){b=window[d]={}}var e=[];var c=function(f){var g=f.constructor;if(!g){g=function(){}}for(var h in f){if(h!=="constructor"&&f.hasOwnProperty(h)){g[h]=f[h]}}return g};var a=function(){return function(){return arguments.callee.Init.call(this,arguments)}};b.Object=c({constructor:a(),Subclass:function(f,h){var g=a();g.SUPER=this;g.Init=this.Init;g.Subclass=this.Subclass;g.Augment=this.Augment;g.protoFunction=this.protoFunction;g.can=this.can;g.has=this.has;g.isa=this.isa;g.prototype=new this(e);g.prototype.constructor=g;g.Augment(f,h);return g},Init:function(f){var g=this;if(f.length===1&&f[0]===e){return g}if(!(g instanceof f.callee)){g=new f.callee(e)}return g.Init.apply(g,f)||g},Augment:function(f,g){var h;if(f!=null){for(h in f){if(f.hasOwnProperty(h)){this.protoFunction(h,f[h])}}if(f.toString!==this.prototype.toString&&f.toString!=={}.toString){this.protoFunction("toString",f.toString)}}if(g!=null){for(h in g){if(g.hasOwnProperty(h)){this[h]=g[h]}}}return this},protoFunction:function(g,f){this.prototype[g]=f;if(typeof f==="function"){f.SUPER=this.SUPER.prototype}},prototype:{Init:function(){},SUPER:function(f){return f.callee.SUPER},can:function(f){return typeof(this[f])==="function"},has:function(f){return typeof(this[f])!=="undefined"},isa:function(f){return(f instanceof Object)&&(this instanceof f)}},can:function(f){return this.prototype.can.call(this,f)},has:function(f){return this.prototype.has.call(this,f)},isa:function(g){var f=this;while(f){if(f===g){return true}else{f=f.SUPER}}return false},SimpleSUPER:c({constructor:function(f){return this.SimpleSUPER.define(f)},define:function(f){var h={};if(f!=null){for(var g in f){if(f.hasOwnProperty(g)){h[g]=this.wrap(g,f[g])}}if(f.toString!==this.prototype.toString&&f.toString!=={}.toString){h.toString=this.wrap("toString",f.toString)}}return h},wrap:function(i,h){if(typeof(h)!=="function"||!h.toString().match(/\.\s*SUPER\s*\(/)){return h}var g=function(){this.SUPER=g.SUPER[i];try{var f=h.apply(this,arguments)}catch(j){delete this.SUPER;throw j}delete this.SUPER;return f};g.toString=function(){return h.toString.apply(h,arguments)};return 
g}})});b.Object.isArray=Array.isArray||function(f){return Object.prototype.toString.call(f)==="[object Array]"};b.Object.Array=Array})("MathJax");(function(BASENAME){var BASE=window[BASENAME];if(!BASE){BASE=window[BASENAME]={}}var isArray=BASE.Object.isArray;var CALLBACK=function(data){var cb=function(){return arguments.callee.execute.apply(arguments.callee,arguments)};for(var id in CALLBACK.prototype){if(CALLBACK.prototype.hasOwnProperty(id)){if(typeof(data[id])!=="undefined"){cb[id]=data[id]}else{cb[id]=CALLBACK.prototype[id]}}}cb.toString=CALLBACK.prototype.toString;return cb};CALLBACK.prototype={isCallback:true,hook:function(){},data:[],object:window,execute:function(){if(!this.called||this.autoReset){this.called=!this.autoReset;return this.hook.apply(this.object,this.data.concat([].slice.call(arguments,0)))}},reset:function(){delete this.called},toString:function(){return this.hook.toString.apply(this.hook,arguments)}};var ISCALLBACK=function(f){return(typeof(f)==="function"&&f.isCallback)};var EVAL=function(code){return eval.call(window,code)};var TESTEVAL=function(){EVAL("var __TeSt_VaR__ = 1");if(window.__TeSt_VaR__){try{delete window.__TeSt_VaR__}catch(error){window.__TeSt_VaR__=null}}else{if(window.execScript){EVAL=function(code){BASE.__code=code;code="try {"+BASENAME+".__result = eval("+BASENAME+".__code)} catch(err) {"+BASENAME+".__result = err}";window.execScript(code);var result=BASE.__result;delete BASE.__result;delete BASE.__code;if(result instanceof Error){throw result}return result}}else{EVAL=function(code){BASE.__code=code;code="try {"+BASENAME+".__result = eval("+BASENAME+".__code)} catch(err) {"+BASENAME+".__result = err}";var head=(document.getElementsByTagName("head"))[0];if(!head){head=document.body}var script=document.createElement("script");script.appendChild(document.createTextNode(code));head.appendChild(script);head.removeChild(script);var result=BASE.__result;delete BASE.__result;delete BASE.__code;if(result instanceof Error){throw result}return result}}}TESTEVAL=null};var USING=function(args,i){if(arguments.length>1){if(arguments.length===2&&!(typeof arguments[0]==="function")&&arguments[0] instanceof Object&&typeof arguments[1]==="number"){args=[].slice.call(args,i)}else{args=[].slice.call(arguments,0)}}if(isArray(args)&&args.length===1&&typeof(args[0])==="function"){args=args[0]}if(typeof args==="function"){if(args.execute===CALLBACK.prototype.execute){return args}return CALLBACK({hook:args})}else{if(isArray(args)){if(typeof(args[0])==="string"&&args[1] instanceof Object&&typeof args[1][args[0]]==="function"){return CALLBACK({hook:args[1][args[0]],object:args[1],data:args.slice(2)})}else{if(typeof args[0]==="function"){return CALLBACK({hook:args[0],data:args.slice(1)})}else{if(typeof args[1]==="function"){return CALLBACK({hook:args[1],object:args[0],data:args.slice(2)})}}}}else{if(typeof(args)==="string"){if(TESTEVAL){TESTEVAL()}return CALLBACK({hook:EVAL,data:[args]})}else{if(args instanceof Object){return CALLBACK(args)}else{if(typeof(args)==="undefined"){return CALLBACK({})}}}}}throw Error("Can't make callback from given data")};var DELAY=function(time,callback){callback=USING(callback);callback.timeout=setTimeout(callback,time);return callback};var WAITFOR=function(callback,signal){callback=USING(callback);if(!callback.called){WAITSIGNAL(callback,signal);signal.pending++}};var WAITEXECUTE=function(){var signals=this.signal;delete this.signal;this.execute=this.oldExecute;delete this.oldExecute;var 
result=this.execute.apply(this,arguments);if(ISCALLBACK(result)&&!result.called){WAITSIGNAL(result,signals)}else{for(var i=0,m=signals.length;i0&&priority=0;i--){this.hooks.splice(i,1)}this.remove=[]}});var EXECUTEHOOKS=function(hooks,data,reset){if(!hooks){return null}if(!isArray(hooks)){hooks=[hooks]}if(!isArray(data)){data=(data==null?[]:[data])}var handler=HOOKS(reset);for(var i=0,m=hooks.length;ig){g=document.styleSheets.length}if(!i){i=document.head||((document.getElementsByTagName("head"))[0]);if(!i){i=document.body}}return i};var f=[];var c=function(){for(var k=0,j=f.length;k=this.timeout){i(this.STATUS.ERROR);return 1}return 0},file:function(j,i){if(i<0){a.Ajax.loadTimeout(j)}else{a.Ajax.loadComplete(j)}},execute:function(){this.hook.call(this.object,this,this.data[0],this.data[1])},checkSafari2:function(i,j,k){if(i.time(k)){return}if(document.styleSheets.length>j&&document.styleSheets[j].cssRules&&document.styleSheets[j].cssRules.length){k(i.STATUS.OK)}else{setTimeout(i,i.delay)}},checkLength:function(i,l,n){if(i.time(n)){return}var m=0;var j=(l.sheet||l.styleSheet);try{if((j.cssRules||j.rules||[]).length>0){m=1}}catch(k){if(k.message.match(/protected variable|restricted URI/)){m=1}else{if(k.message.match(/Security error/)){m=1}}}if(m){setTimeout(a.Callback([n,i.STATUS.OK]),0)}else{setTimeout(i,i.delay)}}},loadComplete:function(i){i=this.fileURL(i);var j=this.loading[i];if(j&&!j.preloaded){a.Message.Clear(j.message);clearTimeout(j.timeout);if(j.script){if(f.length===0){setTimeout(c,0)}f.push(j.script)}this.loaded[i]=j.status;delete this.loading[i];this.addHook(i,j.callback)}else{if(j){delete this.loading[i]}this.loaded[i]=this.STATUS.OK;j={status:this.STATUS.OK}}if(!this.loadHooks[i]){return null}return this.loadHooks[i].Execute(j.status)},loadTimeout:function(i){if(this.loading[i].timeout){clearTimeout(this.loading[i].timeout)}this.loading[i].status=this.STATUS.ERROR;this.loadError(i);this.loadComplete(i)},loadError:function(i){a.Message.Set(["LoadFailed","File failed to load: %1",i],null,2000);a.Hub.signal.Post(["file load error",i])},Styles:function(k,l){var i=this.StyleString(k);if(i===""){l=a.Callback(l);l()}else{var j=document.createElement("style");j.type="text/css";this.head=h(this.head);this.head.appendChild(j);if(j.styleSheet&&typeof(j.styleSheet.cssText)!=="undefined"){j.styleSheet.cssText=i}else{j.appendChild(document.createTextNode(i))}l=this.timer.create.call(this,l,j)}return l},StyleString:function(n){if(typeof(n)==="string"){return n}var k="",o,m;for(o in n){if(n.hasOwnProperty(o)){if(typeof n[o]==="string"){k+=o+" {"+n[o]+"}\n"}else{if(a.Object.isArray(n[o])){for(var l=0;l="0"&&q<="9"){f[j]=p[f[j]-1];if(typeof f[j]==="number"){f[j]=this.number(f[j])}}else{if(q==="{"){q=f[j].substr(1);if(q>="0"&&q<="9"){f[j]=p[f[j].substr(1,f[j].length-2)-1];if(typeof f[j]==="number"){f[j]=this.number(f[j])}}else{var k=f[j].match(/^\{([a-z]+):%(\d+)\|(.*)\}$/);if(k){if(k[1]==="plural"){var d=p[k[2]-1];if(typeof d==="undefined"){f[j]="???"}else{d=this.plural(d)-1;var h=k[3].replace(/(^|[^%])(%%)*%\|/g,"$1$2%\uEFEF").split(/\|/);if(d>=0&&d=3){c.push([f[0],f[1],this.processSnippet(g,f[2])])}else{c.push(e[d])}}}}else{c.push(e[d])}}return c},markdownPattern:/(%.)|(\*{1,3})((?:%.|.)+?)\2|(`+)((?:%.|.)+?)\4|\[((?:%.|.)+?)\]\(([^\s\)]+)\)/,processMarkdown:function(b,h,d){var j=[],e;var c=b.split(this.markdownPattern);var g=c[0];for(var 
f=1,a=c.length;f1?d[1]:""));f=null}if(e&&(!b.preJax||d)){c.nodeValue=c.nodeValue.replace(b.postJax,(e.length>1?e[1]:""))}if(f&&!f.nodeValue.match(/\S/)){f=f.previousSibling}}if(b.preRemoveClass&&f&&f.className===b.preRemoveClass){a.MathJax.preview=f}a.MathJax.checked=1},processInput:function(a){var b,i=MathJax.ElementJax.STATE;var h,e,d=a.scripts.length;try{while(a.ithis.processUpdateTime&&a.i1){d.jax[a.outputJax].push(b)}b.MathJax.state=c.OUTPUT},prepareOutput:function(c,f){while(c.jthis.processUpdateTime&&h.i=0;q--){if((b[q].src||"").match(f)){s.script=b[q].innerHTML;if(RegExp.$2){var t=RegExp.$2.substr(1).split(/\&/);for(var p=0,l=t.length;p=parseInt(y[z])}}return true},Select:function(j){var i=j[d.Browser];if(i){return i(d.Browser)}return null}};var e=k.replace(/^Mozilla\/(\d+\.)+\d+ /,"").replace(/[a-z][-a-z0-9._: ]+\/\d+[^ ]*-[^ ]*\.([a-z][a-z])?\d+ /i,"").replace(/Gentoo |Ubuntu\/(\d+\.)*\d+ (\([^)]*\) )?/,"");d.Browser=d.Insert(d.Insert(new String("Unknown"),{version:"0.0"}),a);for(var v in a){if(a.hasOwnProperty(v)){if(a[v]&&v.substr(0,2)==="is"){v=v.slice(2);if(v==="Mac"||v==="PC"){continue}d.Browser=d.Insert(new String(v),a);var r=new RegExp(".*(Version/| Trident/.*; rv:)((?:\\d+\\.)+\\d+)|.*("+v+")"+(v=="MSIE"?" ":"/")+"((?:\\d+\\.)*\\d+)|(?:^|\\(| )([a-z][-a-z0-9._: ]+|(?:Apple)?WebKit)/((?:\\d+\\.)+\\d+)");var u=r.exec(e)||["","","","unknown","0.0"];d.Browser.name=(u[1]!=""?v:(u[3]||u[5]));d.Browser.version=u[2]||u[4]||u[6];break}}}try{d.Browser.Select({Safari:function(j){var i=parseInt((String(j.version).split("."))[0]);if(i>85){j.webkit=j.version}if(i>=538){j.version="8.0"}else{if(i>=537){j.version="7.0"}else{if(i>=536){j.version="6.0"}else{if(i>=534){j.version="5.1"}else{if(i>=533){j.version="5.0"}else{if(i>=526){j.version="4.0"}else{if(i>=525){j.version="3.1"}else{if(i>500){j.version="3.0"}else{if(i>400){j.version="2.0"}else{if(i>85){j.version="1.0"}}}}}}}}}}j.webkit=(navigator.appVersion.match(/WebKit\/(\d+)\./))[1];j.isMobile=(navigator.appVersion.match(/Mobile/i)!=null);j.noContextMenu=j.isMobile},Firefox:function(j){if((j.version==="0.0"||k.match(/Firefox/)==null)&&navigator.product==="Gecko"){var m=k.match(/[\/ ]rv:(\d+\.\d.*?)[\) ]/);if(m){j.version=m[1]}else{var i=(navigator.buildID||navigator.productSub||"0").substr(0,8);if(i>="20111220"){j.version="9.0"}else{if(i>="20111120"){j.version="8.0"}else{if(i>="20110927"){j.version="7.0"}else{if(i>="20110816"){j.version="6.0"}else{if(i>="20110621"){j.version="5.0"}else{if(i>="20110320"){j.version="4.0"}else{if(i>="20100121"){j.version="3.6"}else{if(i>="20090630"){j.version="3.5"}else{if(i>="20080617"){j.version="3.0"}else{if(i>="20061024"){j.version="2.0"}}}}}}}}}}}}j.isMobile=(navigator.appVersion.match(/Android/i)!=null||k.match(/ Fennec\//)!=null||k.match(/Mobile/)!=null)},Chrome:function(i){i.noContextMenu=i.isMobile=!!navigator.userAgent.match(/ Mobile[ \/]/)},Opera:function(i){i.version=opera.version()},Edge:function(i){i.isMobile=!!navigator.userAgent.match(/ Phone/)},MSIE:function(j){j.isMobile=!!navigator.userAgent.match(/ Phone/);j.isIE9=!!(document.documentMode&&(window.performance||window.msPerformance));MathJax.HTML.setScriptBug=!j.isIE9||document.documentMode<9;MathJax.Hub.msieHTMLCollectionBug=(document.documentMode<9);if(document.documentMode<10&&!s.params.NoMathPlayer){try{new ActiveXObject("MathPlayer.Factory.1");j.hasMathPlayer=true}catch(m){}try{if(j.hasMathPlayer){var 
i=document.createElement("object");i.id="mathplayer";i.classid="clsid:32F66A20-7614-11D4-BD11-00104BD3F987";g.appendChild(i);document.namespaces.add("m","http://www.w3.org/1998/Math/MathML");j.mpNamespace=true;if(document.readyState&&(document.readyState==="loading"||document.readyState==="interactive")){document.write('');j.mpImported=true}}else{document.namespaces.add("mjx_IE_fix","http://www.w3.org/1999/xlink")}}catch(m){}}}})}catch(c){console.error(c.message)}d.Browser.Select(MathJax.Message.browsers);if(h.AuthorConfig&&typeof h.AuthorConfig.AuthorInit==="function"){h.AuthorConfig.AuthorInit()}d.queue=h.Callback.Queue();d.queue.Push(["Post",s.signal,"Begin"],["Config",s],["Cookie",s],["Styles",s],["Message",s],function(){var i=h.Callback.Queue(s.Jax(),s.Extensions());return i.Push({})},["Menu",s],s.onLoad(),function(){MathJax.isReady=true},["Typeset",s],["Hash",s],["MenuZoom",s],["Post",s.signal,"End"])})("MathJax")}}; diff --git a/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd b/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd index e72f3bc3..c9abc54a 100644 --- a/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd +++ b/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd @@ -1,6 +1,5 @@ --- title: "Bias correction assessment" -author: "Ruth Bowyer" date: "`r format(Sys.Date())`" output: html_document: From f0de8ebed3a4abb4cccdc5bff86ecfd3bd0947e2 Mon Sep 17 00:00:00 2001 From: Ruth Bowyer <105492883+RuthBowyer@users.noreply.github.com> Date: Sun, 10 Dec 2023 22:31:10 +0000 Subject: [PATCH 48/83] small change to format --- notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd b/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd index c9abc54a..90618949 100644 --- a/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd +++ b/notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd @@ -258,7 +258,7 @@ shape <-sf::st_as_sf(vect(paste0(dd, "shapefiles/three.cities/", city, "/", city -## **Compare across the same day** +#### **Compare across the same day** Here we take a day and visualise the differences between the methods and runs From 42d8d23f462e98faaeb4fad35b05d63189dca9d6 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 03:01:41 +0000 Subject: [PATCH 49/83] feat: add `rstudio` docker deploy and `python.utils.make_user` --- .dockerignore | 6 ++++ compose.yml | 7 +++++ compose/Dockerfile | 7 +++-- compose/server/Dockerfile | 38 ++++++++++++++++++++++++++ python/.pytest.ini | 2 ++ python/debiasing/debias_wrapper.py | 1 - python/tests/test_debiasing.py | 9 ------ python/utils.py | 44 ++++++++++++++++++++++++++++++ 8 files changed, 101 insertions(+), 13 deletions(-) create mode 100644 compose/server/Dockerfile diff --git a/.dockerignore b/.dockerignore index e69de29b..6d0c3e2d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -0,0 +1,6 @@ +README_files +_reference +_sidebar* +_site* +_extensions +*.bak diff --git a/compose.yml b/compose.yml index 2021fcb4..eacf7f4e 100644 --- a/compose.yml +++ b/compose.yml @@ -24,6 +24,13 @@ services: - "8080:80" # command: quarto preview --port 8080 + rstudio: + build: + context: . 
+ dockerfile: ./compose/server/Dockerfile + ports: + - "8787:8787" + # volumes: # climate_data: # driver: local diff --git a/compose/Dockerfile b/compose/Dockerfile index d74fa6a6..07fe785a 100644 --- a/compose/Dockerfile +++ b/compose/Dockerfile @@ -1,4 +1,4 @@ -FROM jupyter/r-notebook as clim-recal-base +FROM quay.io/jupyter/r-notebook as clim-recal-base # This is derived from documentation available at # https://jupyter-docker-stacks.readthedocs.io/en/latest/ @@ -27,10 +27,11 @@ ENV SHELL /bin/bash ARG HOST_DATA_PATH=/Volumes/vmfileshare # The local_data_path is an absolute path to mount ClimateData within `docker` -ARG DOCKER_DATA_PATH=/mnt/vmfileshareRG env_name=clim-recal +ARG DOCKER_DATA_PATH=/mnt/vmfileshare +ARG env_name=clim-recal # `py_ver` is not currently used below and is specified in `environment.yaml` -# here as reminder and clarity if future change needed. +# included here as reminder and clarity if future change needed. ARG py_ver=3.9 USER root diff --git a/compose/server/Dockerfile b/compose/server/Dockerfile new file mode 100644 index 00000000..4daf96d2 --- /dev/null +++ b/compose/server/Dockerfile @@ -0,0 +1,38 @@ +FROM rocker/rstudio:4.3.2 + +# The local_data_path is an absolute local path to ClimateData on the machine hosting running `docker` +ARG HOST_DATA_PATH=/Volumes/vmfileshare + +# The local_data_path is an absolute path to mount ClimateData within `docker` +ARG DOCKER_DATA_PATH=/mnt/vmfileshare + +# ENV S6_VERSION=v2.1.0.2 +# ENV RSTUDIO_VERSION=2023.09.1+494 +# ENV DEFAULT_USER=rstudio +# ENV PANDOC_VERSION=default +# ENV QUARTO_VERSION=default + +RUN /rocker_scripts/install_pandoc.sh +RUN /rocker_scripts/install_quarto.sh +# RUN /rocker_scripts/install_jupyter.sh + +# Install user script +# + +# Maybe install necessary geo packages for terra following https://github.com/rspatial/terra/issues/248 +RUN apt-get update && apt-get -y install libudunits2-dev libgdal-dev libgeos-dev libproj-dev \ +gdal-bin python3-gdal libgdal-dev build-essential # Python specific deps +# Then install +# library(ggplot2) +# library(terra) +# library(tmap) #pretty maps +# library(RColorBrewer) +# library(tidyverse) +# library(kableExtra) + +# Install +# install.packages("hydroGOF") +COPY . /home/rstudio/ +# EXPOSE 8787 + +CMD ["/init"] diff --git a/python/.pytest.ini b/python/.pytest.ini index bf6d8df2..90ba61db 100644 --- a/python/.pytest.ini +++ b/python/.pytest.ini @@ -10,6 +10,8 @@ addopts = -ra -q --cov=. --cov-report=term:skip-covered +doctest_optionflags = ELLIPSIS + pythonpath = . 
testpaths = tests diff --git a/python/debiasing/debias_wrapper.py b/python/debiasing/debias_wrapper.py index 4e8713af..6337120c 100644 --- a/python/debiasing/debias_wrapper.py +++ b/python/debiasing/debias_wrapper.py @@ -7,7 +7,6 @@ from pathlib import Path from typing import Final, Generator, Optional, Union -import pytest from utils import ( DATE_FORMAT_SPLIT_STR, DATE_FORMAT_STR, diff --git a/python/tests/test_debiasing.py b/python/tests/test_debiasing.py index 16d8ed5f..a08c99a0 100644 --- a/python/tests/test_debiasing.py +++ b/python/tests/test_debiasing.py @@ -14,16 +14,7 @@ PREPROCESS_OUT_FOLDER_FILES_COUNT_CORRECT, ) from debiasing.debias_wrapper import ( - CALIB_DATES_STR_DEFAULT, - CMETHODS_FILE_NAME, - CMETHODS_OUT_FOLDER_DEFAULT, - DATA_PATH_DEFAULT, - MOD_FOLDER_DEFAULT, - OBS_FOLDER_DEFAULT, PREPROCESS_FILE_NAME, - PREPROCESS_OUT_FOLDER_DEFAULT, - PROCESSESORS_DEFAULT, - VALID_DATES_STR_DEFAULT, CityOptions, MethodOptions, RunConfig, diff --git a/python/utils.py b/python/utils.py index 9bba22ae..e25c1728 100644 --- a/python/utils.py +++ b/python/utils.py @@ -1,4 +1,5 @@ """Utility functions.""" +import subprocess from datetime import date, datetime from pathlib import Path from typing import Any, Final, Generator, Iterable, Optional, Union @@ -6,6 +7,8 @@ DateType = Union[date, str] DATE_FORMAT_STR: Final[str] = "%Y%m%d" DATE_FORMAT_SPLIT_STR: Final[str] = "-" +RSTUDIO_CODE_COPY_PATH: Path = Path("/home/rstudio/*") +DEBIAN_HOME_PATH: Path = Path("/home/") def date_to_str( @@ -105,3 +108,44 @@ def path_iterdir( raise error else: return + + +def make_user( + name: str, + password: str, + code_path: Path = RSTUDIO_CODE_COPY_PATH, + user_home_path: Path = DEBIAN_HOME_PATH, +) -> Path: + """Make user account and copy code to that environment. + + Args: + name: user and home folder name + password: login password + code_path: path to copy code from to user path + + Example: + ```pycon + >>> import os + >>> from shutil import rmtree + >>> if os.geteuid() != 0: + ... 
pytest.skip('requires root permission to run') + >>> user_name: str = 'very_unlinkely_test_user' + >>> password: str = 'test_pass' + >>> code_path: Path = Path('/home/jovyan') + >>> make_user(user_name, password, code_path=code_path) + PosixPath('/home/very_unlinkely_test_user') + >>> Path(f'/home/{user_name}/python/conftest.py').is_file() + True + >>> subprocess.run(f'userdel {user_name}', shell=True) + CompletedProcess(args='userdel very_unlinkely_test_user', returncode=0) + >>> rmtree(f'/home/{user_name}') + + ``` + """ + home_path: Path = user_home_path / name + subprocess.run(f"useradd {name}", shell=True) + subprocess.run(f"echo {name}:{password} | chpasswd", shell=True) + subprocess.run(f"mkdir {home_path}", shell=True) + subprocess.run(f"cp -r {code_path}/* {home_path}", shell=True) + subprocess.run(f"chown -R {name}:{name} home_path", shell=True) + return home_path From 6415abf76cf972cffe2d20e7ec2263279f7d90c7 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 08:14:54 +0000 Subject: [PATCH 50/83] feat: add `docker` config for `RStudio` users with `compose` refactor --- .github/workflows/ci.yaml | 6 +- _quarto.yml | 2 + compose.yml | 23 ++--- compose/docs/Dockerfile | 7 +- compose/{ => jupyter}/Dockerfile | 0 compose/linux-compose.yml | 11 +++ compose/mac-compose.yml | 11 +++ .../three_cities_debiasing_workshop.sh | 25 +++++ python/utils.py | 95 ++++++++++++++++--- 9 files changed, 145 insertions(+), 35 deletions(-) rename compose/{ => jupyter}/Dockerfile (100%) create mode 100644 compose/linux-compose.yml create mode 100644 compose/mac-compose.yml create mode 100644 python/debiasing/three_cities_debiasing_workshop.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d6da476b..0cb2abe7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,11 +15,11 @@ env: on: pull_request: - branches: ['main', 'doc-deploy', 'ruth-notebook-for-workshop'] + branches: ['main', 'doc-deploy', 'ruth-notebook-for-workshop', 'r-docker-refactor'] paths-ignore: ['docs/**'] push: - branches: ['main', 'doc-deploy', 'ruth-doc-deploy'] + branches: ['main', 'doc-deploy', 'ruth-notebook-for-workshop', 'r-docker-refactor'] concurrency: group: ${{ github.head_ref || github.run_id }} @@ -122,7 +122,7 @@ jobs: run: | # A potentially quicker build option to try in future, requires running in detatched mode # DOCKER_BUILDKIT=1 docker build --no-cache -f compose/docs/Dockerfile --target builder --tag 'clim-recal-docs' . - docker compose build + docker compose build docs docker compose up --detach docker cp $(docker compose ps -q docs):/usr/local/apache2/htdocs/ ${{ env.GH_PAGE_PATH }} diff --git a/_quarto.yml b/_quarto.yml index a5e2b808..97a7148e 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -15,6 +15,8 @@ project: - "docs/pipeline.qmd" - "docs/contributing.md" - "python/README.md" + # - "notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd" + # Requires dataset mounted to run notebook toc: True number-sections: True diff --git a/compose.yml b/compose.yml index eacf7f4e..6d8572ac 100644 --- a/compose.yml +++ b/compose.yml @@ -5,24 +5,21 @@ services: jupyter: build: context: . - dockerfile: ./compose/Dockerfile + dockerfile: ./compose/jupyter/Dockerfile target: clim-recal-base ports: - "8888:8888" - # volumes: - # - climate_data:/mnt/vmfileshare - # - type: bind - # source: /Volumes/vmfileshare - # target: /mnt/vmfileshare + volumes: + - .:/home/jovyan:rw docs: build: context: . 
dockerfile: ./compose/docs/Dockerfile - # target: clim-recal-docs ports: - "8080:80" - # command: quarto preview --port 8080 + volumes: + - .:/home/jovyan rstudio: build: @@ -30,11 +27,5 @@ services: dockerfile: ./compose/server/Dockerfile ports: - "8787:8787" - -# volumes: -# climate_data: -# driver: local -# driver_opts: -# type: none -# device: /Volumes/vmfileshare -# o: bind + volumes: + - .:/home/rstudio diff --git a/compose/docs/Dockerfile b/compose/docs/Dockerfile index 76b3b680..a840dddc 100644 --- a/compose/docs/Dockerfile +++ b/compose/docs/Dockerfile @@ -4,6 +4,7 @@ FROM ghcr.io/quarto-dev/quarto:${QUARTO_VERSION} AS builder ARG PORT=8080 ARG py_ver=3.9 +ENV DEBIAN_FRONTEND=noninteractive # ARG RIG_VERSION="latest" # ARG R_VERSION="release" @@ -15,14 +16,16 @@ ARG py_ver=3.9 # WORKDIR /app # RUN Rscript -e "renv::restore()" # RUN quarto render . -ADD . /app +COPY . /app WORKDIR /app # RUN Rscript -e "renv::restore()" EXPOSE ${PORT}:${PORT} # RUN quarto preview --port ${PORT}:${PORT} -RUN apt-get update && apt-get install -y python${py_ver} python3-pip +RUN apt-get update && apt-get install -y python${py_ver} python3-pip r-base r-base-dev RUN pip3 install quartodoc && quartodoc build +RUN Rscript -e 'install.packages("rmarkdown", repos="https://cloud.r-project.org")' + RUN quarto render FROM httpd:alpine diff --git a/compose/Dockerfile b/compose/jupyter/Dockerfile similarity index 100% rename from compose/Dockerfile rename to compose/jupyter/Dockerfile diff --git a/compose/linux-compose.yml b/compose/linux-compose.yml new file mode 100644 index 00000000..6ce3fbf2 --- /dev/null +++ b/compose/linux-compose.yml @@ -0,0 +1,11 @@ +version: "3.8" + +services: + + jupyter: + volumes: + - /mnt/vmfileshare:/mnt/vmfileshare + + rstudio: + volumes: + - /mnt/vmfileshare:/mnt/vmfileshare diff --git a/compose/mac-compose.yml b/compose/mac-compose.yml new file mode 100644 index 00000000..36b0ffcf --- /dev/null +++ b/compose/mac-compose.yml @@ -0,0 +1,11 @@ +version: "3.8" + +services: + + jupyter: + volumes: + - /Volumes/vmfileshare:/mnt/vmfileshare + + rstudio: + volumes: + - /Volumes/vmfileshare:/mnt/vmfileshare diff --git a/python/debiasing/three_cities_debiasing_workshop.sh b/python/debiasing/three_cities_debiasing_workshop.sh new file mode 100644 index 00000000..28f70eee --- /dev/null +++ b/python/debiasing/three_cities_debiasing_workshop.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +declare -a vars=("tasmax" "rainfall" "tasmin") +declare -a runs=("05" "07" "08" "06") +declare -a cities=("Glasgow") +declare -a methods=("quantile_delta_mapping" "quantile_mapping") +declare -a methods_2=("variance_scaling" "delta_method") + +for run in "${runs[@]}"; do + for city in "${cities[@]}"; do + for var in "${vars[@]}"; do + + python preprocess_data.py --mod /mnt/vmfileshare/ClimateData/Cropped/three.cities/CPM/$city --obs /mnt/vmfileshare/ClimateData/Cropped/three.cities/Hads.updated360/$city -v $var -r $run --out /mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/workshop/$city/$run/$var --calib_dates 19801201-20101129 --valid_dates 20101130-20201130 + + for method in "${methods[@]}"; do + python run_cmethods.py --input_data_folder /mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/workshop/$city/$run/$var --out /mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/workshop/$city/$run --method $method --v $var -p 32 + done + + for method in "${methods_2[@]}"; do + python run_cmethods.py --input_data_folder 
/mnt/vmfileshare/ClimateData/Cropped/three.cities/Preprocessed/workshop/$city/$run/$var --out /mnt/vmfileshare/ClimateData/Debiased/three.cities.cropped/workshop/$city/$run --method $method --group time.month --v $var -p 32 + done + + done + done +done diff --git a/python/utils.py b/python/utils.py index e25c1728..e0ea784b 100644 --- a/python/utils.py +++ b/python/utils.py @@ -2,12 +2,14 @@ import subprocess from datetime import date, datetime from pathlib import Path -from typing import Any, Final, Generator, Iterable, Optional, Union +from shutil import rmtree +from typing import Any, Callable, Final, Generator, Iterable, Optional, Union DateType = Union[date, str] DATE_FORMAT_STR: Final[str] = "%Y%m%d" DATE_FORMAT_SPLIT_STR: Final[str] = "-" -RSTUDIO_CODE_COPY_PATH: Path = Path("/home/rstudio/*") +RSTUDIO_DOCKER_USER_PATH: Path = Path("/home/rstudio") +JUPYTER_DOCKER_USER_PATH: Path = Path("/home/jovyan") DEBIAN_HOME_PATH: Path = Path("/home/") @@ -111,41 +113,106 @@ def path_iterdir( def make_user( - name: str, + user: str, password: str, - code_path: Path = RSTUDIO_CODE_COPY_PATH, + code_path: Path = RSTUDIO_DOCKER_USER_PATH, user_home_path: Path = DEBIAN_HOME_PATH, ) -> Path: """Make user account and copy code to that environment. Args: - name: user and home folder name + user: user and home folder name password: login password code_path: path to copy code from to user path Example: ```pycon >>> import os - >>> from shutil import rmtree >>> if os.geteuid() != 0: ... pytest.skip('requires root permission to run') >>> user_name: str = 'very_unlinkely_test_user' >>> password: str = 'test_pass' >>> code_path: Path = Path('/home/jovyan') - >>> make_user(user_name, password, code_path=code_path) + >>> make_user(user_name, password, code_path=JUPYTER_DOCKER_USER_PATH) PosixPath('/home/very_unlinkely_test_user') >>> Path(f'/home/{user_name}/python/conftest.py').is_file() True - >>> subprocess.run(f'userdel {user_name}', shell=True) - CompletedProcess(args='userdel very_unlinkely_test_user', returncode=0) - >>> rmtree(f'/home/{user_name}') + >>> rm_user(user_name) + 'very_unlinkely_test_user' ``` """ - home_path: Path = user_home_path / name - subprocess.run(f"useradd {name}", shell=True) - subprocess.run(f"echo {name}:{password} | chpasswd", shell=True) + home_path: Path = user_home_path / user + subprocess.run(f"useradd {user}", shell=True) + subprocess.run(f"echo {user}:{password} | chpasswd", shell=True) subprocess.run(f"mkdir {home_path}", shell=True) subprocess.run(f"cp -r {code_path}/* {home_path}", shell=True) - subprocess.run(f"chown -R {name}:{name} home_path", shell=True) + subprocess.run(f"chown -R {user}:{user} home_path", shell=True) return home_path + + +def rm_user(user: str, user_home_path: Path = DEBIAN_HOME_PATH) -> str: + """Remove user and user home folder. + + Args: + user: user and home folder name + password: login password + + Example: + ```pycon + >>> import os + >>> if os.geteuid() != 0: + ... 
pytest.skip('requires root permission to run') + >>> user_name: str = 'very_unlinkely_test_user' + >>> password: str = 'test_pass' + >>> make_user(user_name, password, code_path=JUPYTER_DOCKER_USER_PATH) + PosixPath('/home/very_unlinkely_test_user') + >>> rm_user(user_name) + 'very_unlinkely_test_user' + + ``` + """ + subprocess.run(f"userdel {user}", shell=True) + rmtree(user_home_path / user) + return user + + +def make_users( + file_path: Path, user_col: str, password_col: str, file_reader: Callable, **kwargs +) -> Generator[Path, None, None]: + """Load a file of usernames and passwords and to pass to make_user. + + Args: + file_path: path to collumned file including user names and passwords per row + user_col: str of column name for user names + password_col: name of column name for passwords + file_reader: function to read `file_path` + **kwargs: additional parameters for to pass to `file_reader` + + Example: + ```pycon + >>> import os + >>> if os.geteuid() != 0: + ... pytest.skip('requires root permission to run') + >>> from pandas import read_excel + >>> code_path: Path = Path('/home/jovyan') + >>> def excel_row_iter(path: Path, **kwargs) -> dict: + ... df: DataFrame = read_excel(path, **kwargs) + ... return df.to_dict(orient="records") + >>> test_accounts_path: Path = Path('tests/test_user_accounts.xlsx') + >>> user_paths: tuple[Path, ...] = tuple(make_users( + ... file_path=test_accounts_path, + ... user_col="User Name", + ... password_col="Password", + ... file_reader=excel_row_iter, + ... code_path=JUPYTER_DOCKER_USER_PATH, + ... )) + >>> [(path / 'python' / 'conftest.py').is_file() for path in user_paths] + [True, True, True, True, True] + >>> [rm_user(user_path.name) for user_path in user_paths] + ['sally', 'george', 'jean', 'felicity', 'frank'] + + ``` + """ + for record in file_reader(file_path): + yield make_user(user=record[user_col], password=record[password_col], **kwargs) From 1bc889102a110c93ea891bab5f94c29a59fffe8c Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 08:37:08 +0000 Subject: [PATCH 51/83] fix(ci): add needed `openpyxl` dependency --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 737eb843..a44857e3 100644 --- a/environment.yml +++ b/environment.yml @@ -39,6 +39,7 @@ dependencies: - matplotlib==3.6.1 - netcdf4==1.6.1 - numpy==1.23.4 + - openpyxl==3.1.2 - packaging==21.3 - pandas==1.5.1 - pillow==9.4.0 From c7ff9ac89ba8c23a695fcea85a9a7aa3248d75fd Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 08:38:23 +0000 Subject: [PATCH 52/83] fix(ci): only build `jupyter` for `python` testing --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0cb2abe7..604e340f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,7 +57,7 @@ jobs: - name: Build, Test and Save Test Coverage run: | - docker compose build + docker compose build jupyter docker compose up --detach docker compose exec jupyter bash -c "conda run -n $CONDA_ENV_NAME --cwd python pytest -p no:sugar" export JUPYTER_ID=$(docker compose ps -q jupyter) From 3ffacedc82a3adaf77ef0ce677ff49fdec6fc01a Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 09:02:40 +0000 Subject: [PATCH 53/83] fix: `jupyter` `user` permission --- compose/jupyter/Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/compose/jupyter/Dockerfile 
b/compose/jupyter/Dockerfile index 07fe785a..cd8ebbaf 100644 --- a/compose/jupyter/Dockerfile +++ b/compose/jupyter/Dockerfile @@ -63,18 +63,20 @@ RUN "${CONDA_DIR}/envs/${env_name}/bin/python" -m ipykernel install --user --nam fix-permissions "/home/${NB_USER}" # Copy the rest of the clim-recal code to volume -ADD --chown=${NB_UID}:${NB_GID} . . +COPY --chown=${NB_UID}:${NB_GID} . . +# Switch to default jupyter user +USER ${NB_UID} # Add custom activate script to reflect environment -USER root +# USER root RUN activate_custom_env_script=/usr/local/bin/before-notebook.d/activate_custom_env.sh && \ echo "#!/bin/bash" > ${activate_custom_env_script} && \ echo "eval \"$(conda shell.bash activate "${env_name}")\"" >> ${activate_custom_env_script} && \ chmod +x ${activate_custom_env_script} # Switch to default jupyter user -USER ${NB_UID} +# USER ${NB_UID} # This eases running shell commands outside docker following: # https://pythonspeed.com/articles/activate-conda-dockerfile/ From 98f2ce1dae4529e898621365447c72639b6fac1c Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 09:14:11 +0000 Subject: [PATCH 54/83] fix: move `chmod` to after activation command in `jupyter/Dockerfile` --- compose/jupyter/Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compose/jupyter/Dockerfile b/compose/jupyter/Dockerfile index cd8ebbaf..67f1ae6f 100644 --- a/compose/jupyter/Dockerfile +++ b/compose/jupyter/Dockerfile @@ -67,7 +67,7 @@ COPY --chown=${NB_UID}:${NB_GID} . . # Switch to default jupyter user -USER ${NB_UID} +# USER ${NB_UID} # Add custom activate script to reflect environment # USER root RUN activate_custom_env_script=/usr/local/bin/before-notebook.d/activate_custom_env.sh && \ @@ -75,8 +75,10 @@ RUN activate_custom_env_script=/usr/local/bin/before-notebook.d/activate_custom_ echo "eval \"$(conda shell.bash activate "${env_name}")\"" >> ${activate_custom_env_script} && \ chmod +x ${activate_custom_env_script} +RUN chown -R ${NB_UID}:${NB_GID} . 
+ # Switch to default jupyter user -# USER ${NB_UID} +USER ${NB_UID} # This eases running shell commands outside docker following: # https://pythonspeed.com/articles/activate-conda-dockerfile/ From f69a3879a00a472738bf47a4a2794bebc2f83054 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 09:43:19 +0000 Subject: [PATCH 55/83] fix(ci): `jupyter` build during `pytest` detach --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 604e340f..963510f7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -58,7 +58,7 @@ jobs: - name: Build, Test and Save Test Coverage run: | docker compose build jupyter - docker compose up --detach + docker compose up jupyter --detach docker compose exec jupyter bash -c "conda run -n $CONDA_ENV_NAME --cwd python pytest -p no:sugar" export JUPYTER_ID=$(docker compose ps -q jupyter) echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV From 8cdeea214042b4d0586b01dc4fc07c8e47ae6ec6 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 10:03:14 +0000 Subject: [PATCH 56/83] fix(ci): comment out `docker` `compose` `volume` mount for permission --- .github/workflows/ci.yaml | 2 +- compose.yml | 4 ++-- compose/jupyter/Dockerfile | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 963510f7..d9dc724d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -59,7 +59,7 @@ jobs: run: | docker compose build jupyter docker compose up jupyter --detach - docker compose exec jupyter bash -c "conda run -n $CONDA_ENV_NAME --cwd python pytest -p no:sugar" + docker compose exec jupyter bash -c "conda run -n $env_name --cwd python pytest pytest" export JUPYTER_ID=$(docker compose ps -q jupyter) echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV echo "jupyter_id=$JUPYTER_ID" diff --git a/compose.yml b/compose.yml index 6d8572ac..1de2bdbe 100644 --- a/compose.yml +++ b/compose.yml @@ -9,8 +9,8 @@ services: target: clim-recal-base ports: - "8888:8888" - volumes: - - .:/home/jovyan:rw + # volumes: + # - .:/home/jovyan:rw docs: build: diff --git a/compose/jupyter/Dockerfile b/compose/jupyter/Dockerfile index 67f1ae6f..3df90ba9 100644 --- a/compose/jupyter/Dockerfile +++ b/compose/jupyter/Dockerfile @@ -63,8 +63,9 @@ RUN "${CONDA_DIR}/envs/${env_name}/bin/python" -m ipykernel install --user --nam fix-permissions "/home/${NB_USER}" # Copy the rest of the clim-recal code to volume -COPY --chown=${NB_UID}:${NB_GID} . . - +# COPY --chown=${NB_UID}:${NB_GID} . . +# Keep below while permission ambiguity persists on compose volume mounting +COPY . . 
# Switch to default jupyter user # USER ${NB_UID} From 9c43ff1cf04f91f517b932d4cd00ab495c2593ca Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 10:45:47 +0000 Subject: [PATCH 57/83] fix(ci): fix `docker` `ci` and reinstate env var --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d9dc724d..00b782b0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -59,7 +59,7 @@ jobs: run: | docker compose build jupyter docker compose up jupyter --detach - docker compose exec jupyter bash -c "conda run -n $env_name --cwd python pytest pytest" + docker compose exec -u 0 jupyter bash -c "conda run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest" export JUPYTER_ID=$(docker compose ps -q jupyter) echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV echo "jupyter_id=$JUPYTER_ID" From f934c50fc1cfbd546e8295ed907d358fbcd00230 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 10:52:11 +0000 Subject: [PATCH 58/83] fix(test): add `tests/test_user_accounts.xlsx` for tests --- python/tests/test_user_accounts.xlsx | Bin 0 -> 8944 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 python/tests/test_user_accounts.xlsx diff --git a/python/tests/test_user_accounts.xlsx b/python/tests/test_user_accounts.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9e84eb8f529f5226a83200ad03b86d562dccfcda GIT binary patch literal 8944 zcmeHN1y>x|)@>lT1b27$1eYK|g9dkL+#7dEaDrQq;K40Okj9dR3iM_g0;K_qqF&k{rx4Yycbp0RR9{0E~|^EcBrOfS6|h02TlNT1VX8 z&KYRuZ1C2@0SMA#b+@%4&v^z-p9O%1T>roJU;G4069yGJ+0iAhr60tFM;FV+vEr%199i_@JRP#AwzI7Z4DD%BL5T?HYg5xF z;$iFVJ)mjDBZza-);i9@CKBZ!FffQRO9P}h)_1CKORT+;EhV(T!4>x3$e8a{$5`oU zKq=NP7D4H?i0i7KuSB7#%3!S0;B=QP z*8-yd$~9$Z0=>*090NvM8eV~GYT+S$!%bLS6n;^x-4kQY)atG4?b%BIA>u;muJI*~ zg#n9Brf2{~;P@;+$vUD2w~W(n#HULHCs%&(q=S6vo@Nb;xNC*Tv@tU}##8~H(w2EO zzc*7tk&lNl(D!p+|5x5N#X}QfZ@Ypq`0vjoZ=SHPFm@Zy+r-fO+bF1gLJn8^b_Ki+ zMmKeWj!u(y{Wu@t0Dz|_7=Y4WU|FliPIUo^HF*f?kRh-%a01$Z*jRra{|CqaVhsLi z>ZNgS6g$~bf{&#iLi(gWeS!%~4{sKQ^gb0z zZp<6b5vg99tV=)b8t&7<4391z#Lkkwlm5U6vchEuL;W2j0m2E%=a47T2muKp00GL~ zhV6HpxY;{d8{6Ak|75R!V+IOBVGu9>-K|vRja(NyIvDvegxxL86$fL^nT_&5ZT~q! 
ze+|O|Ed`hF#R?H)qn_rp92<;%u*dOGuj};-v~?J)%XX&1aLi|(ST@H3u=YdeLof)& zhh?OJQm|N<2YdS^hsbCjL0ApKaed60*t}ae^fUw^xdoIG7Y*aV+*ZTvE}T}NQIY{j z%na6i_rH6mdQcWvQ(sU1k-=>9sX=_<3hq;&2ie#;0iui`ss~_fXOe>Hrd{$1;jH<= z?2X5;b$jj|$|iA%9BJ_jA@5*k@{m3$GCzhm&}L3*liKl`v)KIrL+ipB^Wep?+4uh5 zsTIhZ{(H#0Y)}>Pf&l=AAlc7z$Q6)~`E#(8s@dA-@}fWbj{kt~aoIhFI*;I$gynXhqB(pn>fUuoy6 zi%BoTkEO%#$hGnsRDE1N%qxj4zn3ReE7G2^8Xqh*jZ2C>YLBzuX+q_2?r*cIG>&9Z zl){7OTR|wxW8$22LKMqfZ69-i8C1?BkNleSwnk!{&z@$-fo!J2%C0L@gBZJ zdKWv14IyvU_UpKVH=@1}=G2QnG;KHItvtJY zzym#*7v6>#Rr{#M79$^h7<==?h+aEccU{VI`+B$cokiy7eEW7QuK6P^h7##|?+M|9 zAChy8&>gd1rf4q$C~v*7raisB-8%sn3Z zi{tdX@{jsG*KP-3i53lBrextTdNK`Z1P>UFkwNs(ijc&|MmyjvOUGks&=D@1BP+y~ z>*`zE%WpPY_;q9eWiK-VkU%YG@8CGI<6Do(Wby}+>$Spbhr2HNBpeJU2I*4-vI1B= znml-lLulw0`+nLFpHjH8EVg;!VPOkmsmpD7Ez#z>b_GFa8r6&sQ}FgwWKs62l<+%m zr}8%V-ddaMdT~ zuN(6XQ`nvWjVysw#?us#YD{{Z$~cMj-s5wq?S+@T32qg}X-A>jFDxz`4F|f9Ft_xt zw91t&AYXk2exvYLdfBP7{SlU;Zjn>wFwrh0kgfJ*9RKV`)T$_>n(D_GY-+SlhS&rq zL2_o(TM_cOkK1*KvMfYw?F$@INY5yq`QNHwJS{R0MVaH@s)6IB%@&v+SNo$zy%Vmt^e-EBLv&oy(*MTS9{V!u!i*$pq>{c`Yc*H&Z>`t|q4=nbSWVh5oatnBd*+ zle&SS+Y6O4=Del~PL%6Do;3?G3V2W((MwXl$d`t#1h_7jG{WQ`EDU0V^QH8UQT(oE z)Y`rilaDrTb~s+bclc|$Y~#MuhuW^tUrATR-I3_Fnn${JzDrsA@*_;GIq{TX4|&CM zM%?tqM#Mcor)Z>||Ju-~*-)}mYCbPyX&IY}5^=`q) z&IN;9_*K-IR=*|Xwd(dGuhQfUFcBDtbo-+V|FJELW=;ksS$VdEJx zV886TU}DQ)R}{o=>Zwpu9rZJP3X72#J6Ux<>+f*vf2JW-_5mphOpa;ur1Ff9@zt;& z+^zjT@=iM*dW!-i(tF5%ZnJ;m9cK%mEs*WEBgfCEKF}J9AmYJp#k&(h0l7WA*odZC zULCQCU!pe6NFc7QJ5W~V;*4u1!h)vZyHugi%TE@w7fKixg+T>hL?cqy?2?Vq4b4)O zZBnP%$ZETyMo2w^KYzZ?^z}SWt|sIOFiVXY(~c0c}+^5P3_1dPGNvQvH;wVD(*9dyP z)vg7a2?R|0U`$%WuWuPpWz}vfgaaN6sKV>mIr9?r48t^uRIc2OyZYtp9+FbY7sKZO#>=wcV^ ziW9}yaVZq8sXXjhAlB@GZ5C{`o@*pKZJ_>;TJgFHcF`o#72l12wfN)U z$%#Xz{8IdRLs(PTXsq=St86LYdYdatV_jt~suH+`8jcxOz}r5q+FB(ZNm9}z9=bUr z#JMhC8Fq1mUJ(>JK_l*DNTb49BW@fz*a0MeH-9#?Y+KP;Xgww5ZAxVX83(&>XwhcR zparY)feN1l#|x5{2-EMJPJZoDwB>Bn5}x4<=XWN|@8q`lLoNwlwIZQxpxu+|b-j#} zbJ`hih`{HH6>^=8QW6Bs%g6O~`O>0MNXEJa$>4cM&k0Bfi4!YTb1c>))^+?aqx=f{ z-RffkUFaQfn53#XG8IgQ=NwoW0h|}Co2?xR;}YOMFG`$m!dG>U?7{b{7Y1pL8gUJ7 zH;ide8YYq)$d)qo4Xa(I8$to&imj&^oRSfkfY+x9ww31gj)CsD^kEn)dAEcTOR{>r zo9S;oGd`fbS$zS$N%lQrf?)=Bhdr*TRc1d5LCZYZ6Zgnl&wVX$ATgR$bjM|F>cRntAV z4UoD)f#6`Hbx!XS_|xct+Y$j^5UbZ8$<<;dYtJG%ifC1X$vz<)E=BiCDiiD#<*zP7VH=k$CW0COeg$v4!bPZNe!<084 zU!*WECQz|#IMRyHan47NqG*c)m)g?9^ z=9shQn9^Qyw)ShsO(b?(931z}t5r$xX2`4gSHWh)3EL((pKDT^Bg>J7Q(t?=a}}jR zDH*eIkw-hL&NVuESh-jhgC+A=s3M)eM6r*v2jZrQHW)V$IdSbp zJ;$(?q@?>AKufDS+9hCjTWf_-&q&2>eT%h&-#OzB*FZI?gO{WF)s;Xd)9%L2D2V{C zIz%FYUpHQ10FEOz#VetU#el^x5tVz4ILDSr!yjE(d*h=h6Iq%gy0bu^J7+tUW$+Td za;5@Vz?y2g!((yx3P~yT9B~JjEGild!@tdgk*BH2#svQ%%s(Q?X>W)lAgvN7cJtP4 z-EhESra63^FX00fe}s~2PKs$}$BVG?Z)1u~WcZ46AKSC{Sp`#yVXnZ6OFU##10Xod zE%SnI#=S}B_}vhJ@2Dkb;*-^k`F1}PSe(kOSsTzWr#xhv*Cu@;VA|PPK7HeSqt&d? 
z8UE#q@Pr{#1aeHq`hT|}ApWixK>1{RyA%;4Pu1U()6ct5M` zD2dr8Q_lk`*NQ?|Y38G2dGckjfnSoHWl113YDIq=q>_zMSMuH!Uw!aL zrS6^!X@mXRs z0DW_{3gHuF1q$nleq{Y1XEHI|WbiR<0bE-()jj>AQlP<}KfVM>Ct)EJjQ>0R{@nI~ zEPy~~5ZiCvZycN+uM?fdj{bgy=~9He6-&G$=h^#G1n#{SwI%(`D1ty7gTlkbx0kJM z1kEqAe`|yD2;C#H-_T>(T+TnR2M!aFDx$QSQ&6~h7gfKf@ zc65U4bpVZYYCL8}qew1X61}KB((!)Dycx;`57FeeSC-%H$8OMQPJ1o0fL^<;X&V~o z3&SQnk{jYq)myp&8~9qpK+e90-}Vf@C0Yp+V`zco*l?>Fj)J10S&8CxeRxJ)(}^z} z)$LbeUvzP?tt&jttp&l1uVkcoauLw<_LkrabV#p#GCMh$a#CqdWv>JKcy5;w6?{kt z38%MAF2YMe_7jXWSCi#uk~w6Xu&P5>os27P ziS_l%Kw9z916L?UiFORc@yN9Z>6!Xd*q>XW-=0r?UCD!DV{wwwZ4B$-h(adaMGT)(Ji7A{n|_HJ9FksSW4eqgN9G5*O*Y(DESF=|V;NH+bAI`7-b__cXc7 zq7_c>smx+f6{Xeq9Qw0dn1kYr z$S*i4trU(5ERS@9rKg<*HY7Ss!;`O5@8|F=SM*1GdUyJauwD4>9Uz^?4M)T5F@y@CS+y?w#eV2A@rLfNfDV z+1e!o@<=udBm|C8yJt^H=(pllM_-ldl%zbDYU&O?<*9>B&)+3xI+14|ae$8;l9r&$ zTcsLrUMPS^_>c_+J1-!Q8Q2OYEp_rkyBO#y$`koGx{K{Lfa{Geb=}@S_I2er-eg zm&JH8cU>s_k8IVUuZ0i;IYR^yD$)NOdPWWo|BF0G$o;FO$Gx#%_*nsgxg$eaBI2`E zMhPf2me#naG=_?EHCHGw$*!gjEsZ^xQLG!AoO7LzB4g7R&M7=X%V6@9Hw~)ja#9Y5 z1|!y9-pdLeld_6L1mb(6vbvXcVaP1*Mkrdvi*Dz9!2F9qY**c!!XiZ zfwQ43^VGkz;LQs>sAz6uQhs>W0ac1AM2(!uwMw2A$Ke{wyGup^R*C4#GsTtf=18~R zt^$t@D#rwY)(qJLzW4C04_AxdE!&Wvii9SE)?g6cb#4&jR=n^ zM4xxkE-NmG-PTC)S7to)-!9-{PT#^S>nGnDn7^PADc64b&jJbs%>p5lfBv@M-*4^T{XcwZpd|NK2Y;=C{@d_ppA3P6n$XNgXI?=Cwek~XL;prGsK=`G2@T>8!8s;Cy$tb_`?SB-_Urm3Ni~cY* zhg2;f_x)8o`qja&3GW{cws8M5@V}+Mzk2yKNBG0bEy3?I_%&nr)x%#|^bdOgfPe@9 p_#3DGYW~-i@XzK6B!4ph*Se@A2Mc+-000u?=MSN6ZStT0{twXIgwy~4 literal 0 HcmV?d00001 From 071ae48024c09569589c56cb248ea94065c88408 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 11 Dec 2023 12:45:18 +0000 Subject: [PATCH 59/83] feat: add `jupyter` to rstuio `docker` config --- compose/server/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compose/server/Dockerfile b/compose/server/Dockerfile index 4daf96d2..70e480c0 100644 --- a/compose/server/Dockerfile +++ b/compose/server/Dockerfile @@ -14,7 +14,7 @@ ARG DOCKER_DATA_PATH=/mnt/vmfileshare RUN /rocker_scripts/install_pandoc.sh RUN /rocker_scripts/install_quarto.sh -# RUN /rocker_scripts/install_jupyter.sh +RUN /rocker_scripts/install_jupyter.sh # Install user script # From c6855fa7981633e1bd10b575a320ae56cfbd9b37 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Wed, 27 Dec 2023 12:27:22 -0500 Subject: [PATCH 60/83] fix(doc): fix docstring formatting in utils.py and ceda_ftp_download.py --- python/data_download/ceda_ftp_download.py | 34 ++-- python/utils.py | 194 +++++++++++++--------- 2 files changed, 139 insertions(+), 89 deletions(-) diff --git a/python/data_download/ceda_ftp_download.py b/python/data_download/ceda_ftp_download.py index 5d95b66e..ab7969af 100644 --- a/python/data_download/ceda_ftp_download.py +++ b/python/data_download/ceda_ftp_download.py @@ -7,31 +7,35 @@ from pathlib import Path -def download_ftp(input, output, username, password, order): +def download_ftp(input: str, output: str, username: str, password: str, order: int) -> None: """ - Function to connect to the CEDA archive and download data. Note you need to have a user account and - provide your username and FTP password. + Function to connect to the CEDA archive and download data. + + Note + ---- + You need to have a user account and provide your username and `FTP` password. 
Parameters ---------- - input: str + input Path where the CEDA data to download is located - (e.g '/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km/tasmin/day/v20220310' - or top level folder like '/badc/ukcp18/data/land-cpm/uk/2.2km/rcp85' if you want to + (e.g `/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km/tasmin/day/v20220310` + or top level folder like `/badc/ukcp18/data/land-cpm/uk/2.2km/rcp85` if you want to download all files in all sub-directories). - output: str + output Path to save the downloaded data - sub-directories will be created automatically under the output directory. - username: str + username CEDA registered username - password: str - CEDA FPT password (obtained as explained in https://help.ceda.ac.uk/article/280-ftp) - order: int - Order in which to run download 0: default order of file from ftp server 1: reverse order 2: shuffle. - This functionality allows to run several downloads in parallel without rewriting files that are being downloaded. + password + CEDA FPT password (obtained as explained in `https://help.ceda.ac.uk/article/280-ftp`) + order + Order in which to run download - Returns - ------- + `0`: default order of file from `FTP` server + `1`: reverse order + `2`: shuffle. + This functionality allows to run several downloads in parallel without rewriting files that are being downloaded. """ # If directory doesn't exist make it diff --git a/python/utils.py b/python/utils.py index e0ea784b..eca94930 100644 --- a/python/utils.py +++ b/python/utils.py @@ -20,9 +20,17 @@ def date_to_str( ) -> str: """Return a `str` in `date_format_str` of `date_obj`. + Parameters + ---------- + date_obj + A `datetime.date` or `str` object to convert. + in_format_str + A `strftime` format `str` to convert `date_obj` from if `date_obj` is a `str`. + out_format_str + A `strftime` format `str` to convert `date_obj` to. + Examples -------- - >>> date_to_str('20100101') '20100101' >>> date_to_str(date(2010, 1, 1)) @@ -43,9 +51,21 @@ def date_range_to_str( ) -> str: """Take `start_date` and `end_date` `str` or `date` instances and return a range `str`. + Parameters + ---------- + start_date + First date in range. + end_date + Last date in range + split_str + `char` to split returned date range `str`. + in_format_str + A `strftime` format `str` to convert `start_date` from. + out_format_str + A `strftime` format `str` to convert `end_date` from. + Examples -------- - >>> date_range_to_str('20100101', '20100330') '20100101-20100330' >>> date_range_to_str(date(2010, 1, 1), '20100330') @@ -64,9 +84,13 @@ def date_range_to_str( def iter_to_tuple_strs(iter_var: Iterable[Any]) -> tuple[str, ...]: """Return a `tuple` with all components converted to `strs`. + Parameters + ---------- + iter_var + Iterable of objects that can be converted into `strs`. + Examples -------- - >>> iter_to_tuple_strs(['cat', 1, Path('a/path')]) ('cat', '1', 'a/path') @@ -79,9 +103,25 @@ def path_iterdir( ) -> Generator[Optional[Path], None, None]: """Return an `Generator` after ensuring `path` exists. + Parameters + ---------- + path + `Path` of folder to iterate through + strict + Whether to raise `FileNotFoundError` if `path` not found. + + Returns + ------- + A `Generator` of `Paths` within folder `path`. + + Raises + ------ + FileNotFoundError + Raised if `strict = True` and `path` does not exist. 
+ + Examples -------- - >>> tmp_path = getfixture('tmp_path') >>> from os import chdir >>> chdir(tmp_path) @@ -101,7 +141,6 @@ def path_iterdir( >>> example_path.unlink() >>> tuple(path_iterdir(example_path.parent)) () - """ try: yield from path.iterdir() @@ -120,27 +159,29 @@ def make_user( ) -> Path: """Make user account and copy code to that environment. - Args: - user: user and home folder name - password: login password - code_path: path to copy code from to user path - - Example: - ```pycon - >>> import os - >>> if os.geteuid() != 0: - ... pytest.skip('requires root permission to run') - >>> user_name: str = 'very_unlinkely_test_user' - >>> password: str = 'test_pass' - >>> code_path: Path = Path('/home/jovyan') - >>> make_user(user_name, password, code_path=JUPYTER_DOCKER_USER_PATH) - PosixPath('/home/very_unlinkely_test_user') - >>> Path(f'/home/{user_name}/python/conftest.py').is_file() - True - >>> rm_user(user_name) - 'very_unlinkely_test_user' - - ``` + Parameters + ---------- + user + user and home folder name + password + login password + code_path + path to copy code from to user path + + Examples + -------- + >>> import os + >>> if os.geteuid() != 0: + ... pytest.skip('requires root permission to run') + >>> user_name: str = 'very_unlinkely_test_user' + >>> password: str = 'test_pass' + >>> code_path: Path = Path('/home/jovyan') + >>> make_user(user_name, password, code_path=JUPYTER_DOCKER_USER_PATH) + PosixPath('/home/very_unlinkely_test_user') + >>> Path(f'/home/{user_name}/python/conftest.py').is_file() + True + >>> rm_user(user_name) + 'very_unlinkely_test_user' """ home_path: Path = user_home_path / user subprocess.run(f"useradd {user}", shell=True) @@ -154,23 +195,24 @@ def make_user( def rm_user(user: str, user_home_path: Path = DEBIAN_HOME_PATH) -> str: """Remove user and user home folder. - Args: - user: user and home folder name - password: login password - - Example: - ```pycon - >>> import os - >>> if os.geteuid() != 0: - ... pytest.skip('requires root permission to run') - >>> user_name: str = 'very_unlinkely_test_user' - >>> password: str = 'test_pass' - >>> make_user(user_name, password, code_path=JUPYTER_DOCKER_USER_PATH) - PosixPath('/home/very_unlinkely_test_user') - >>> rm_user(user_name) - 'very_unlinkely_test_user' - - ``` + Parameters + ---------- + user + User home folder name (usually the same as the user login name). + user_home_path + Parent path of `user` folder name. + + Examples + -------- + >>> import os + >>> if os.geteuid() != 0: + ... pytest.skip('requires root permission to run') + >>> user_name: str = 'very_unlinkely_test_user' + >>> password: str = 'test_pass' + >>> make_user(user_name, password, code_path=JUPYTER_DOCKER_USER_PATH) + PosixPath('/home/very_unlinkely_test_user') + >>> rm_user(user_name) + 'very_unlinkely_test_user' """ subprocess.run(f"userdel {user}", shell=True) rmtree(user_home_path / user) @@ -182,37 +224,41 @@ def make_users( ) -> Generator[Path, None, None]: """Load a file of usernames and passwords and to pass to make_user. - Args: - file_path: path to collumned file including user names and passwords per row - user_col: str of column name for user names - password_col: name of column name for passwords - file_reader: function to read `file_path` - **kwargs: additional parameters for to pass to `file_reader` - - Example: - ```pycon - >>> import os - >>> if os.geteuid() != 0: - ... 
pytest.skip('requires root permission to run') - >>> from pandas import read_excel - >>> code_path: Path = Path('/home/jovyan') - >>> def excel_row_iter(path: Path, **kwargs) -> dict: - ... df: DataFrame = read_excel(path, **kwargs) - ... return df.to_dict(orient="records") - >>> test_accounts_path: Path = Path('tests/test_user_accounts.xlsx') - >>> user_paths: tuple[Path, ...] = tuple(make_users( - ... file_path=test_accounts_path, - ... user_col="User Name", - ... password_col="Password", - ... file_reader=excel_row_iter, - ... code_path=JUPYTER_DOCKER_USER_PATH, - ... )) - >>> [(path / 'python' / 'conftest.py').is_file() for path in user_paths] - [True, True, True, True, True] - >>> [rm_user(user_path.name) for user_path in user_paths] - ['sally', 'george', 'jean', 'felicity', 'frank'] - - ``` + Parameters + ---------- + file_path + `Path` to collumned file including user names and passwords per row. + user_col + `str` of column name for user names. + password_col + `str` of column name for passwords. + file_reader + Callable (function) to read `file_path`. + **kwargs + Additional parameters for to pass to `file_reader` function. + + Examples + -------- + >>> import os + >>> if os.geteuid() != 0: + ... pytest.skip('requires root permission to run') + >>> from pandas import read_excel + >>> code_path: Path = Path('/home/jovyan') + >>> def excel_row_iter(path: Path, **kwargs) -> dict: + ... df: DataFrame = read_excel(path, **kwargs) + ... return df.to_dict(orient="records") + >>> test_accounts_path: Path = Path('tests/test_user_accounts.xlsx') + >>> user_paths: tuple[Path, ...] = tuple(make_users( + ... file_path=test_accounts_path, + ... user_col="User Name", + ... password_col="Password", + ... file_reader=excel_row_iter, + ... code_path=JUPYTER_DOCKER_USER_PATH, + ... 
)) + >>> [(path / 'python' / 'conftest.py').is_file() for path in user_paths] + [True, True, True, True, True] + >>> [rm_user(user_path.name) for user_path in user_paths] + ['sally', 'george', 'jean', 'felicity', 'frank'] """ for record in file_reader(file_path): yield make_user(user=record[user_col], password=record[password_col], **kwargs) From 3c1247947d59f81d7d669087f83ef4ea060f54ad Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Tue, 16 Jan 2024 10:17:50 -0500 Subject: [PATCH 61/83] fix(ci): add jupyter_hub config --- .pre-commit-config.yaml | 6 +-- _quarto.yml | 3 ++ compose.yml | 8 ++-- compose/jupyter-hub.yml | 46 +++++++++++++++++++++++ compose/server/jupyterhub_config.py | 58 +++++++++++++++++++++++++++++ 5 files changed, 114 insertions(+), 7 deletions(-) create mode 100644 compose/jupyter-hub.yml create mode 100644 compose/server/jupyterhub_config.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4ac55731..daf87f52 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ exclude: "data" repos: - repo: https://github.com/psf/black - rev: "23.9.1" + rev: "23.12.1" hooks: - id: black-jupyter @@ -27,13 +27,13 @@ repos: - id: rst-inline-touching-normal - repo: https://github.com/hadialqattan/pycln - rev: v2.2.2 + rev: v2.4.0 hooks: - id: pycln args: ["python/"] - repo: https://github.com/pycqa/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort name: isort (python) diff --git a/_quarto.yml b/_quarto.yml index 97a7148e..a3906dd4 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -14,6 +14,7 @@ project: - "docs/reference" - "docs/pipeline.qmd" - "docs/contributing.md" + - "docs/pipeline_guidance.md" - "python/README.md" # - "notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd" # Requires dataset mounted to run notebook @@ -36,6 +37,8 @@ website: - text: "Summary" href: "README.md" - text: "Pipeline" + href: "docs/pipeline_guidance.md" + - text: "Pipeline Diagram" href: "docs/pipeline.qmd" - text: "Contributing" href: "docs/contributing.md" diff --git a/compose.yml b/compose.yml index 1de2bdbe..f76757f6 100644 --- a/compose.yml +++ b/compose.yml @@ -9,8 +9,8 @@ services: target: clim-recal-base ports: - "8888:8888" - # volumes: - # - .:/home/jovyan:rw + volumes: + - .:/home/jovyan:rw docs: build: @@ -19,7 +19,7 @@ services: ports: - "8080:80" volumes: - - .:/home/jovyan + - .:/home/jovyan:rw rstudio: build: @@ -28,4 +28,4 @@ services: ports: - "8787:8787" volumes: - - .:/home/rstudio + - .:/home/rstudio:rw diff --git a/compose/jupyter-hub.yml b/compose/jupyter-hub.yml new file mode 100644 index 00000000..0ef4983e --- /dev/null +++ b/compose/jupyter-hub.yml @@ -0,0 +1,46 @@ +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# JupyterHub docker compose configuration file +version: "3.8" + +services: + jupyter: + build: + context: . 
+ dockerfile: ./server/Dockerfile.jupyterhub + args: + JUPYTERHUB_VERSION: latest + # restart: always + image: jupyterhub + container_name: jupyterhub + networks: + - jupyterhub-network + volumes: + # Mount the local repo, including the `R` Components + - ..:/home/jovyan:rw + # The JupyterHub configuration file + - "./server/jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py:ro" + # Bind Docker socket on the host so we can connect to the daemon from + # within the container + - "/var/run/docker.sock:/var/run/docker.sock:rw" + # Bind Docker volume on host for JupyterHub database and cookie secrets + - "jupyterhub-data:/data" + ports: + - "8888:8000" + environment: + # This username will be a JupyterHub admin + JUPYTERHUB_ADMIN: admin + # All containers will join this network + DOCKER_NETWORK_NAME: jupyterhub-network + # JupyterHub will spawn this Notebook image for users + DOCKER_NOTEBOOK_IMAGE: quay.io/jupyter/base-notebook:latest + # Notebook directory inside user image + DOCKER_NOTEBOOK_DIR: /home/jovyan/work + +volumes: + jupyterhub-data: + +networks: + jupyterhub-network: + name: jupyterhub-network diff --git a/compose/server/jupyterhub_config.py b/compose/server/jupyterhub_config.py new file mode 100644 index 00000000..028f65cb --- /dev/null +++ b/compose/server/jupyterhub_config.py @@ -0,0 +1,58 @@ +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Configuration file for JupyterHub +import os + +c = get_config() # noqa: F821 + +# We rely on environment variables to configure JupyterHub so that we +# avoid having to rebuild the JupyterHub container every time we change a +# configuration parameter. + +# Spawn single-user servers as Docker containers +c.JupyterHub.spawner_class = "dockerspawner.DockerSpawner" + +# Spawn containers from this image +c.DockerSpawner.image = os.environ["DOCKER_NOTEBOOK_IMAGE"] + +# Connect containers to this Docker network +network_name = os.environ["DOCKER_NETWORK_NAME"] +c.DockerSpawner.use_internal_ip = True +c.DockerSpawner.network_name = network_name + +# Explicitly set notebook directory because we'll be mounting a volume to it. +# Most `jupyter/docker-stacks` *-notebook images run the Notebook server as +# user `jovyan`, and set the notebook directory to `/home/jovyan/work`. +# We follow the same convention. 
+notebook_dir = os.environ.get("DOCKER_NOTEBOOK_DIR", "/home/jovyan/work") +c.DockerSpawner.notebook_dir = notebook_dir + +# Mount the real user's Docker volume on the host to the notebook user's +# notebook directory in the container +c.DockerSpawner.volumes = {"jupyterhub-user-{username}": notebook_dir} + +# Remove containers once they are stopped +c.DockerSpawner.remove = True + +# For debugging arguments passed to spawned containers +c.DockerSpawner.debug = True + +# User containers will access hub by container name on the Docker network +c.JupyterHub.hub_ip = "jupyterhub" +c.JupyterHub.hub_port = 8080 + +# Persist hub data on volume mounted inside container +c.JupyterHub.cookie_secret_file = "/data/jupyterhub_cookie_secret" +c.JupyterHub.db_url = "sqlite:////data/jupyterhub.sqlite" + +# Authenticate users with Native Authenticator +c.JupyterHub.authenticator_class = "nativeauthenticator.NativeAuthenticator" + +# Allow anyone to sign-up without approval +c.NativeAuthenticator.open_signup = True + +# Allowed admins +admin = os.environ.get("JUPYTERHUB_ADMIN") +if admin: + c.Authenticator.admin_users = [admin] From 73f4a2d1b7954f9b242496e4e884ae55293c29bf Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Thu, 18 Jan 2024 12:09:40 -0500 Subject: [PATCH 62/83] fix(ci): correct the port for default compose Jupyter --- compose.yml | 2 +- compose/{ => jupyterhub}/jupyter-hub.yml | 0 compose/{server => jupyterhub}/jupyterhub_config.py | 0 compose/{server => rstudio}/Dockerfile | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename compose/{ => jupyterhub}/jupyter-hub.yml (100%) rename compose/{server => jupyterhub}/jupyterhub_config.py (100%) rename compose/{server => rstudio}/Dockerfile (100%) diff --git a/compose.yml b/compose.yml index f76757f6..b104cd82 100644 --- a/compose.yml +++ b/compose.yml @@ -24,7 +24,7 @@ services: rstudio: build: context: . 
- dockerfile: ./compose/server/Dockerfile + dockerfile: ./compose/rstudio/Dockerfile ports: - "8787:8787" volumes: diff --git a/compose/jupyter-hub.yml b/compose/jupyterhub/jupyter-hub.yml similarity index 100% rename from compose/jupyter-hub.yml rename to compose/jupyterhub/jupyter-hub.yml diff --git a/compose/server/jupyterhub_config.py b/compose/jupyterhub/jupyterhub_config.py similarity index 100% rename from compose/server/jupyterhub_config.py rename to compose/jupyterhub/jupyterhub_config.py diff --git a/compose/server/Dockerfile b/compose/rstudio/Dockerfile similarity index 100% rename from compose/server/Dockerfile rename to compose/rstudio/Dockerfile From eab98acffbb8ea63433eafb481a47fa84a87756d Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Thu, 18 Jan 2024 13:55:45 -0500 Subject: [PATCH 63/83] fix(ci): fix `quarto` render without quartodoc and better speed --- compose/docs/Dockerfile | 27 +++++++++++++++++++++------ environment.yml | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/compose/docs/Dockerfile b/compose/docs/Dockerfile index a840dddc..656d736c 100644 --- a/compose/docs/Dockerfile +++ b/compose/docs/Dockerfile @@ -1,9 +1,10 @@ ARG QUARTO_VERSION="1.3.450" -FROM ghcr.io/quarto-dev/quarto:${QUARTO_VERSION} AS builder +# FROM ghcr.io/quarto-dev/quarto:${QUARTO_VERSION} AS builder +FROM registry.gitlab.com/quarto-forge/docker/polyglot as builder ARG PORT=8080 -ARG py_ver=3.9 +ARG py_ver=3.10 ENV DEBIAN_FRONTEND=noninteractive # ARG RIG_VERSION="latest" @@ -22,11 +23,25 @@ WORKDIR /app EXPOSE ${PORT}:${PORT} # RUN quarto preview --port ${PORT}:${PORT} -RUN apt-get update && apt-get install -y python${py_ver} python3-pip r-base r-base-dev -RUN pip3 install quartodoc && quartodoc build -RUN Rscript -e 'install.packages("rmarkdown", repos="https://cloud.r-project.org")' +# RUN apt-get update && apt-get install -y python${py_ver} python3-pip r-base r-base-dev +# RUN micromamba install -y -n base quartodoc && quartodoc build -RUN quarto render +RUN micromamba config append channels conda-forge && \ + micromamba install -y -n base quartodoc + + +USER root + +# RUN micromamba run -n base quartodoc build && \ +# micromamba run -n base quarto render + +RUN micromamba run -n base quarto render + +# RUN quartodoc build + +# RUN Rscript -e 'install.packages("rmarkdown", repos="https://cloud.r-project.org")' + +# RUN micromamba run -n base quarto render FROM httpd:alpine COPY --from=builder /app/_site/ /usr/local/apache2/htdocs/ diff --git a/environment.yml b/environment.yml index a44857e3..312016d5 100644 --- a/environment.yml +++ b/environment.yml @@ -50,7 +50,7 @@ dependencies: - pytest-cov==4.1.0 - pytest-sugar==0.9.7 - pytz==2022.5 - - quartodoc==0.6.3 + - quartodoc==0.7.2 - rasterio==1.3.3 - rioxarray==0.12.3 - scipy==1.10.0 From 3b42e2f795316b5dc4209f937c3254477e6ed448 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 19 Jan 2024 00:22:12 -0500 Subject: [PATCH 64/83] fix(ci): remove echo command in pytest CI --- .github/workflows/ci.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 00b782b0..b60a7420 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -62,7 +62,6 @@ jobs: docker compose exec -u 0 jupyter bash -c "conda run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest" export JUPYTER_ID=$(docker compose ps -q jupyter) echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV - echo "jupyter_id=$JUPYTER_ID" - name: Check accessing saved jupyter_id run: | From 
ecb21df074988ea996fa5bb0c13bb2ae2b429f8b Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Thu, 25 Jan 2024 08:42:10 +0000 Subject: [PATCH 65/83] fix(ci): attempt CI test refactor via miniconda --- .github/workflows/ci.yaml | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b60a7420..d0a19a08 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -5,6 +5,7 @@ env: DOCKER_BUILDKIT: 1 COMPOSE_DOCKER_CLI_BUILD: 1 CONDA_ENV_NAME: clim-recal + CONDA_ENV_PATH: environment.yml MIN_PYTHON_VERSION: 3.9 PYTHON_MODULE_FOLDER: /home/jovyan/python/ COVERAGE_SVG_FOLDER: docs/assets/ @@ -55,21 +56,40 @@ jobs: - name: Checkout Code Repository uses: actions/checkout@main - - name: Build, Test and Save Test Coverage + - name: Build Conda Environment + uses: conda-incubator/setup-miniconda@v3 + with: + activate-environment: ${{ env.CONDA_ENV_NAME }} + environment-file: ${{ env.CONDA_ENV_PATH }} + auto-activate-base: false + miniforge-version: latest + + - name: Test and Save Test Coverage + shell: bash -el {0} run: | - docker compose build jupyter - docker compose up jupyter --detach - docker compose exec -u 0 jupyter bash -c "conda run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest" + mamba run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest export JUPYTER_ID=$(docker compose ps -q jupyter) echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV + # - name: Build, Test and Save Test Coverage + # run: | + # docker compose build jupyter + # docker compose up jupyter --detach + # docker compose exec -u 0 jupyter bash -c "conda run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest" + # export JUPYTER_ID=$(docker compose ps -q jupyter) + # echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV + - name: Check accessing saved jupyter_id run: | echo ${{ env.jupyter_id }} + # - name: Copy test coverage results + # run: | + # docker cp ${{ env.jupyter_id }}:${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} + - name: Copy test coverage results run: | - docker cp ${{ env.jupyter_id }}:${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} + cp ${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} - name: Archive coverage svg uses: actions/upload-artifact@v3 @@ -77,8 +97,8 @@ jobs: name: coverage-badge path: ${{ env.COVERAGE_SVG_PATH }} - - name: Tear down the Stack - run: docker compose down + # - name: Tear down the Stack + # run: docker compose down docs: needs: [linter, pytest] From d7fbfa6b433c0d2fe25e4a34ff870016fe21ad82 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 26 Jan 2024 11:33:10 +0000 Subject: [PATCH 66/83] fix(ci): remove svg coverage render to test doc render workflow --- .github/workflows/ci.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d0a19a08..0d2bd417 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -87,15 +87,15 @@ jobs: # run: | # docker cp ${{ env.jupyter_id }}:${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} - - name: Copy test coverage results - run: | - cp ${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} - - - name: Archive coverage svg - uses: actions/upload-artifact@v3 - with: - name: coverage-badge - path: ${{ env.COVERAGE_SVG_PATH }} + # - 
name: Copy test coverage results + # run: | + # cp ${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} + # + # - name: Archive coverage svg + # uses: actions/upload-artifact@v3 + # with: + # name: coverage-badge + # path: ${{ env.COVERAGE_SVG_PATH }} # - name: Tear down the Stack # run: docker compose down From d9947e245f860a8b53f7f18da9a1380d2f80cbcf Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 26 Jan 2024 11:40:03 +0000 Subject: [PATCH 67/83] fix(ci): drop svg doc render for testing --- .github/workflows/ci.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0d2bd417..6e743878 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -106,11 +106,11 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Download coverage svg - uses: actions/download-artifact@v3 - with: - name: coverage-badge - path: ${{ env.GH_PAGE_PATH }}${{ env.COVERAGE_SVG_PATH }} + # - name: Download coverage svg + # uses: actions/download-artifact@v3 + # with: + # name: coverage-badge + # path: ${{ env.GH_PAGE_PATH }}${{ env.COVERAGE_SVG_PATH }} # Other options for documentation build for future testing outside docker # - name: Set up conda environment From ced234ced9c38fb258eb8f6a6d259f594d646c43 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 26 Jan 2024 15:29:27 +0000 Subject: [PATCH 68/83] fix(doc): improve comments in rstudio/Dockerfile --- compose/rstudio/Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compose/rstudio/Dockerfile b/compose/rstudio/Dockerfile index 70e480c0..67a99a7b 100644 --- a/compose/rstudio/Dockerfile +++ b/compose/rstudio/Dockerfile @@ -6,6 +6,7 @@ ARG HOST_DATA_PATH=/Volumes/vmfileshare # The local_data_path is an absolute path to mount ClimateData within `docker` ARG DOCKER_DATA_PATH=/mnt/vmfileshare +# Potential library versions/defaults to customise in future # ENV S6_VERSION=v2.1.0.2 # ENV RSTUDIO_VERSION=2023.09.1+494 # ENV DEFAULT_USER=rstudio @@ -16,23 +17,22 @@ RUN /rocker_scripts/install_pandoc.sh RUN /rocker_scripts/install_quarto.sh RUN /rocker_scripts/install_jupyter.sh -# Install user script -# - # Maybe install necessary geo packages for terra following https://github.com/rspatial/terra/issues/248 RUN apt-get update && apt-get -y install libudunits2-dev libgdal-dev libgeos-dev libproj-dev \ gdal-bin python3-gdal libgdal-dev build-essential # Python specific deps -# Then install + +# For future: add dependencies to include to speed up notebook deploy, including # library(ggplot2) # library(terra) # library(tmap) #pretty maps # library(RColorBrewer) # library(tidyverse) # library(kableExtra) - -# Install # install.packages("hydroGOF") + COPY . 
/home/rstudio/ + +# If using outside docker compose, port may need to be exposed # EXPOSE 8787 CMD ["/init"] From ef05e112a34bb003e2549f284eb9919ec3d2f508 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 26 Jan 2024 23:50:08 +0000 Subject: [PATCH 69/83] feat(doc): add docker docs --- _quarto.yml | 3 +++ python/.pytest.ini | 2 +- python/utils.py | 39 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/_quarto.yml b/_quarto.yml index a3906dd4..a5559d08 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -15,6 +15,7 @@ project: - "docs/pipeline.qmd" - "docs/contributing.md" - "docs/pipeline_guidance.md" + - "docs/docker-configurations.qmd" - "python/README.md" # - "notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd" # Requires dataset mounted to run notebook @@ -62,6 +63,8 @@ website: text: "download_ftp" - href: "docs/reference/utils.qmd" text: "Utilities" + - text: "Docker" + href: "docs/docker-configurations.qmd" quartodoc: diff --git a/python/.pytest.ini b/python/.pytest.ini index 90ba61db..8b80b6e0 100644 --- a/python/.pytest.ini +++ b/python/.pytest.ini @@ -10,7 +10,7 @@ addopts = -ra -q --cov=. --cov-report=term:skip-covered -doctest_optionflags = ELLIPSIS +doctest_optionflags = ELLIPSIS NORMALIZE_WHITESPACE pythonpath = . testpaths = diff --git a/python/utils.py b/python/utils.py index eca94930..29f850e1 100644 --- a/python/utils.py +++ b/python/utils.py @@ -1,5 +1,6 @@ """Utility functions.""" import subprocess +from csv import DictReader from datetime import date, datetime from pathlib import Path from shutil import rmtree @@ -219,10 +220,46 @@ def rm_user(user: str, user_home_path: Path = DEBIAN_HOME_PATH) -> str: return user +def csv_reader(path: Path, **kwargs) -> Generator[dict[str, str], None, None]: + """Yield a `dict` per for from file `path`. + + Parameters + ---------- + path + `CSV` file `Path`. + **kwargs + Additional parameters for `csv.DictReader`. + + Yields + ------ + A `dict` per row from `path`. + + Examples + -------- + >>> import csv + >>> csv_path: Path = 'test_auth.csv' + >>> auth_dict: dict[str, str] = { + ... 'sally': 'fig*new£kid', + ... 'george': 'tee&iguana*sky', + ... 'susan': 'history!bill-walk',} + >>> field_names: tuple[str, str] = ('user_name', 'password') + >>> with open(csv_path, 'w') as csv_file: + ... writer = csv.writer(csv_file) + ... line_num: int = writer.writerow(('user_name', 'password')) + ... for user_name, password in auth_dict.items(): + ... line_num = writer.writerow((user_name, password)) + >>> tuple(csv_reader(csv_path)) + ({'user_name': 'sally', 'password': 'fig*new£kid'}, {'user_name': 'george', 'password': 'tee&iguana*sky'}, {'user_name': 'susan', 'password': 'history!bill-walk'}) + """ + with open(path) as csv_file: + for row in DictReader(csv_file, **kwargs): + yield row + + def make_users( file_path: Path, user_col: str, password_col: str, file_reader: Callable, **kwargs ) -> Generator[Path, None, None]: - """Load a file of usernames and passwords and to pass to make_user. + """Load a file of usernames and passwords to pass to `make_user`. 
Parameters ---------- From bd3efadc6bb9311baf8d2cfdd77da52c33f815e4 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 26 Jan 2024 23:55:39 +0000 Subject: [PATCH 70/83] fix(test): remove commented, refactored debiased tests --- python/debiasing/debias_wrapper.py | 112 ----------------------------- python/tests/test_debiasing.py | 2 + 2 files changed, 2 insertions(+), 112 deletions(-) diff --git a/python/debiasing/debias_wrapper.py b/python/debiasing/debias_wrapper.py index 6337120c..ce57a11f 100644 --- a/python/debiasing/debias_wrapper.py +++ b/python/debiasing/debias_wrapper.py @@ -606,115 +606,3 @@ def to_cli_run_cmethods_str( processors=processors, ) ) - - -# @pytest.mark.server -# @pytest.mark.slow -# @pytest.mark.parametrize( -# -# "city, variable, run, method, calib_start, calib_end, valid_start, valid_end", -# ( -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.FIVE, -# MethodOptions.QUANTILE_DELTA_MAPPING, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.FIVE, -# MethodOptions.DELTA_METHOD, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.FIVE, -# MethodOptions.QUANTILE_MAPPING, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.FIVE, -# MethodOptions.VARIANCE_SCALING, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.SIX, -# MethodOptions.QUANTILE_DELTA_MAPPING, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.SIX, -# MethodOptions.DELTA_METHOD, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.SIX, -# MethodOptions.QUANTILE_MAPPING, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ( -# CityOptions.GLASGOW, -# VariableOptions.TASMAX, -# RunOptions.SIX, -# MethodOptions.VARIANCE_SCALING, -# date(1980, 12, 1), -# date(2010, 11, 29), -# date(2010, 11, 30), -# date(2020, 11, 30), -# ), -# ), -# ) -# def test_run_workshop(run_config, city, variable, run, method, calib_start, calib_end, valid_start, valid_end) -> None: -# """Test running generated command script via a subprocess.""" -# initial_folder: Path = Path().resolve() -# chdir(run_config.command_path) -# assert PREPROCESS_FILE_NAME in tuple(Path().iterdir()) -# preprocess_run: subprocess.CompletedProcess = subprocess.run( -# run_config.to_cli_preprocess_tuple_strs(city=city, variable=variable, run=run, calib_start=calib_start, calib_end=calib_end, valid_start=valid_start, valid_end=valid_end), -# capture_output=True, -# text=True, -# ) -# assert preprocess_run.returncode == 0 -# cmethods_run: subprocess.CompletedProcess = subprocess.run( -# run_config.to_cli_run_cmethods_tuple_strs( -# city=city, run=run, variable=variable, method=method -# ), -# capture_output=True, -# text=True, -# ) -# assert cmethods_run.returncode == 0 -# -# chdir(initial_folder) -# assert False diff --git a/python/tests/test_debiasing.py b/python/tests/test_debiasing.py index 
a08c99a0..a8cbec8d 100644 --- a/python/tests/test_debiasing.py +++ b/python/tests/test_debiasing.py @@ -111,6 +111,7 @@ def test_run(run_config, city, variable, run, method) -> None: assert cmethods_run.returncode == 0 for log_txt in ( "Loading modelled calibration data (CPM)", + # Todo: uncomment in future to check new paths # ( # f"Debiased/three.cities.cropped/{city}/{run}/{variable}/" # f"debiased_{method}_result_var" @@ -121,6 +122,7 @@ def test_run(run_config, city, variable, run, method) -> None: f"debiased_{method}_result_var" ), "Saving to", + # Todo: uncomment in future to check new paths # ( # f"Saving to {DATA_PATH_DEFAULT}/{city}/{run}/{variable}/" # f"debiased_{method}_result_var-{variable}_kind-+None_20100101_20101229.nc" From 656d4147c10abae803e3aa38b3bfe1fedbc0a37f Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 04:56:25 +0000 Subject: [PATCH 71/83] fix(ci): add coverage.svg back into pytest workflow --- .github/workflows/ci.yaml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6e743878..9613758d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -68,8 +68,8 @@ jobs: shell: bash -el {0} run: | mamba run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest - export JUPYTER_ID=$(docker compose ps -q jupyter) - echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV + # export JUPYTER_ID=$(docker compose ps -q jupyter) + # echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV # - name: Build, Test and Save Test Coverage # run: | @@ -79,23 +79,23 @@ jobs: # export JUPYTER_ID=$(docker compose ps -q jupyter) # echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV - - name: Check accessing saved jupyter_id - run: | - echo ${{ env.jupyter_id }} + # - name: Check accessing saved jupyter_id + # run: | + # echo ${{ env.jupyter_id }} # - name: Copy test coverage results # run: | # docker cp ${{ env.jupyter_id }}:${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} - # - name: Copy test coverage results - # run: | - # cp ${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} - # - # - name: Archive coverage svg - # uses: actions/upload-artifact@v3 - # with: - # name: coverage-badge - # path: ${{ env.COVERAGE_SVG_PATH }} + - name: Copy test coverage results + run: | + cp ${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} + + - name: Archive coverage svg + uses: actions/upload-artifact@v3 + with: + name: coverage-badge + path: ${{ env.COVERAGE_SVG_PATH }} # - name: Tear down the Stack # run: docker compose down From 205b268d86255ca6d02d2ef295fb8ea3e26f04b6 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 18:25:44 +0000 Subject: [PATCH 72/83] fix(ci): update PYTHON_MODULE_FOLDER and CI push/pull branches --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9613758d..77e95ab9 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,7 +7,7 @@ env: CONDA_ENV_NAME: clim-recal CONDA_ENV_PATH: environment.yml MIN_PYTHON_VERSION: 3.9 - PYTHON_MODULE_FOLDER: /home/jovyan/python/ + PYTHON_MODULE_FOLDER: python/ COVERAGE_SVG_FOLDER: docs/assets/ COVERAGE_SVG_FILE_NAME: coverage.svg # replace below with references to previous config lines From e444fbb77a94cb81aca52057281c42c2a855dfe5 Mon Sep 17 00:00:00 2001 From: Dr Griffith 
Rees Date: Sat, 27 Jan 2024 18:26:12 +0000 Subject: [PATCH 73/83] fix(ci): update CI push/pull branches --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 77e95ab9..5f7a5ec4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,11 +16,11 @@ env: on: pull_request: - branches: ['main', 'doc-deploy', 'ruth-notebook-for-workshop', 'r-docker-refactor'] + branches: ['main', 'r-docker-refactor'] paths-ignore: ['docs/**'] push: - branches: ['main', 'doc-deploy', 'ruth-notebook-for-workshop', 'r-docker-refactor'] + branches: ['main', 'r-docker-refactor'] concurrency: group: ${{ github.head_ref || github.run_id }} From 7f97bd2811993d6251058a40061f70a59d581ea8 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 18:30:43 +0000 Subject: [PATCH 74/83] feat(ci): update pre-commit-ci/lite-action to v3.0.0 --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5f7a5ec4..a2e1d0c9 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -45,7 +45,7 @@ jobs: uses: pre-commit/action@main - name: Update pre-commit - uses: pre-commit-ci/lite-action@v1.0.1 + uses: pre-commit-ci/lite-action@v3.0.0 if: always() # With no caching at all the entire ci process takes 4m 30s to complete! From 6fc4224adf6f78c70d5ccdfc2d452cf56fcd247b Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 18:41:10 +0000 Subject: [PATCH 75/83] fix(ci): set pre-commit-ci/lite to main --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a2e1d0c9..5f7a5ec4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -45,7 +45,7 @@ jobs: uses: pre-commit/action@main - name: Update pre-commit - uses: pre-commit-ci/lite-action@v3.0.0 + uses: pre-commit-ci/lite-action@v1.0.1 if: always() # With no caching at all the entire ci process takes 4m 30s to complete! From 96f656d3f471614d713673a9af1669f6137ea9a2 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 18:41:31 +0000 Subject: [PATCH 76/83] fix(ci): FIX set pre-commit-ci/lite to main --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5f7a5ec4..bdbd49d6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -45,7 +45,7 @@ jobs: uses: pre-commit/action@main - name: Update pre-commit - uses: pre-commit-ci/lite-action@v1.0.1 + uses: pre-commit-ci/lite-action@main if: always() # With no caching at all the entire ci process takes 4m 30s to complete! 
From fcfffc0e8e8f33dfb2f184f0040dbee4fd22b15d Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 18:58:47 +0000 Subject: [PATCH 77/83] fix(ci): refactor doc CI via miniconda --- .github/workflows/ci.yaml | 70 ++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bdbd49d6..0af2cb17 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -104,13 +104,13 @@ jobs: needs: [linter, pytest] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - # - name: Download coverage svg - # uses: actions/download-artifact@v3 - # with: - # name: coverage-badge - # path: ${{ env.GH_PAGE_PATH }}${{ env.COVERAGE_SVG_PATH }} + - name: Download coverage svg + uses: actions/download-artifact@v3 + with: + name: coverage-badge + path: ${{ env.GH_PAGE_PATH }}${{ env.COVERAGE_SVG_PATH }} # Other options for documentation build for future testing outside docker # - name: Set up conda environment @@ -136,18 +136,50 @@ jobs: # npm ci # npm run build --if-present # mv dist ../docs/_book/app + # jobs: + - name: Checkout Code Repository + uses: actions/checkout@main - - name: Build docker quarto - run: | - # A potentially quicker build option to try in future, requires running in detatched mode - # DOCKER_BUILDKIT=1 docker build --no-cache -f compose/docs/Dockerfile --target builder --tag 'clim-recal-docs' . - docker compose build docs - docker compose up --detach - docker cp $(docker compose ps -q docs):/usr/local/apache2/htdocs/ ${{ env.GH_PAGE_PATH }} - - - name: Publish - uses: peaceiris/actions-gh-pages@v3 + - name: Set up Quarto + uses: quarto-dev/quarto-actions/setup@v2 + + - name: Build Conda Environment + uses: conda-incubator/setup-miniconda@v3 + with: + activate-environment: ${{ env.CONDA_ENV_NAME }} + environment-file: ${{ env.CONDA_ENV_PATH }} + auto-activate-base: false + miniforge-version: latest + + - name: Build Python Docstrings + run: quartodoc build + + # - name: Install Python and Dependencies + # uses: actions/setup-python@v4 + # with: + # python-version: ${{ env.MIN_PYTHON_VERSION }} + # cache: 'pip' + # - run: pip install jupyter + # - run: pip install -r requirements.txt + + - name: Render and Publish + uses: quarto-dev/quarto-actions/publish@v2 with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ${{ env.GH_PAGE_PATH }} - keep_files: false + target: gh-pages + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + # - name: Build docker quarto + # run: | + # # A potentially quicker build option to try in future, requires running in detatched mode + # # DOCKER_BUILDKIT=1 docker build --no-cache -f compose/docs/Dockerfile --target builder --tag 'clim-recal-docs' . 
+ # docker compose build docs + # docker compose up --detach + # docker cp $(docker compose ps -q docs):/usr/local/apache2/htdocs/ ${{ env.GH_PAGE_PATH }} + # + # - name: Publish + # uses: peaceiris/actions-gh-pages@v3 + # with: + # github_token: ${{ secrets.GITHUB_TOKEN }} + # publish_dir: ${{ env.GH_PAGE_PATH }} + # keep_files: false From fc943699d570309e107506e7f52650f1175ab98e Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Sat, 27 Jan 2024 18:59:38 +0000 Subject: [PATCH 78/83] fix: add r-quarto back in environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 312016d5..288cac6e 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,7 @@ dependencies: - pip=23.3 - python=3.9 - gdal=3.3.2 - # - r-quarto=1.3 + - r-quarto=1.3 - readline=8.2 - setuptools=68.0.0 - tk=8.6.12 From 11b7eecceccd1e4211a0aad0776355689f1982bd Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Tue, 30 Jan 2024 03:36:34 +0000 Subject: [PATCH 79/83] fix(refactor): refactor `docker` deploy and improve related code and docs --- .github/workflows/ci.yaml | 94 ---------- _quarto.yml | 19 +- compose.yml | 2 +- compose/docs/Dockerfile | 47 ++--- compose/jupyter/Dockerfile | 22 +-- compose/jupyterhub/Dockerfile | 27 +++ compose/rstudio/Dockerfile | 10 +- docs/pipeline_guidance.md | 166 ------------------ docs/pipeline_guidance.qmd | 203 ++++++++++++++++++++++ environment.yml | 20 +-- python/data_download/ceda_ftp_download.py | 14 +- python/debiasing/debias_wrapper.py | 4 - python/load_data/data_loader.py | 120 ++++++++----- python/utils.py | 7 +- 14 files changed, 362 insertions(+), 393 deletions(-) create mode 100644 compose/jupyterhub/Dockerfile delete mode 100644 docs/pipeline_guidance.md create mode 100644 docs/pipeline_guidance.qmd diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0af2cb17..c9ae51e7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -48,7 +48,6 @@ jobs: uses: pre-commit-ci/lite-action@main if: always() - # With no caching at all the entire ci process takes 4m 30s to complete! 
pytest: runs-on: ubuntu-latest @@ -68,24 +67,6 @@ jobs: shell: bash -el {0} run: | mamba run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest - # export JUPYTER_ID=$(docker compose ps -q jupyter) - # echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV - - # - name: Build, Test and Save Test Coverage - # run: | - # docker compose build jupyter - # docker compose up jupyter --detach - # docker compose exec -u 0 jupyter bash -c "conda run -n ${{ env.CONDA_ENV_NAME }} --cwd python pytest" - # export JUPYTER_ID=$(docker compose ps -q jupyter) - # echo "jupyter_id=$JUPYTER_ID" >> $GITHUB_ENV - - # - name: Check accessing saved jupyter_id - # run: | - # echo ${{ env.jupyter_id }} - - # - name: Copy test coverage results - # run: | - # docker cp ${{ env.jupyter_id }}:${{ env.PYTHON_MODULE_FOLDER }}${{ env.COVERAGE_SVG_PATH }} ${{ env.COVERAGE_SVG_FOLDER }} - name: Copy test coverage results run: | @@ -97,9 +78,6 @@ jobs: name: coverage-badge path: ${{ env.COVERAGE_SVG_PATH }} - # - name: Tear down the Stack - # run: docker compose down - docs: needs: [linter, pytest] runs-on: ubuntu-latest @@ -111,75 +89,3 @@ jobs: with: name: coverage-badge path: ${{ env.GH_PAGE_PATH }}${{ env.COVERAGE_SVG_PATH }} - - # Other options for documentation build for future testing outside docker - # - name: Set up conda environment - # uses: conda-incubator/setup-miniconda@v2 - # with: - # activate-environment: environment.yml - # - # From https://github.com/r-lib/actions/tree/v2-branch/setup-r - # - name: Setup R - # uses: r-lib/actions/setup-r@v2 - - # Potentially necessary for future interactive documentation builds following uatk-spc - # - name: Install node - # uses: actions/setup-node@v2 - # with: - # node-version: 18.x - # cache: 'npm' - # cache-dependency-path: web/package-lock.json - # - # - name: Build web app - # run: | - # cd web - # npm ci - # npm run build --if-present - # mv dist ../docs/_book/app - # jobs: - - name: Checkout Code Repository - uses: actions/checkout@main - - - name: Set up Quarto - uses: quarto-dev/quarto-actions/setup@v2 - - - name: Build Conda Environment - uses: conda-incubator/setup-miniconda@v3 - with: - activate-environment: ${{ env.CONDA_ENV_NAME }} - environment-file: ${{ env.CONDA_ENV_PATH }} - auto-activate-base: false - miniforge-version: latest - - - name: Build Python Docstrings - run: quartodoc build - - # - name: Install Python and Dependencies - # uses: actions/setup-python@v4 - # with: - # python-version: ${{ env.MIN_PYTHON_VERSION }} - # cache: 'pip' - # - run: pip install jupyter - # - run: pip install -r requirements.txt - - - name: Render and Publish - uses: quarto-dev/quarto-actions/publish@v2 - with: - target: gh-pages - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - # - name: Build docker quarto - # run: | - # # A potentially quicker build option to try in future, requires running in detatched mode - # # DOCKER_BUILDKIT=1 docker build --no-cache -f compose/docs/Dockerfile --target builder --tag 'clim-recal-docs' . 
- # docker compose build docs - # docker compose up --detach - # docker cp $(docker compose ps -q docs):/usr/local/apache2/htdocs/ ${{ env.GH_PAGE_PATH }} - # - # - name: Publish - # uses: peaceiris/actions-gh-pages@v3 - # with: - # github_token: ${{ secrets.GITHUB_TOKEN }} - # publish_dir: ${{ env.GH_PAGE_PATH }} - # keep_files: false diff --git a/_quarto.yml b/_quarto.yml index a5559d08..4a76c98e 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -14,7 +14,7 @@ project: - "docs/reference" - "docs/pipeline.qmd" - "docs/contributing.md" - - "docs/pipeline_guidance.md" + - "docs/pipeline_guidance.qmd" - "docs/docker-configurations.qmd" - "python/README.md" # - "notebooks/Assessing_bc_data/MethodsAssessment_DecWorkshop.Rmd" @@ -37,10 +37,12 @@ website: contents: - text: "Summary" href: "README.md" - - text: "Pipeline" - href: "docs/pipeline_guidance.md" - - text: "Pipeline Diagram" - href: "docs/pipeline.qmd" + - section: "Pipeline" + contents: + - text: "Guidance" + href: "docs/pipeline_guidance.qmd" + - text: "Diagram" + href: "docs/pipeline.qmd" - text: "Contributing" href: "docs/contributing.md" - section: "R" @@ -60,7 +62,7 @@ website: - section: "Reference" contents: - href: "docs/reference/data_download.ceda_ftp_download.download_ftp.qmd" - text: "download_ftp" + text: "Data Download" - href: "docs/reference/utils.qmd" text: "Utilities" - text: "Docker" @@ -82,5 +84,8 @@ quartodoc: contents: # the functions being documented in the package. # you can refer to anything: class methods, modules, etc.. - - data_download.ceda_ftp_download.download_ftp + - data_download + - load_data + - resampling + - debiasing - utils diff --git a/compose.yml b/compose.yml index b104cd82..b9dfbe36 100644 --- a/compose.yml +++ b/compose.yml @@ -17,7 +17,7 @@ services: context: . dockerfile: ./compose/docs/Dockerfile ports: - - "8080:80" + - "8080:8080" volumes: - .:/home/jovyan:rw diff --git a/compose/docs/Dockerfile b/compose/docs/Dockerfile index 656d736c..17901de8 100644 --- a/compose/docs/Dockerfile +++ b/compose/docs/Dockerfile @@ -1,47 +1,32 @@ ARG QUARTO_VERSION="1.3.450" -# FROM ghcr.io/quarto-dev/quarto:${QUARTO_VERSION} AS builder FROM registry.gitlab.com/quarto-forge/docker/polyglot as builder -ARG PORT=8080 +ARG LOCAL_PORT=8080 +ARG EXTERNAL_PORT=$LOCAL_PORT + +# Setting as an ENV allows LOCAL_PORT to be used in +# runtime commandas like CMD and still be passed as +# an ARG +ENV HOST_IP=0.0.0.0 +ENV LOCAL_PORT=${LOCAL_PORT} ARG py_ver=3.10 ENV DEBIAN_FRONTEND=noninteractive -# ARG RIG_VERSION="latest" -# ARG R_VERSION="release" -# COPY install-rig.sh /tmp/install-rig.sh -# RUN bash /tmp/install-rig.sh "${RIG_VERSION}" -# RUN rig add ${R_VERSION} # && Rscript -e 'pak::pkg_install("renv")' - -# COPY mywebsite /app -# WORKDIR /app -# RUN Rscript -e "renv::restore()" -# RUN quarto render . COPY . 
/app WORKDIR /app -# RUN Rscript -e "renv::restore()" -EXPOSE ${PORT}:${PORT} - -# RUN quarto preview --port ${PORT}:${PORT} -# RUN apt-get update && apt-get install -y python${py_ver} python3-pip r-base r-base-dev -# RUN micromamba install -y -n base quartodoc && quartodoc build -RUN micromamba config append channels conda-forge && \ - micromamba install -y -n base quartodoc +# The local_data_path is an absolute local path to ClimateData on the machine hosting running `docker` +ARG HOST_DATA_PATH=/Volumes/vmfileshare +# The local_data_path is an absolute path to mount ClimateData within `docker` +ARG DOCKER_DATA_PATH=/mnt/vmfileshare USER root -# RUN micromamba run -n base quartodoc build && \ -# micromamba run -n base quarto render - -RUN micromamba run -n base quarto render - -# RUN quartodoc build - -# RUN Rscript -e 'install.packages("rmarkdown", repos="https://cloud.r-project.org")' +# Install the local python environemtn named 'clim-recal' +RUN micromamba create -f environment.yml -# RUN micromamba run -n base quarto render -FROM httpd:alpine -COPY --from=builder /app/_site/ /usr/local/apache2/htdocs/ +CMD micromamba run -n clim-recal quartodoc build && \ + micromamba run -n clim-recal quarto preview --port $LOCAL_PORT --host $HOST_IP diff --git a/compose/jupyter/Dockerfile b/compose/jupyter/Dockerfile index 3df90ba9..87cb1058 100644 --- a/compose/jupyter/Dockerfile +++ b/compose/jupyter/Dockerfile @@ -3,8 +3,11 @@ FROM quay.io/jupyter/r-notebook as clim-recal-base # This is derived from documentation available at # https://jupyter-docker-stacks.readthedocs.io/en/latest/ -# Example run command: +# The local_data_path is an absolute local path to ClimateData on the machine hosting running `docker` +ARG HOST_DATA_PATH=/Volumes/vmfileshare +# The local_data_path is an absolute path to mount ClimateData within `docker` +ARG DOCKER_DATA_PATH=/mnt/vmfileshare # This will require a mount of `vmfileshare` from `dymestorage1` # On macOS this can be solved via: @@ -32,7 +35,7 @@ ARG env_name=clim-recal # `py_ver` is not currently used below and is specified in `environment.yaml` # included here as reminder and clarity if future change needed. -ARG py_ver=3.9 +ARG py_ver=3.10 USER root @@ -62,20 +65,17 @@ RUN "${CONDA_DIR}/envs/${env_name}/bin/python" -m ipykernel install --user --nam fix-permissions "${CONDA_DIR}" && \ fix-permissions "/home/${NB_USER}" -# Copy the rest of the clim-recal code to volume -# COPY --chown=${NB_UID}:${NB_GID} . . -# Keep below while permission ambiguity persists on compose volume mounting COPY . . -# Switch to default jupyter user -# USER ${NB_UID} -# Add custom activate script to reflect environment -# USER root +RUN git config --global --add safe.directory /home/jovyan +RUN cd python/debiasing && git submodule update --init --recursive + RUN activate_custom_env_script=/usr/local/bin/before-notebook.d/activate_custom_env.sh && \ echo "#!/bin/bash" > ${activate_custom_env_script} && \ echo "eval \"$(conda shell.bash activate "${env_name}")\"" >> ${activate_custom_env_script} && \ chmod +x ${activate_custom_env_script} + RUN chown -R ${NB_UID}:${NB_GID} . 
# Switch to default jupyter user @@ -94,7 +94,3 @@ RUN echo "conda activate ${env_name}" >> "${HOME}/.bashrc" # RUN --mount=type=bind,source=$HOST_DATA_PATH,target=$DOCKER_DATA_PATH \ # find /build/jars -type f -name '*.jar' -maxdepth 1 -print0 \ # | xargs -0 --no-run-if-empty --replace=source cp --force source >"${INSTALL_PATH}/modules/" - - - -RUN cd python/debiasing && git submodule update --init --recursive diff --git a/compose/jupyterhub/Dockerfile b/compose/jupyterhub/Dockerfile new file mode 100644 index 00000000..c2b46094 --- /dev/null +++ b/compose/jupyterhub/Dockerfile @@ -0,0 +1,27 @@ +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. +ARG JUPYTERHUB_VERSION +FROM quay.io/jupyterhub/jupyterhub:$JUPYTERHUB_VERSION + +ARG HOST_DATA_PATH=/Volumes/vmfileshare + +# The local_data_path is an absolute path to mount ClimateData within `docker` +ARG DOCKER_DATA_PATH=/mnt/vmfileshare +ARG env_name=clim-recal + +# Install dockerspawner, nativeauthenticator +# hadolint ignore=DL3013 +RUN python3 -m pip install --no-cache-dir \ + dockerspawner \ + jupyterhub-nativeauthenticator + + +COPY . . +WORKDIR python/debiasing/ && git submodule update --init --recursive +WORKDIR $HOME + +CMD jupyterhub -f /srv/jupyterhub/jupyterhub_config.py + +# TODO: test uncommenting to add necessary packages in future +# RUN conda update -n base -c conda-forge conda +# RUN conda env create -f environment.yml -n clim-recal diff --git a/compose/rstudio/Dockerfile b/compose/rstudio/Dockerfile index 67a99a7b..b61d20e2 100644 --- a/compose/rstudio/Dockerfile +++ b/compose/rstudio/Dockerfile @@ -14,12 +14,14 @@ ARG DOCKER_DATA_PATH=/mnt/vmfileshare # ENV QUARTO_VERSION=default RUN /rocker_scripts/install_pandoc.sh -RUN /rocker_scripts/install_quarto.sh -RUN /rocker_scripts/install_jupyter.sh +# RUN /rocker_scripts/install_quarto.sh +# RUN /rocker_scripts/install_jupyter.sh # Maybe install necessary geo packages for terra following https://github.com/rspatial/terra/issues/248 -RUN apt-get update && apt-get -y install libudunits2-dev libgdal-dev libgeos-dev libproj-dev \ -gdal-bin python3-gdal libgdal-dev build-essential # Python specific deps +# RUN apt-get update && apt-get -y install libudunits2-dev \ +# libgeos-dev libproj-dev python3-gdal libgdal-dev \ +# build-essential +# Python specific deps # For future: add dependencies to include to speed up notebook deploy, including # library(ggplot2) diff --git a/docs/pipeline_guidance.md b/docs/pipeline_guidance.md deleted file mode 100644 index afea7f2e..00000000 --- a/docs/pipeline_guidance.md +++ /dev/null @@ -1,166 +0,0 @@ - -# Analysis pipeline guidance - -This is a detailed guide to our analysis pipeline. -*see also this [flowchart viz](https://github.com/alan-turing-institute/clim-recal/blob/documentation/docs/pipeline.md) of the pipeline* - -**Contents:** -* [Prerequisites](#prerequisites) - * [Setting up your R environment](#setting-up-your-r-environment) - * [Setting up your python environment](#setting-up-your-python-environment) -* [Downloading the data](#downloading-the-data) -* [Reprojecting the data](#reprojecting-the-data) -* [Resampling the data](#resampling-the-data) -* [Preparing the bias correction and assessment](#preparing-the-bias-correction-and-assessment) -* [Applying the bias correction](#applying-the-bias-correction) - - -### Prerequisites - -For our bias correction methods, we tap into dedicated packages in both Python and R ecosystems. 
The integration of these languages allows us to utilize the cutting-edge functionalities implemented in each. Given this dual-language nature of our analysis pipeline, we also provide preprocessing scripts written in both Python and R. To facilitate a seamless experience, users are required to set up both Python and R environments as detailed below. - -#### Setting up your R environment - -- **Download and Install R:** Visit [CRAN (The Comprehensive R Archive Network)](https://cran.r-project.org/) to download the latest version of R compatible with your operating system. Then verify successful installation via command line: - -``` -R --version -``` -- **Install Necessary R Packages:** Our analysis utilizes several R packages. You can install these packages by starting R (just type `R` in your command line and press enter) and entering the following commands in the R console: - ```R - install.packages("package1") - install.packages("package2") - #... (continue for all necessary packages) - ``` -- Replace `"package1"`, `"package2"`, etc., with the actual names of the required packages. A list of required R packages is provided in the 'R_Package_Requirements.txt' file. - -#### Setting up your python environment - -For your python environment, we provide an Anaconda environment file for ease-of-use. -``` -conda env create -f environment.yml -``` - -> **Warning**: -> To reproduce our exact outputs, you will require GDAL version 3.4. Please be aware that this specific version of GDAL requires a different Python version than the one specified in our environment file. Therefore, we have not included it in the environment file and instead, for the reprojection step, you'll need to install GDAL (for example using conda) and set up a new environment: -> ``` -> conda install -c conda-forge gdal -> conda create -n gdal_env python=3.10 gdal=3.4 -> ``` - -In order to paralellize the reprojection step, we make use of the [GNU parallel shell tool](https://www.gnu.org/software/parallel/). - -``` -parallel --version -``` - -#### The cmethods library - -This repository contains a python script used to run debiasing in climate data using a fork of the [original python-cmethods](https://github.com/btschwertfeger/python-cmethods) module written by Benjamin Thomas Schwertfeger's , which has -been modified to function with the dataset used in the clim-recal project. This library has been included as a -submodule to this project, so you must run the following command to pull the submodules required. - -``` -git submodule update --init --recursive -``` - -#### Downloading the data - -**Climate data** - -This streamlined pipeline is designed for raw data provided by the Met Office, accessible through the [CEDA archive]((https://catalogue.ceda.ac.uk/uuid/ad2ac0ddd3f34210b0d6e19bfc335539)). It utilizes [UKCP](https://data.ceda.ac.uk/badc/ukcp18/data/land-cpm/uk/2.2km) control, scenario data at 2.2km resolution, and [HADs](https://data.ceda.ac.uk/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km) observational data. For those unfamiliar with this data, refer to our [the dataset](#the-dataset) section. - -To access the data,[register here]((https://archive.ceda.ac.uk/)) at the CEDA archive and configure your FTP credentials in "My Account". Utilize our [ceda_ftp_download.py](python/data_download/) script to download the data. 
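The shell commands below are the usual route. If you would rather drive the download from a Python session, a minimal sketch is given here; it assumes the `download_ftp(input, output, username, password, order)` signature from `python/data_download/ceda_ftp_download.py` and that `python/` is on your import path — the CEDA path, output directory and credentials are placeholders to replace with your own.

```python
# Minimal sketch: calling the CEDA download helper from Python instead of the
# shell. Assumes `python/` is on the import path so `data_download` resolves;
# the output directory and credentials below are placeholders.
from data_download.ceda_ftp_download import download_ftp

download_ftp(
    input="/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km",
    output="output_dir",  # local directory to write the files into
    username="uuu",       # your CEDA username
    password="ppp",       # your CEDA FTP password
    order=0,              # 0: default order, 1: reverse order, 2: shuffle
)
```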
- -``` -# cpm data -python3 ceda_ftp_download.py --input /badc/ukcp18/data/land-cpm/uk/2.2km/rcp85/ --output 'output_dir' --username 'uuu' --psw 'ppp' --change_hierarchy - -# hads data -python3 ceda_ftp_download.py --input /badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km --output 'output_dir' --username 'uuu' --psw 'ppp' -``` -You need to replace `uuu` and `ppp` with your CEDA username and FTP password respectively and replace `output_dir` with the directory you want to write the data to. - -The `--change_hierarchy` flag modifies the folder hierarchy to fit with the hierarchy in the Turing Azure file store. This flag only applies to the UKCP data and should not be used with HADs data. You can use the same script without the `--change_hierarchy` flag in order to download files without any changes to the hierarchy. - -> 📢 If you are an internal collaborator you can access the raw data as well as intermediate steps through our Azure server. [See here for a How-to](). - -**Geospatial data** - -In addition to the climate data we use geospatial data to divide the data into smaller chunks. Specifically we use the following datasets for city boundaries: - -- Scottish localities boundaries for cropping out Glasgow. Downloaded from [nrscotland.gov.uk](https://www.nrscotland.gov.uk/statistics-and-data/geography/our-products/settlements-and-localities-dataset/settlements-and-localities-digital-boundaries) on 1st Aug 2023 - -- Major Towns and Cities boundaries for cropping out Manchester. Downloaded from [https://geoportal.statistics.gov.uk/](https://geoportal.statistics.gov.uk/datasets/980da620a0264647bd679642f96b42c1/explore) - - -### Reprojecting the data -The HADs data and the UKCP projections have different resolution and coordinate system. For example the HADs dataset uses the British National Grid coordinate system. - -The first step in our analysis pipeline is to reproject the UKCP datasets to the British National Grid coordinate system. For this purpose, we utilize the Geospatial Data Abstraction Library (GDAL), designed for reading and writing raster and vector geospatial data formats. - -> **Warning**: -> Note that, to reproduce our exact pipeline, we switch environments here as explained in the requirements. -> ``` -> conda activate gdal_env -> ``` - -To execute the reprojection in parallel fashion, run the `reproject_all.sh` script from your shell. As an input to the script replace `path_to_netcdf_files` with the path to the raw netCDF files. - -```bash -cd bash -sh reproject_all.sh path_to_netcdf_files -``` - -### Resampling the data - -Resample the HADsUK dataset from 1km to 2.2km grid to match the UKCP reprojected grid. We run the resampling python script specifying the `--input` location of the reprojected files from the previous step, the UKCP `--grid` file an the `--output` location for saving the resampled files. - -``` -# switch to main environment -conda activate clim-recal - -# run resampling -cd ../python/resampling -python resampling_hads.py --input path_to_reprojected --grid path_to_grid_file --output path_to_resampled -``` -### Preparing the bias correction and assessment - -**Spatial cropping** -Because the bias correction process is computationally intensive, handling large datasets can be challenging and time-consuming. Therefore, to make the pipeline more manageable and efficient, it is recommended to split the data into smaller subsets. For the purposes of our example pipeline, we've opted for reducing the data to individual city boundaries. 
To crop you need to adjust the paths in `Cropping_Rasters_to_three_cities.R` script to fit 1your own directory sturcture. The cropping is implemented in the `cpm_read_crop` and `hads_read_crop` functions. - -``` -Rscript Cropping_Rasters_to_three_cities.R -``` -**calibration-validation data split** -For the purpose of assessing our bias correction, we then split our data, both the projection as well as the ground-truth observations by dates. In this example here we calibrate the bias correction based on the years 1981 to 1983. We then use data from year 2010 to validate the quality of the bias correction. You need to replace `path_to_cropped` with the path where the data from the previous cropping step was saved and `path_to_preprocessed` with the output directory you choose. You can leave the `-v` and `-r` flags as specified below or choose another metric and run if you prefer. - -``` -cd ../debiasing -python preprocess_data.py --mod path_to_cropped --obs path_to_cropped -v tasmax -r '05' --out path_to_preprocessed --calib_dates 19810101-19831230 --valid_dates 20100101-20101230 -``` - -The preprocess_data.py script also aligns the calendars of the historical simulation data and observed data, ensuring that they have the same time dimension and checks that the observed and simulated historical data have the same dimensions. - -> **Note**: -> preprocess_data.py makes use of our custom data loader functions. In [`data_loader/`](python/load_data/data_loader.py) we have written a few functions for loading and concatenating data into a single xarray which can be used for running debiasing methods. Instructions in how to use these functions can be found in python/notebooks/load_data_python.ipynb. - - -### Applying the bias correction -Note: By March 2023 we have only implemented the [python-cmethods](https://github.com/alan-turing-institute/python-cmethods) library. - - -The [run_cmethods.py](../debiasing/run_cmethods.py) allow us to adjusts climate biases in climate data using the python-cmethods library. It takes as input observation data (HADs data), control data (historical UKCP data), and scenario data (future UKCP data), -and applies a correction method to the scenario data. The resulting output is saved as a `.nc` to a specified directory. The script will also produce a time-series and a map plot of the debiased data. To run this you need to replace `path_to_validation_data` with the output directories of the previous step and specify `path_to_corrected_data` as your output directory for the bias corrected data. You can also specify your preferred `bias_correction_method` (e.g. quantile_delta_mapping). - -``` -python3 run_cmethods.py --input_data_folder path_to_validation_data --out path_to_corrected_data --method bias_correction_method --v 'tas' -``` - -The run_cmethods.py script loops over the time periods and applies debiasing in periods of 10 years in the following steps: - - Loads the scenario data for the current time period. - - Applies the specified correction method to the scenario data. - - Saves the resulting output to the specified directory. - - Creates diagnotic figues of the output dataset (time series and time dependent maps) and saves it into the specified directory. - -For each 10 year time period it will produce an `.nc` output file with the adjusted data and a time-series plot and a time dependent map plot of the adjusted data. 
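To make the 10-year chunking described above concrete, the sketch below slices a scenario cube into decades with `xarray` and applies a stand-in correction to each window before saving it. It is illustrative only — synthetic data and a no-op `correct` function — and is not the implementation used in `run_cmethods.py`.

```python
import numpy as np
import pandas as pd
import xarray as xr

# Synthetic stand-in for a scenario cube with a daily time axis
time = pd.date_range("2020-01-01", "2049-12-31", freq="D")
scenario = xr.Dataset(
    {"tasmax": (("time", "y", "x"), np.random.rand(time.size, 4, 4))},
    coords={"time": time},
)

def correct(window: xr.Dataset) -> xr.Dataset:
    """Stand-in for a bias correction method (e.g. quantile delta mapping)."""
    return window

# Debias and save one 10-year period at a time, as described above
for start in range(2020, 2050, 10):
    window = scenario.sel(time=slice(f"{start}-01-01", f"{start + 9}-12-31"))
    corrected = correct(window)
    corrected.to_netcdf(f"debiased_tasmax_{start}-{start + 9}.nc")
```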
diff --git a/docs/pipeline_guidance.qmd b/docs/pipeline_guidance.qmd
new file mode 100644
index 00000000..0a068a5b
--- /dev/null
+++ b/docs/pipeline_guidance.qmd
@@ -0,0 +1,203 @@
+---
+title: Analysis pipeline guidance
+format:
+  html:
+    toc: true
+execute:
+  echo: true
+---
+This is a detailed guide to our analysis pipeline.
+*see also this [flowchart viz](https://github.com/alan-turing-institute/clim-recal/blob/documentation/docs/pipeline.md) of the pipeline*
+
+## Prerequisites
+
+For our bias correction methods, we tap into dedicated packages in both Python and R ecosystems. The integration of these languages allows us to utilize the cutting-edge functionalities implemented in each.
+
+Given this dual-language nature of our analysis pipeline, we also provide preprocessing scripts written in both Python and R. To facilitate a seamless experience, users are required to set up both Python and R environments as detailed below.
+
+### Setting up your R environment
+
+#### Download and Install R
+
+Visit [CRAN (The Comprehensive R Archive Network)](https://cran.r-project.org/) to download the latest version of R compatible with your operating system. Then verify successful installation via command line:
+
+```{bash}
+R --version
+```
+
+#### Install Necessary R Packages
+
+Our analysis utilizes several R packages. You can install these packages by starting R (just type `R` in your command line and press enter) and entering the following commands in the R console:
+
+```{R}
+#| eval: false
+
+install.packages("package1")
+install.packages("package2")
+#... (continue for all necessary packages)
+```
+
+Replace `"package1"`, `"package2"`, etc., with the actual names of the required packages. A list of required R packages is provided in the 'R_Package_Requirements.txt' file.
+
+### Setting up your python environment
+
+For your python environment, we provide an Anaconda environment file for ease-of-use.
+```{bash}
+#| eval: false
+
+conda env create -f environment.yml
+```
+
+::: {.callout-warning}
+To reproduce our exact outputs, you will require GDAL version 3.4. Please be aware that this specific version of GDAL requires a different Python version than the one specified in our environment file. Therefore, we have not included it in the environment file and instead, for the reprojection step, you'll need to install GDAL (for example using conda) and set up a new environment:
+```{bash}
+#| eval: false
+
+conda install -c conda-forge gdal
+conda create -n gdal_env python=3.10 gdal=3.4
+```
+:::
+
+In order to parallelize the reprojection step, we make use of the [GNU parallel shell tool](https://www.gnu.org/software/parallel/).
+
+```{bash}
+#| eval: false
+
+parallel --version
+```
+
+### The `cmethods` library
+
+This repository contains a python script used to run debiasing in climate data using a fork of the [original python-cmethods](https://github.com/btschwertfeger/python-cmethods) module written by Benjamin Thomas Schwertfeger, which has been modified to function with the dataset used in the clim-recal project. This library has been included as a submodule to this project, so you must run the following command to pull the submodules required.
+
+```{bash}
+git submodule update --init --recursive
+```
+
+### Downloading the data
+
+#### Climate data
+
+This streamlined pipeline is designed for raw data provided by the Met Office, accessible through the [CEDA archive](https://catalogue.ceda.ac.uk/uuid/ad2ac0ddd3f34210b0d6e19bfc335539). It utilizes [UKCP](https://data.ceda.ac.uk/badc/ukcp18/data/land-cpm/uk/2.2km) control and scenario data at 2.2km resolution, and [HADs](https://data.ceda.ac.uk/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km) observational data. For those unfamiliar with this data, refer to [the dataset](#the-dataset) section.
+
+To access the data, [register here](https://archive.ceda.ac.uk/) at the CEDA archive and configure your FTP credentials in "My Account". Utilize our [ceda_ftp_download.py](../python/data_download/ceda_ftp_download.py) script to download the data.
+
+```{bash}
+#| eval: false
+
+# cpm data
+python3 ceda_ftp_download.py --input /badc/ukcp18/data/land-cpm/uk/2.2km/rcp85/ --output 'output_dir' --username 'uuu' --psw 'ppp' --change_hierarchy
+
+# hads data
+python3 ceda_ftp_download.py --input /badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.1.0.0/1km --output 'output_dir' --username 'uuu' --psw 'ppp'
+```
+You need to replace `uuu` and `ppp` with your CEDA username and FTP password respectively, and replace `output_dir` with the directory you want to write the data to.
+
+The `--change_hierarchy` flag modifies the folder hierarchy to fit with the hierarchy in the Turing Azure file store. This flag only applies to the UKCP data and should not be used with HADs data. You can use the same script without the `--change_hierarchy` flag in order to download files without any changes to the hierarchy.
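+
+As an aside (this is not one of the pipeline scripts), the same download can be driven from Python through the `download_ftp` function that `ceda_ftp_download.py` exposes, assuming the script's directory is on your `PYTHONPATH`. The call below is only a sketch based on that function's signature; the role of the `order` argument is not shown here, so `0` is used purely as a placeholder, and the credentials mirror the placeholders above.
+
+```{python}
+#| eval: false
+
+from ceda_ftp_download import download_ftp
+
+# Placeholder credentials and output directory, as in the commands above
+download_ftp(
+    input="/badc/ukcp18/data/land-cpm/uk/2.2km/rcp85/",
+    output="output_dir",
+    username="uuu",
+    password="ppp",
+    order=0,
+)
+```
+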
+::: {.callout-tip}
+📢 If you are an internal collaborator you can access the raw data as well as intermediate steps through our Azure server. [See here for a How-to]().
+:::
+
+#### Geospatial data
+
+In addition to the climate data we use geospatial data to divide the data into smaller chunks. Specifically we use the following datasets for city boundaries:
+
+- Scottish localities boundaries for cropping out Glasgow. Downloaded from [nrscotland.gov.uk](https://www.nrscotland.gov.uk/statistics-and-data/geography/our-products/settlements-and-localities-dataset/settlements-and-localities-digital-boundaries) on 1st Aug 2023
+
+- Major Towns and Cities boundaries for cropping out Manchester. Downloaded from [https://geoportal.statistics.gov.uk/](https://geoportal.statistics.gov.uk/datasets/980da620a0264647bd679642f96b42c1/explore)
+
+
+## Reprojecting the data
+The HADs data and the UKCP projections have different resolutions and coordinate systems. For example the HADs dataset uses the British National Grid coordinate system.
+
+The first step in our analysis pipeline is to reproject the UKCP datasets to the British National Grid coordinate system. For this purpose, we utilize the Geospatial Data Abstraction Library (GDAL), designed for reading and writing raster and vector geospatial data formats.
+
+::: {.callout-warning}
+Note that, to reproduce our exact pipeline, we switch environments here as explained in the requirements.
+```{bash}
+#| eval: false
+
+conda activate gdal_env
+```
+:::
+
+To execute the reprojection in a parallel fashion, run the `reproject_all.sh` script from your shell. As an input to the script replace `path_to_netcdf_files` with the path to the raw netCDF files.
+
+```{bash}
+#| eval: false
+
+cd bash
+sh reproject_all.sh path_to_netcdf_files
+```
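+
+For a single file, the step that `reproject_all.sh` parallelises can be sketched in Python with `subprocess`, assuming `gdalwarp` is the underlying GDAL tool and is available on your `PATH` from the `gdal_env` above; the file names here are placeholders, not real pipeline outputs.
+
+```{python}
+#| eval: false
+
+import subprocess
+from pathlib import Path
+
+src = Path("path_to_netcdf_files/example_ukcp_file.nc")
+dst = Path("path_to_reprojected/example_ukcp_file_reprojected.tif")
+
+# Reproject one UKCP file to the British National Grid (EPSG:27700)
+subprocess.run(
+    ["gdalwarp", "-t_srs", "EPSG:27700", str(src), str(dst)],
+    check=True,
+)
+```
+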
+## Resampling the data
+
+Resample the HADs UK dataset from its 1km grid to a 2.2km grid to match the reprojected UKCP grid. We run the resampling python script specifying the `--input` location of the reprojected files from the previous step, the UKCP `--grid` file and the `--output` location for saving the resampled files.
+
+```{bash}
+#| eval: false
+
+# switch to main environment
+conda activate clim-recal
+
+# run resampling
+cd ../python/resampling
+python resampling_hads.py --input path_to_reprojected --grid path_to_grid_file --output path_to_resampled
+```
+## Preparing the bias correction and assessment
+
+### Spatial cropping
+Because the bias correction process is computationally intensive, handling large datasets can be challenging and time-consuming. Therefore, to make the pipeline more manageable and efficient, it is recommended to split the data into smaller subsets.
+
+For the purposes of our example pipeline, we've opted for reducing the data to individual city boundaries. To crop you need to adjust the paths in the `Cropping_Rasters_to_three_cities.R` script to fit your own directory structure. The cropping is implemented in the `cpm_read_crop` and `hads_read_crop` functions.
+
+```{bash}
+#| eval: false
+
+Rscript Cropping_Rasters_to_three_cities.R
+```
+### Calibration-validation data split
+For the purpose of assessing our bias correction, we then split our data, both the projection as well as the ground-truth observations, by dates. In this example we calibrate the bias correction based on the years 1981 to 1983.
+
+We then use data from year 2010 to validate the quality of the bias correction. You need to replace `path_to_cropped` with the path where the data from the previous cropping step was saved and `path_to_preprocessed` with the output directory you choose. You can leave the `-v` and `-r` flags as specified below or choose another variable and run if you prefer.
+
+```{bash}
+#| eval: false
+
+cd ../debiasing
+python preprocess_data.py --mod path_to_cropped --obs path_to_cropped -v tasmax -r '05' --out path_to_preprocessed --calib_dates 19810101-19831230 --valid_dates 20100101-20101230
+```
+
+The `preprocess_data.py` script also aligns the calendars of the historical simulation data and observed data, ensuring that they have the same time dimension, and checks that the observed and simulated historical data have the same dimensions.
+
+::: {.callout-note}
+`preprocess_data.py` makes use of our custom data loader functions. In [`data_loader.py`](../python/load_data/data_loader.py) we have written a few functions for loading and concatenating data into a single xarray which can be used for running debiasing methods. Instructions on how to use these functions can be found in `python/notebooks/load_data_python.ipynb`.
+:::
+
+
+## Applying the bias correction
+
+::: {.callout-note}
+By March 2023 we have only implemented the [python-cmethods](https://github.com/alan-turing-institute/python-cmethods) library.
+:::
+
+
+The [run_cmethods.py](../python/debiasing/run_cmethods.py) script allows us to adjust climate biases in climate data using the python-cmethods library. It takes as input observation data (HADs data), control data (historical UKCP data), and scenario data (future UKCP data), and applies a correction method to the scenario data. The resulting output is saved as a `.nc` to a specified directory.
+
+The script will also produce a time-series and a map plot of the debiased data. To run this you need to replace `path_to_validation_data` with the output directories of the previous step and specify `path_to_corrected_data` as your output directory for the bias corrected data.
You can also specify your preferred `bias_correction_method` (e.g. quantile_delta_mapping). + +```{bash} +#| eval: false + +python3 run_cmethods.py --input_data_folder path_to_validation_data --out path_to_corrected_data --method bias_correction_method --v 'tas' +``` + +The `run_cmethods.py` script loops over the time periods and applies debiasing in periods of 10 years in the following steps: + +- Loads the scenario data for the current time period. +- Applies the specified correction method to the scenario data. +- Saves the resulting output to the specified directory. +- Creates diagnotic figues of the output dataset (time series and time dependent maps) and saves it into the specified directory. + +For each 10 year time period it will produce an `.nc` output file with the adjusted data and a time-series plot and a time dependent map plot of the adjusted data. diff --git a/environment.yml b/environment.yml index 288cac6e..74508cdb 100644 --- a/environment.yml +++ b/environment.yml @@ -4,22 +4,10 @@ channels: - defaults - conda-forge dependencies: - - bzip2=1.0.8 - - c-ares=1.19.1 - - ca-certificates=2023.08.22 - - libedit=3.1.20221030 - - libev=4.33 - - libgfortran5=11.3.0 - - libssh2=1.10.0 - - libzlib=1.2.13 - - llvm-openmp=14.0.6 - - ncurses=6.4 - - pip=23.3 - - python=3.9 - - gdal=3.3.2 - - r-quarto=1.3 - - readline=8.2 - - setuptools=68.0.0 + - llvm-openmp=17.0.6 + - python=3.10 + - gdal=3.8.3 + # - r-quarto=1.3 - tk=8.6.12 - tzdata=2023c - wheel=0.41.2 diff --git a/python/data_download/ceda_ftp_download.py b/python/data_download/ceda_ftp_download.py index ab7969af..868c0585 100644 --- a/python/data_download/ceda_ftp_download.py +++ b/python/data_download/ceda_ftp_download.py @@ -8,12 +8,10 @@ def download_ftp(input: str, output: str, username: str, password: str, order: int) -> None: - """ - Function to connect to the CEDA archive and download data. + """Function to connect to the CEDA archive and download data. - Note - ---- - You need to have a user account and provide your username and `FTP` password. + You need to have a user account and provide your username and + `FTP` password. Parameters ---------- @@ -90,8 +88,10 @@ def download_ftp(input: str, output: str, username: str, password: str, order: i if __name__ == "__main__": """ - Script to download CEDA data from the command line. Note you need to have a user account and - provide your username and FTP password. + Script to download CEDA data from the command line. + + Note you need to have a user account and provide your username + and FTP password. """ # Initialize parser diff --git a/python/debiasing/debias_wrapper.py b/python/debiasing/debias_wrapper.py index ce57a11f..a30aef22 100644 --- a/python/debiasing/debias_wrapper.py +++ b/python/debiasing/debias_wrapper.py @@ -320,8 +320,6 @@ def to_cli_preprocess_tuple( ) -> tuple[Union[str, PathLike], ...]: """Generate a `tuple` of `str` for a command line command. - Note - ---- This will leave `Path` objects uncoverted. See `self.to_cli_preprocess_tuple_strs` for passing to a terminal. @@ -501,8 +499,6 @@ def to_cli_run_cmethods_tuple( ) -> tuple[Union[str, PathLike], ...]: """Generate a `tuple` of `str` for a command line command. - Note - ---- This will leave `Path` objects uncoverted. See `self.to_cli_run_cmethods_tuple_strs` for passing to a terminal. 
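A minimal illustration of the pattern the `to_cli_*_tuple` docstrings above describe: build the command as a tuple that may contain `Path` objects, stringify every element (the `*_tuple_strs` step), then hand it to a terminal. The command and paths below reuse the placeholder names from the pipeline guidance and are not the wrapper's real defaults.

```python
import subprocess
from os import PathLike
from pathlib import Path
from typing import Union

# Paths stay unconverted while the command is composed
command: tuple[Union[str, PathLike], ...] = (
    "python",
    Path("preprocess_data.py"),
    "--mod", Path("path_to_cropped"),
    "--obs", Path("path_to_cropped"),
    "-v", "tasmax",
    "--out", Path("path_to_preprocessed"),
)

# Equivalent of a *_tuple_strs step: everything becomes a plain str
command_strs: tuple[str, ...] = tuple(str(part) for part in command)

subprocess.run(command_strs, check=True)
```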
diff --git a/python/load_data/data_loader.py b/python/load_data/data_loader.py index bf57fb60..6a40c212 100644 --- a/python/load_data/data_loader.py +++ b/python/load_data/data_loader.py @@ -5,52 +5,60 @@ import geopandas as gp import xarray as xr +DateRange = tuple[datetime, datetime] + def load_data( - input_path, - date_range, - variable, - filter_filenames_on_variable=False, - run_number=None, - filter_filenames_on_run_number=False, - use_pr=False, - shapefile_path=None, - extension="nc", -): + input_path: str, + date_range: DateRange, + variable: str, + filter_filenames_on_variable: bool = False, + run_number: str | None = None, + filter_filenames_on_run_number: bool = False, + use_pr: bool = False, + shapefile_path: str | None = None, + extension: str = "nc", +) -> xr.DataArray: """ - This function takes a date range and a variable and loads and merges xarrays based on those parameters. + This function takes a date range and a variable and loads and merges + xarrays based on those parameters. + If shapefile is provided it crops the data to that region. Parameters ---------- - input_path: str + input_path Path to where .nc or .tif files are found - date_range : tuple + date_range A tuple of datetime objects representing the start and end date - variable : string + variable A string representing the variable to be loaded - filter_filenames_on_variable : bool, default = False - When True, files in the input_path will be filtered based on whether their file name - contains "variable" as a substring. When False, filtering does not happen. - run_number : sting, default None - A string representing the CPM run number to use (out of 13 CPM runs available in the database). Only files - whose file name contains the substring run_number will be used. If None, all files in input_path are parsed, - regardless of run number in filename. - filter_filenames_on_run_number : bool, default = False - When True, files in the input_path will be filtered based on whether their file name - contains "2.2km_" followed by "run_number". When False, filtering does not happen. - This should only be used for CPM files. For HADs files this should always be set to False. - use_pr : bool, default = False + filter_filenames_on_variable + When True, files in the input_path will be filtered based on + whether their file name contains "variable" as a substring. When + False, filtering does not happen. + run_number + A string representing the CPM run number to use + (out of 13 CPM runs available in the database). Only files + whose file name contains the substring run_number will be used. + If `None`, all files in input_path are parsed, regardless of run + number in filename. + filter_filenames_on_run_number + When True, files in the input_path will be filtered based on + whether their file name contains "2.2km_" followed by "run_number". + When False, filtering does not happen. This should only be used for + CPM files. For HADs files this should always be set to False. + use_pr If True, replace variable with "pr" string when filtering the file names. - shapefile_path: str + shapefile_path Path to a shape file used to clip resulting dataset. - extension: str + extension Extension of the files to be loaded, it can be .nc or .tif files. 
Returns ------- - merged_xarray : xarray - An xarray containing all loaded and merged and clipped data + xr.DataArray + A DataArray containing all loaded and merged and clipped data """ if extension not in ("nc", "tif"): @@ -101,20 +109,22 @@ def load_data( return xa -def clip_dataset(xa, variable, shapefile): - """ +def clip_dataset(xa: xr.DataArray, variable: str, shapefile: str) -> xr.DataArray: + """Spatially clip `xa` `DataArray` variable via `shapefile. + Parameters ---------- - xa: xArray Dataset + xa xArray containing a giving variable - variable : string - A strings representing the variable to be loaded - shapefile: str - Path to a shape file used to clip resulting dataset, must be in the same CRS of the input xArray. + variable + A string representing the variable to be loaded + shapefile + Path to a shape file used to clip resulting dataset, + must be in the same CRS of the input xArray. - Returns + Returns ------- - xa : xarray + xr.DataArray A clipped xarray dataset """ @@ -148,9 +158,21 @@ def clip_dataset(xa, variable, shapefile): return xa -def reformat_file(file, variable): - """ - Load tif file and reformat xarray into expected format. +def reformat_file(file: str, variable: str) -> xr.DataArray: + """Load tif file and reformat xarray into expected format. + + Parameters + ---------- + + file + Path as a `str` to `tiff` file. + variable + A string representing the variable to be loaded + + Returns + ------- + xr.DataArray + A formatted xarray """ print(f"File: {file} needs rasterio library, trying...") filename = os.path.basename(file).split("_") @@ -191,23 +213,27 @@ def reformat_file(file, variable): return xa -def load_and_merge(date_range, files, variable): +def load_and_merge( + date_range: DateRange, + files: list[str], + variable: str +) -> xr.DataArray: """ Load files into xarrays, select a time range and a variable and merge into a sigle xarray. Parameters ---------- - date_range : tuple + date_range A tuple of datetime objects representing the start and end date - files: list (str) + files List of strings with path to files to be loaded. - variable : string + variable A string representing the variable to be loaded Returns ------- - merged_xarray : xarray + xr.DataArray An xarray containing all loaded and merged data """ diff --git a/python/utils.py b/python/utils.py index 29f850e1..1f19c69e 100644 --- a/python/utils.py +++ b/python/utils.py @@ -230,9 +230,10 @@ def csv_reader(path: Path, **kwargs) -> Generator[dict[str, str], None, None]: **kwargs Additional parameters for `csv.DictReader`. - Yields - ------ - A `dict` per row from `path`. + Returns + ------- + : + A `Generator` of `dicts` per row from `path`. Examples -------- From 15cab815e9e39e984bbda1cdb6fc50371edcb8c5 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 2 Feb 2024 02:52:05 +0000 Subject: [PATCH 80/83] fix(ci): rewrite compose/jupyter/Dockerfile to fix env activation --- compose.yml | 1 - compose/jupyter/Dockerfile | 93 ++++++++------------------------------ environment.yml | 4 +- 3 files changed, 19 insertions(+), 79 deletions(-) diff --git a/compose.yml b/compose.yml index b9dfbe36..60a16425 100644 --- a/compose.yml +++ b/compose.yml @@ -6,7 +6,6 @@ services: build: context: . 
dockerfile: ./compose/jupyter/Dockerfile - target: clim-recal-base ports: - "8888:8888" volumes: diff --git a/compose/jupyter/Dockerfile b/compose/jupyter/Dockerfile index 87cb1058..d56810b1 100644 --- a/compose/jupyter/Dockerfile +++ b/compose/jupyter/Dockerfile @@ -1,96 +1,39 @@ -FROM quay.io/jupyter/r-notebook as clim-recal-base +FROM quay.io/jupyter/r-notebook -# This is derived from documentation available at -# https://jupyter-docker-stacks.readthedocs.io/en/latest/ - -# The local_data_path is an absolute local path to ClimateData on the machine hosting running `docker` -ARG HOST_DATA_PATH=/Volumes/vmfileshare - -# The local_data_path is an absolute path to mount ClimateData within `docker` -ARG DOCKER_DATA_PATH=/mnt/vmfileshare - -# This will require a mount of `vmfileshare` from `dymestorage1` -# On macOS this can be solved via: -# open smb://dymestorage1.file.core.windows.net/vmfileshare -# Using user: dymestorage1 -# And password specified via: -# https://learn.microsoft.com/en-us/azure/storage/common/storage-account-keys-manage?tabs=azure-portal#view-account-access-keys - -# Example run: -# cd clim-recal -# docker build --tag 'clim-recal' . -# docker run -it -p 8888:8888 -v /Volumes/vmfileshare:/home/jovyan/work/vmfileshare clim-recal - -ENV LC_ALL en_GB.UTF-8 -ENV LANG en_GB.UTF-8 -ENV LANGUAGE en_GB.UTF-8 -ENV SHELL /bin/bash +ARG env_name=clim-recal +ENV env_name=$env_name +ARG py_ver=3.10 # The local_data_path is an absolute local path to ClimateData on the machine hosting running `docker` ARG HOST_DATA_PATH=/Volumes/vmfileshare # The local_data_path is an absolute path to mount ClimateData within `docker` ARG DOCKER_DATA_PATH=/mnt/vmfileshare -ARG env_name=clim-recal - -# `py_ver` is not currently used below and is specified in `environment.yaml` -# included here as reminder and clarity if future change needed. -ARG py_ver=3.10 USER root +RUN apt-get update && apt-get -y install gdal-bin python3-gdal libgdal-dev build-essential git && \ + rm -rf /var/lib/apt/lists/* -# Generate the locales -RUN echo "en_GB.UTF-8 UTF-8" > /etc/locale.gen && locale-gen +RUN git config --global --add safe.directory /tmp - -RUN apt-get update && apt-get -y install gdal-bin python3-gdal libgdal-dev build-essential -RUN conda update -n base -c conda-forge conda - -# Ensure correct GDAL paths -RUN export CPLUS_INCLUDE_PATH=/usr/include/gdal && export C_INCLUDE_PATH=/usr/include/gdal - -# Create custom environment from environment.yml -# Add ipykernel for environment build as necessary COPY --chown=${NB_UID}:${NB_GID} environment.yml /tmp/ RUN mamba env create -p "${CONDA_DIR}/envs/${env_name}" -f /tmp/environment.yml && \ mamba clean --all -f -y -# Any additional `pip` installs can be added by using the following line -# Using `mamba` is highly recommended though -RUN "${CONDA_DIR}/envs/${env_name}/bin/pip" install --no-cache-dir \ - 'ipykernel' - -# Create kernel from custome `environment.yml` +COPY . . +# Create Python kernel and link it to jupyter RUN "${CONDA_DIR}/envs/${env_name}/bin/python" -m ipykernel install --user --name="${env_name}" && \ fix-permissions "${CONDA_DIR}" && \ fix-permissions "/home/${NB_USER}" -COPY . . 
- -RUN git config --global --add safe.directory /home/jovyan -RUN cd python/debiasing && git submodule update --init --recursive +RUN apt-get update && apt-get -y install gdal-bin python3-gdal libgdal-dev build-essential git && \ + rm -rf /var/lib/apt/lists/* +RUN \ + # This changes a startup hook, which will activate the custom environment + echo conda activate "${env_name}" >> /usr/local/bin/before-notebook.d/10activate-conda-env.sh && \ + # This makes the custom environment default in Jupyter Terminals for all users which might be created later + echo conda activate "${env_name}" >> /etc/skel/.bashrc && \ + # This makes the custom environment default in Jupyter Terminals for already existing NB_USER + echo conda activate "${env_name}" >> "/home/${NB_USER}/.bashrc" -RUN activate_custom_env_script=/usr/local/bin/before-notebook.d/activate_custom_env.sh && \ - echo "#!/bin/bash" > ${activate_custom_env_script} && \ - echo "eval \"$(conda shell.bash activate "${env_name}")\"" >> ${activate_custom_env_script} && \ - chmod +x ${activate_custom_env_script} - - -RUN chown -R ${NB_UID}:${NB_GID} . - -# Switch to default jupyter user USER ${NB_UID} - -# This eases running shell commands outside docker following: -# https://pythonspeed.com/articles/activate-conda-dockerfile/ - -# Set this for default `conda activate` configuration -# You can comment this line to keep the default environment in Terminal -RUN echo "conda activate ${env_name}" >> "${HOME}/.bashrc" - -# Try adding access to drive files if mounted -# RUN --mount=type=bind,source=$HOST_DATA_PATH,target=$DOCKER_DATA_PATH - -# RUN --mount=type=bind,source=$HOST_DATA_PATH,target=$DOCKER_DATA_PATH \ -# find /build/jars -type f -name '*.jar' -maxdepth 1 -print0 \ -# | xargs -0 --no-run-if-empty --replace=source cp --force source >"${INSTALL_PATH}/modules/" diff --git a/environment.yml b/environment.yml index 74508cdb..aa214f87 100644 --- a/environment.yml +++ b/environment.yml @@ -14,16 +14,14 @@ dependencies: - xz=5.4.2 - pip: - affine==2.3.1 - - attrs==22.1.0 - backports.strenum==1.2.8 - certifi==2023.07.22 - coverage-badge==1.1.0 - cftime==1.6.2 - - click==8.1.3 - - click-plugins==1.1.1 - cligj==0.7.2 - geopandas==0.12.2 - ipython==8.15.0 + - jupyterlab==4.0.12 - matplotlib==3.6.1 - netcdf4==1.6.1 - numpy==1.23.4 From 896f843b2db762578a61a8fe028a3b456bafe3bf Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Fri, 2 Feb 2024 03:52:51 +0000 Subject: [PATCH 81/83] fix(doc): improve python/resampling/resampling_hads.py documentation --- python/resampling/resampling_hads.py | 63 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/python/resampling/resampling_hads.py b/python/resampling/resampling_hads.py index 48b91183..42e8ae2e 100644 --- a/python/resampling/resampling_hads.py +++ b/python/resampling/resampling_hads.py @@ -10,9 +10,7 @@ import os from os import cpu_count -import netCDF4 import pandas as pd -import scipy import xarray as xr # requires rioxarray extension from tqdm import tqdm @@ -20,18 +18,23 @@ def enforce_date_dropping( raw_data: xr.Dataset, converted_data: xr.Dataset ) -> xr.Dataset: - """ - Workaround to avoid convert_calendar misbehavior with monthly data files. - - For leap years, the conversion assigns dropped data to the previous date instead of deleting it. - Here we manually delete those dates to avoid duplicates later in the pipeline. - - Args: - raw_data (xr.Dataset): The original data. - converted_data (xr.Dataset): The data after conversion. 
- - Returns: - xr.Dataset: The converted data with specific dates dropped. + """Workaround convert_calendar misbehavior with monthly data files. + + For leap years, the conversion assigns dropped data to the previous + date instead of deleting it. Here we manually delete those dates to + avoid duplicates later in the pipeline. + + Parameters + ---------- + raw_data + The original data. + converted_data + The data after conversion. + + Returns + ------- + xr.Dataset + The converted data with specific dates dropped. """ month_day_drop = {(1, 31), (4, 1), (6, 1), (8, 1), (10, 1), (12, 1)} time_values = pd.DatetimeIndex(raw_data.coords["time"].values) @@ -56,16 +59,28 @@ def enforce_date_dropping( return converted_data -def resample_hadukgrid(x): - """ - Resamples the UKHADs data to match UKCP18 data both spatially and temporally - and saves the resampled data to the output directory. - inputs: - x: list of inputs - x[0]: file to be resampled - x[1]: x_grid - x[2]: y_grid - x[3]: output_dir +def resample_hadukgrid(x: list) -> int: + """Resample UKHADs data to match UKCP18 spatially and temporally. + + Results are saved to the output directory. + + Parameters + ---------- + x + x[0]: file to be resampled + x[1]: x_grid + x[2]: y_grid + x[3]: output_dir + + Returns + ------- + bool + Whether function was a success (True) or not (False). + + Raises + ------ + Exception + Generic execption for any errors raised. """ try: # due to the multiprocessing implementations inputs come as list From c517aac7f0e3dd8bbba44358aa330bd136bf9a3c Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 5 Feb 2024 13:43:15 +0000 Subject: [PATCH 82/83] fix(doc): fix typos in data_loader.py and utils.py --- python/load_data/data_loader.py | 2 +- python/utils.py | 62 +++++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/python/load_data/data_loader.py b/python/load_data/data_loader.py index 6a40c212..0810dd7d 100644 --- a/python/load_data/data_loader.py +++ b/python/load_data/data_loader.py @@ -115,7 +115,7 @@ def clip_dataset(xa: xr.DataArray, variable: str, shapefile: str) -> xr.DataArra Parameters ---------- xa - xArray containing a giving variable + xArray containing a given variable (e.g. rainfall) variable A string representing the variable to be loaded shapefile diff --git a/python/utils.py b/python/utils.py index 1f19c69e..764da152 100644 --- a/python/utils.py +++ b/python/utils.py @@ -30,6 +30,10 @@ def date_to_str( out_format_str A `strftime` format `str` to convert `date_obj` to. + Return + ------ + A `str` version of `date_obj` in `out_format_str` format. + Examples -------- >>> date_to_str('20100101') @@ -65,6 +69,11 @@ def date_range_to_str( out_format_str A `strftime` format `str` to convert `end_date` from. + Return + ------ + A `str` of date range from `start_date` to `end_date` in the + `out_format_str` format. + Examples -------- >>> date_range_to_str('20100101', '20100330') @@ -90,6 +99,10 @@ def iter_to_tuple_strs(iter_var: Iterable[Any]) -> tuple[str, ...]: iter_var Iterable of objects that can be converted into `strs`. + Return + ------ + A `tuple` of all `iter_var` elements in `str` format. + Examples -------- >>> iter_to_tuple_strs(['cat', 1, Path('a/path')]) @@ -107,19 +120,23 @@ def path_iterdir( Parameters ---------- path - `Path` of folder to iterate through + `Path` to folder to iterate through. strict Whether to raise `FileNotFoundError` if `path` not found. - Returns - ------- - A `Generator` of `Paths` within folder `path`. 
+ Yield + ----- + A `Path` for each folder in `path`. Raises ------ FileNotFoundError Raised if `strict = True` and `path` does not exist. + Return + ------ + `None` if `FileNotFoundError` error and `strict` is `False`. + Examples -------- @@ -163,11 +180,17 @@ def make_user( Parameters ---------- user - user and home folder name + Name for user and home folder name to append to `user_home_path`. password - login password + Login password. code_path - path to copy code from to user path + `Path` to copy code from to `user` home directory. + user_home_path + Path that `user` folder will be in, often `Path('/home')` in `linux`. + + Return + ------ + Full path to generated `user` home folder. Examples -------- @@ -203,6 +226,10 @@ def rm_user(user: str, user_home_path: Path = DEBIAN_HOME_PATH) -> str: user_home_path Parent path of `user` folder name. + Return + ------ + `user` name of account and home folder deleted. + Examples -------- >>> import os @@ -221,7 +248,7 @@ def rm_user(user: str, user_home_path: Path = DEBIAN_HOME_PATH) -> str: def csv_reader(path: Path, **kwargs) -> Generator[dict[str, str], None, None]: - """Yield a `dict` per for from file `path`. + """Yield a `dict` per row from a `CSV` file at `path`. Parameters ---------- @@ -230,10 +257,9 @@ def csv_reader(path: Path, **kwargs) -> Generator[dict[str, str], None, None]: **kwargs Additional parameters for `csv.DictReader`. - Returns - ------- - : - A `Generator` of `dicts` per row from `path`. + Yield + ----- + A `dict` per row from `CSV` file at `path`. Examples -------- @@ -249,8 +275,10 @@ def csv_reader(path: Path, **kwargs) -> Generator[dict[str, str], None, None]: ... line_num: int = writer.writerow(('user_name', 'password')) ... for user_name, password in auth_dict.items(): ... line_num = writer.writerow((user_name, password)) - >>> tuple(csv_reader(csv_path)) - ({'user_name': 'sally', 'password': 'fig*new£kid'}, {'user_name': 'george', 'password': 'tee&iguana*sky'}, {'user_name': 'susan', 'password': 'history!bill-walk'}) + >>> tuple(csv_reader(csv_path)) # doctest: +NORMALIZE_WHITESPACE + ({'user_name': 'sally', 'password': 'fig*new£kid'}, + {'user_name': 'george', 'password': 'tee&iguana*sky'}, + {'user_name': 'susan', 'password': 'history!bill-walk'}) """ with open(path) as csv_file: for row in DictReader(csv_file, **kwargs): @@ -260,7 +288,7 @@ def csv_reader(path: Path, **kwargs) -> Generator[dict[str, str], None, None]: def make_users( file_path: Path, user_col: str, password_col: str, file_reader: Callable, **kwargs ) -> Generator[Path, None, None]: - """Load a file of usernames and passwords to pass to `make_user`. + """Load a file of usernames and passwords and pass each line to `make_user`. Parameters ---------- @@ -275,6 +303,10 @@ def make_users( **kwargs Additional parameters for to pass to `file_reader` function. + Yield + ----- + The home `Path` for each generated user. 
+ Examples -------- >>> import os From c47f4f91bc2d4524dad357e6da864fea186289e5 Mon Sep 17 00:00:00 2001 From: Dr Griffith Rees Date: Mon, 5 Feb 2024 14:28:46 +0000 Subject: [PATCH 83/83] fix(doc): improve resampling_hads and _quarto.yml config --- _quarto.yml | 33 +++++++++++++++++----------- python/resampling/resampling_hads.py | 15 +++++++------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/_quarto.yml b/_quarto.yml index 4a76c98e..13d6c23d 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -62,7 +62,13 @@ website: - section: "Reference" contents: - href: "docs/reference/data_download.ceda_ftp_download.download_ftp.qmd" - text: "Data Download" + text: "Download CEDA FTP" + # There are errors rendering the pages below + # see: https://github.com/alan-turing-institute/clim-recal/issues/128 + # - href: "docs/reference/resampling.qmd" + # text: "HADs Resampling" + # - href: "docs/reference/debiasing.qmd" + # text: "Debiasing" - href: "docs/reference/utils.qmd" text: "Utilities" - text: "Docker" @@ -77,15 +83,16 @@ quartodoc: # write sidebar data to this file sidebar: _sidebar.yml source_dir: ./python/ - - sections: - - title: Data Source Management - desc: How data is downloaded for use - contents: - # the functions being documented in the package. - # you can refer to anything: class methods, modules, etc.. - - data_download - - load_data - - resampling - - debiasing - - utils + # + # see: https://github.com/alan-turing-institute/clim-recal/issues/128 + # sections: + # - title: Data Source Management + # desc: How data is downloaded for use + # contents: + # # the functions being documented in the package. + # # you can refer to anything: class methods, modules, etc.. + # - data_download + # # - load_data + # # - resampling + # # - debiasing + # - utils diff --git a/python/resampling/resampling_hads.py b/python/resampling/resampling_hads.py index 42e8ae2e..97780f7c 100644 --- a/python/resampling/resampling_hads.py +++ b/python/resampling/resampling_hads.py @@ -1,5 +1,5 @@ -""" -This script resamples the UKHADS data to match UKCP18 data. +"""This script resamples the UKHADS data to match UKCP18 data. + It resamples spatially, from 1km to 2.2km It resamples temporally to a 360 day calendar. """ @@ -22,7 +22,10 @@ def enforce_date_dropping( For leap years, the conversion assigns dropped data to the previous date instead of deleting it. Here we manually delete those dates to - avoid duplicates later in the pipeline. + avoid duplicates later in the pipeline. See + https://docs.xarray.dev/en/stable/generated/xarray.Dataset.convert_calendar.html#xarray.Dataset.convert_calendar + for more information, and for updates on issues see + https://github.com/pydata/xarray/issues/8086 Parameters ---------- @@ -33,8 +36,7 @@ def enforce_date_dropping( Returns ------- - xr.Dataset - The converted data with specific dates dropped. + The converted data with specific dates dropped. """ month_day_drop = {(1, 31), (4, 1), (6, 1), (8, 1), (10, 1), (12, 1)} time_values = pd.DatetimeIndex(raw_data.coords["time"].values) @@ -74,8 +76,7 @@ def resample_hadukgrid(x: list) -> int: Returns ------- - bool - Whether function was a success (True) or not (False). + Whether function was a success `0` or not `1`. Raises ------