-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_DataCleaning.R
577 lines (429 loc) · 18.5 KB
/
1_DataCleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
### data cleaning
# load packages
library(tidyverse)
library(data.table)
library(lubridate)
## Import three data tables
water_qual_event <- read_csv("Data/water_qual_event.csv",
col_types = cols(Date = col_date(format = "%m/%d/%Y")))
water_qual_event
fish_dat <- read_csv("Data/fish_dat.csv",
col_types = cols(Year = col_double()))
#View(fish_dat)
#some entries in Station ID need to be converted to all caps
fish_dat$Station_ID = toupper(fish_dat$Station_ID)
# import notes problems on rows 12485 and 13444, but I don't see anything wrong.
#note that mort and Injury are all NA
water_qual <- read_csv("Data/water_qual.csv",
col_types = cols(Date = col_date(format = "%m/%d/%Y"),
ID = col_character(), Start_time = col_character()))
#View(water_qual)
hist(fish_dat$Length) #outlier length on a stickleback (370mm!) Check with Darren
## To Merge all the files with the fish data, we need to use Year-Season-Station_ID for all three files
# Water_qual_event
water_qual_event$Year <- year(water_qual_event$Date)
water_qual_event$Month <- month(water_qual_event$Date)
hist(water_qual_event$Month)
water_qual_event$Season <- ifelse(water_qual_event$Month < 6, "Winter", "Fall") # Darren OK'd
water_qual_event$Unique_ID2 <- paste0(water_qual_event$Year,"_",
water_qual_event$Season, "_",
water_qual_event$Station_ID)
duplicated(water_qual_event$Unique_ID2) #yay, no duplicates #406 rows
#View(water_qual_event)
#Water_qual
water_qual$Year <- year(water_qual$Date)
water_qual$Month <- month(water_qual$Date)
water_qual$Season <- ifelse(water_qual$Month < 6, "Winter", "Fall")
water_qual$Unique_ID2 <- paste0(water_qual$Year,"_",
water_qual$Season, "_",
water_qual$Station_ID)
duplicated(water_qual$Unique_ID2) #duplicates expected will fix with pivot wider
#fish_dat - sesaon and year done
fish_dat$Unique_ID2 <- paste0(fish_dat$Year,"_",
fish_dat$Season, "_",
fish_dat$Station_ID)
duplicated(fish_dat$Unique_ID2) #mostly duplicate will fix with summarize
#lets make water_qual wide so each even has unique row.
#currently surface and bottom samples from same even in consecutive rows
## add a unique "Group_ID" to each duplicated row to enable pivot wide
## rowid is from data.table package
water_qual_temp <- water_qual %>%
mutate(Group_ID = data.table::rowid(Unique_ID2)) #changed to Unique_ID2
#View(water_qual_temp)
hist(water_qual_temp$Group_ID)
# looks like some data are more than just the 2 values...perhaps these can be deleted? look at raw data to see why 3-6 samples at some events.
# only 14 instances of > 2 environmental samples...I think we can delete these replicates and just keep the core 2 samples for each parameter.
water_qual_temp <- water_qual_temp %>%
filter(Group_ID < 3)
hist(water_qual_temp$Group_ID)
# pivot wide
water_qual_wide <- water_qual_temp %>%
pivot_wider(id_cols = c("Unique_ID2", "Date"), # added Date to keep in file
names_from = "Group_ID",
values_from = c("Water_temp", "Spec_cond", "Der_spec_cond", "DO", "Perc_sat", "pH")) #airtemp only 1 measurement
#View(water_qual_wide) #has 397 rows, water_qual_event has 406 rows. so some events missing WQ data?
# change some character to numeric
water_qual_wide$Water_temp_1 <- as.numeric(water_qual_wide$Water_temp_1)
water_qual_wide$Water_temp_2 <- as.numeric(water_qual_wide$Water_temp_2)
water_qual_wide$Spec_cond_1 <- as.numeric(water_qual_wide$Spec_cond_1)
water_qual_wide$Spec_cond_2 <- as.numeric(water_qual_wide$Spec_cond_2)
water_qual_wide$DO_1 <- as.numeric(water_qual_wide$DO_1)
water_qual_wide$DO_2 <- as.numeric(water_qual_wide$DO_2)
water_qual_wide$Perc_sat_1 <- as.numeric(water_qual_wide$Perc_sat_1)
water_qual_wide$Perc_sat_2 <- as.numeric(water_qual_wide$Perc_sat_2)
#join WQ_event and WQ
WQ_event_WQ <- left_join(water_qual_event, water_qual_wide, by = "Unique_ID2")
#View(WQ_event_WQ)
# fix some Species naming errors
unique(fish_dat$Species)
fish_dat$Species <-
ifelse(fish_dat$Species == "PSC", "SC",
ifelse(fish_dat$Species == "SSC", "SC",
ifelse(fish_dat$Species == "SCU", "SC", fish_dat$Species)))
unique(fish_dat$Species) #looks good
# get sum fish per day
fish_dat_sum <-
fish_dat %>%
group_by(Unique_ID2, Species) %>%
summarise_at(vars(Numbers),
list(spec_sum = sum))
(fish_dat_sum)
#create length columns for each species
# 2023-06-21 Seems to work
## but BB and TTB check on need for weighted means and how to get min and max.
fish_dat_length <-
fish_dat %>%
filter(!is.na(Length)) %>% # remove the NA lengths
pivot_wider(id_cols = "Unique_ID2",
names_from = "Species",
values_from = "Length",
names_prefix = "Length_",
values_fn = mean) # added to get mean for the multiple groups of same
(fish_dat_length)
hist(fish_dat_length$Length_TW)
hist(fish_dat_length$Length_SB)
hist(fish_dat_length$Length_SC)
#### 2023-08-07
#### fixing up the fish length to get species by year.
#### for fish lengths need to use fish_dat
hist(fish_dat$Year)
length.test1 <-
fish_dat %>%
filter(!is.na(Length)) %>%
filter(Season == "Fall") %>%
group_by(Species, Year) %>%
summarise_at(vars(Length),
list(min_length = min,
max_length = max,
mean_length = mean,
sample_size = length))
#mean length over time
ggplot(length.test1, aes(Year, mean_length, color = Species)) +
geom_point() +
geom_smooth()
#boxplot over time
ggplot(fish_dat, aes(x = as.factor(Year), y = Length, color = Species)) +
geom_boxplot()
fish_dat_TW <- fish_dat %>%
filter(Species == "TW")
ggplot(fish_dat_TW, aes(x = as.factor(Year), y = Length)) +
geom_violin() +
geom_point()
ggplot(fish_dat_TW, aes(Length)) +
geom_histogram() +
geom_vline(xintercept = 12, lty = 2) +
facet_wrap(~Year)
ggplot(fish_dat_TW, aes(Length)) +
geom_histogram() +
geom_vline(xintercept = 12, lty = 2)
## 2023-06-21 LETS TALK THROUGH BELOW SO BB UNDERSTANDS BETTER.
#expand to individual rows instead of csv
fish_dat_length2 <- fish_dat_length %>%
mutate(Length_TW = strsplit(as.character(Length_TW), ",")) %>%
unnest(Length_TW)#caused single SB value to fill in for expanded TW rows...need to fix!!!
fish_dat_length3 <- fish_dat_length2 %>%
mutate(Length_SB = strsplit(as.character(Length_SB), ",", fixed = TRUE)) %>%
unnest(Length_SB)
fish_dat_length4 <- fish_dat_length3 %>%
mutate(Length_SC = strsplit(as.character(Length_SC), ",", fixed = TRUE)) %>%
unnest(Length_SC)#line 127 error happened to all values that were missing... filled in missing values with the previous value
#View(fish_dat_length4)
#need to remove extra characters "c", "()"
fish_dat_length4$Length_TW<-gsub("c","",as.character(fish_dat_length4$Length_TW))
fish_dat_length4$Length_TW<-gsub("[()]","",as.character(fish_dat_length4$Length_TW))
fish_dat_length4$Length_SB<-gsub("c","",as.character(fish_dat_length4$Length_SB))
fish_dat_length4$Length_SB<-gsub("[()]","",as.character(fish_dat_length4$Length_SB))
fish_dat_length4$Length_SC<-gsub("c","",as.character(fish_dat_length4$Length_SC))
fish_dat_length4$Length_SC<-gsub("[()]","",as.character(fish_dat_length4$Length_SC))
#left with white space - need to remove
fish_dat_length4$Length_TW <- trimws(fish_dat_length4$Length_TW)
fish_dat_length4$Length_SB <- trimws(fish_dat_length4$Length_SB)
fish_dat_length4$Length_SC <- trimws(fish_dat_length4$Length_SC)
## 2023-06-21 ALL GOOD FROM HERE DOWN EXCEPT NEED BREACH DATA.
###---------
# sum fish into columns by species
fish_dat_sum <-
fish_dat_sum %>%
pivot_wider(id_cols = "Unique_ID2",
names_from = "Species",
values_from = "spec_sum",
names_prefix = "Sum_")
fish_dat_sum <- fish_dat_sum %>% select(-Sum_NONE) #remove "none" column
fish_dat_sum[is.na(fish_dat_sum)] <- 0
(fish_dat_sum)
#calculate min, max, mean for species length per unique ID (consider adding 95% CI) !!
## FIX so that column is pulling TWG length only and not all species lengths
#this section has not run yet - need to fix error on line 127
TW_stats <- fish_dat_length %>%
group_by(Unique_ID2) %>%
summarise_at(vars(Length_TW),
list(TW_min_length = min,
TW_max_length = max,
TW_mean_length = mean))
SB_stats <- fish_dat_length4 %>%
group_by(Unique_ID2) %>%
summarise_at(vars(Length_SB),
list(SB_min_length = min,
SB_max_length = max,
SB_mean_length = mean))
SC_stats <- fish_dat_length4 %>%
group_by(Unique_ID2) %>%
summarise_at(vars(Length_SC),
list(SC_min_length = min,
SC_max_length = max,
SC_mean_length = mean))
#join all min/max/mean columns into one table, then join to goby_master
#need to do
#calculate sum of mortality per unique ID
# Note that mortality and injury data are all NA when imported. TTB please check original data.
fish_mort <- fish_dat %>%
group_by(Unique_ID2, Species) %>%
summarise_at(vars(Mort),
list(mort_sum = sum))
fish_mortality <-
fish_mort %>%
pivot_wider(id_cols = "Unique_ID2",
names_from = "Species",
values_from = "mort_sum",
names_prefix = "Mort_")
fish_mortality <- fish_mortality %>% select(-Mort_NONE)
#calculate sum of injury per unique ID
fish_injury <- fish_dat %>%
group_by(Unique_ID2, Species) %>%
summarise_at(vars(Injury),
list(inj_sum = sum))
summary(fish_dat)
#fish_injury <- fish_injury %>% select(-NONE)
#calculate sum of microsporidian per unique ID
###change N/Y to 0/1 first
fish_dat_temp <- fish_dat %>%
mutate(Microsporidian = ifelse(Microsporidian == "N",0,1))
fish_dat <- fish_dat_temp %>% mutate(Microsporidian = ifelse(is.na(Microsporidian), 0, Microsporidian))
microsporidium <- fish_dat %>% group_by(Unique_ID2) %>%
summarise_at(vars(Microsporidian),
list(micro_sum = sum))
microsporidium
### let's join all our five datasets
#water_qual_event
#water_qual_wide,
#fish_dat_sum
#fish_stats
#microsporidium
#fish_mortality <- all NAs, so check original data
#fish_injury <- all NAs, so check original data
#change unique ID so all letters are capitalized to match with wq wide
goby_master <- list(water_qual_event, water_qual_wide, fish_dat_sum, microsporidium) %>% #will add fish_mort ## removed fish_stats,
reduce(left_join, by = "Unique_ID2")
str(goby_master)
goby_master
#Volume is just mean depth by area and volumer has one na. so replace all and make numeric
goby_master$volume <- goby_master$Ave_depth * goby_master$Area
goby_master$Volumer <- as.numeric(goby_master$Volumer)
plot(goby_master$volume, as.numeric(goby_master$Volumer))
# question for Darren, why does volume and Volumer usually but not always match?
goby_master$Volume_Match <- goby_master$Volumer == goby_master$volume
#date fields match?
goby_master$Match <- goby_master$Date.x == goby_master$Date.y
# 5 dates don't match and are 1-2 days different, must have been data collected on two days due to logistics.
goby_master <- goby_master %>%
rename("Zone" = "Zone 2")
goby_master$Zone <- as.factor(goby_master$Zone)
#1 case where volume = 0 was in shallow mud. assign a really small area
hist(goby_master$volume)
goby_master$volume <- ifelse(goby_master$volume == 0, 0.2, goby_master$volume)
#look at zones to combine. refer to previous report.
plot(goby_master$Zone)
# remove the Winter data?
table(goby_master$Season)
str(goby_master)
goby_master$Water_temp_1 <- as.numeric(goby_master$Water_temp_1)
hist(goby_master$Water_temp_1)
## TO DO
# pool zones?
# include precip data
# breach data?
### NEW 5/26 #####
# drop winter data
# goby_master[goby_master$Season != "Winter"]
goby_master <- goby_master %>% filter(Season != "Winter")
#View(goby_master)
# drop zones
#goby_master[goby_master$Zone != "NE" & goby_master$Zone != "SE" & goby_master$Zone != "SW"]
#KeepZone<- c("E", "W", "NW")
goby_master <- goby_master %>% filter(Zone != "NE") %>%
filter(Zone != "SE") %>%
filter(Zone != "SW")
#View(goby_master)
#goby_master$avg_WT <- mean(goby_master$Water_temp_1, goby_master$Water_temp_1)
# Duplicates the temp_mean below...can delete
# library(dplyr)
# goby_master <- goby_master %>%
# mutate(val= (pmax(Water_temp_1, Water_temp_2, na.rm=TRUE)+
# pmin(Water_temp_1, Water_temp_2, na.rm=TRUE))/2)
# Works fine for mean temp
goby_master <- goby_master %>% mutate(temp_mean = rowMeans(across(starts_with("Water_temp")), na.rm=TRUE))
#plot(goby_master$val, goby_master$temp_mean)
#minimum DO
goby_master$min_DO <- do.call(pmin, c(goby_master[,c("DO_1", "DO_2")], na.rm=TRUE))
#View(goby_master)
#join rainfall data by year
library(readr)
rainfall_data <- read_csv("Data/rainfall_data.csv")
rainfall_data
rainfall_data <- rainfall_data[,-c(4:15)]
goby_master_1 <- left_join(goby_master, rainfall_data, by = "Year")
hist(rainfall_data$Rain_Sum)
#join breach data based on year
library(tidyverse)
breach_data <- read_csv("Data/breach_data.csv",
col_types = cols(Date_Latest_Breach = col_date(format = "%m/%d/%Y")))
goby_master_2 <- left_join(goby_master_1, breach_data, by = "Year")
#create new column for Date.x - Date_Latest_Breach
goby_master_2$breach_days <- goby_master_2$Date.x - goby_master_2$Date_Latest_Breach
goby_master_2$breach_days <- as.numeric(goby_master_2$breach_days)
#remove days inside value or use lubridate package for calculating difference in days
hist(goby_master_2$breach_days)
#change "muck (organic)" to "muck"
goby_master_2$Dom_substrate[goby_master_2$Dom_substrate == 'muck (organic)'] <- 'muck'
goby_master_2$Subdom_substrate[goby_master_2$Subdom_substrate == 'muck (organic)'] <- 'muck'
#calculate density for each species (SC, SB, TW)
goby_master_2$TW_density <- (goby_master_2$Sum_TW/goby_master_2$volume)
goby_master_2$SC_density <- (goby_master_2$Sum_SC/goby_master_2$volume)
goby_master_2$SB_density <- (goby_master_2$Sum_SB/goby_master_2$volume)
#View(goby_master_2)
str(goby_master_2)
#calculate years since latest breach
goby_master_2$Breach_Year <- format.Date(as.Date(goby_master_2$Date_Latest_Breach, format="%d/%m/%Y"),"%Y")
goby_master_2$Since_Breach <- as.numeric(goby_master_2$Year) - as.numeric(goby_master_2$Breach_Year)
# since no data 2014. We know that there was a breach on 2013-12-26 so give 2014 a "0"
# since no data for 2010 we will give it a "0.5" since there was a breach in feb 2009.
goby_master_2$Since_Breach <- ifelse(goby_master_2$Year == 2014, 0, goby_master_2$Since_Breach)
goby_master_2$Since_Breach <- ifelse(goby_master_2$Year == 2010, 0.5, goby_master_2$Since_Breach)
#calculate total number of breach days per year (import from excel)
## 2023-06-21
## BB import total_breaches from new google folder...
### 2023-06-21
### NEED TOTAL BREACH DATA
library(tidyverse)
goby_master_2 <- left_join(goby_master_2, total_breaches, by = "Year")
#get stats for each variable
#Date.x
summary(goby_master_2$Date.x)
sum(is.na(goby_master_2$Date.x))
list(goby_master_2$Date.x)
hist(goby_master_2$Date.x, breaks=30)
#Zone
summary(goby_master_2$Zone)
sum(is.na(goby_master_2$Zone))
tabledata <- table(goby_master_2$Zone)
barplot(tabledata)
#Dom_substrate
library(dplyr)
goby_master_2 %>%
count(Dom_substrate)
sum(is.na(goby_master_2$Dom_substrate))
list(goby_master_2$Dom_substrate)
tabledata <- table(goby_master_2$Dom_substrate)
barplot(tabledata)
#SAV
summary(goby_master_2$SAV)
goby_master_2 %>%
count(SAV)
hist(goby_master_2$SAV)
#Season
goby_master_2 %>%
count(Season)
sum(is.na(goby_master_2$Season))
#Sum_TW
summary(goby_master_2$Sum_TW)
sum(is.na(goby_master_2$Sum_TW))
hist(goby_master_2$Sum_TW)
#Sum_SC
summary(goby_master_2$Sum_SC)
sum(is.na(goby_master_2$Sum_SC))
hist(goby_master_2$Sum_SC)
#Sum_SB
summary(goby_master_2$Sum_SB)
sum(is.na(goby_master_2$Sum_SB))
hist(goby_master_2$Sum_SB)
#Volume
summary(goby_master_2$volume)
sum(is.na(goby_master_2$volume))
hist(goby_master_2$volume)
#temp_mean
summary(goby_master_2$temp_mean)
hist(goby_master_2$temp_mean)
#min_DO
summary(goby_master_2$min_DO)
hist(goby_master_2$min_DO)
#rain_sum
summary(goby_master_2$Rain_Sum)
sum(is.na(goby_master_2$Rain_Sum))
hist(goby_master_2$Rain_Sum)
#breach_days
summary(goby_master_2$breach_days)
sum(is.na(goby_master_2$breach_days))
hist(goby_master_2$breach_days)
#TW_density
summary(goby_master_2$TW_density)
sum(is.na(goby_master_2$TW_density))
hist(goby_master_2$TW_density)
#SC_density
summary(goby_master_2$SC_density)
sum(is.na(goby_master_2$SC_density))
hist(goby_master_2$SC_density)
#SB_density
summary(goby_master_2$SB_density)
sum(is.na(goby_master_2$SB_density))
hist(goby_master_2$SB_density)
write.csv(goby_master_2, "/Users/Thuy-Tien/R files/goby/goby_master_2.csv", row.names=FALSE)
write.csv(goby_master_2, "data/goby_master_2.csv", row.names=FALSE)
#####OK goby_master ready for analysis -----------------------------------------
## wind data
## https://coastwatch.pfeg.noaa.gov/erddap/griddap/erdlasFnWPr.html
##double checked that latlon was just offshore frmo rodeo lagoon. (app wants lat = 37.83, long east = 360-122.55 = 237.45)
wind <- read_csv("Data/Rodeo_Wind_37.83_-122.55_1994-2023_monthly_from ERDDAP_Upwelling.csv")
ggplot(wind,aes(time, u_mean_ms)) +
geom_point() +
geom_smooth()
head(wind)
library(lubridate)
## get mean u component of wind (e-w) for April - Sept each year
wind$Year <- year(wind$time)
wind$Month <- month(wind$time)
wind_A <- wind %>% filter(Month > 3 & Month < 10)
## get mean and max for each U for each year
wind_mean <- wind_A %>%
group_by(Year) %>%
summarize(u_mean = mean(u_mean_ms))
wind_max <- wind_A %>%
group_by(Year) %>%
summarize(u_max = max(u_mean_ms))
par(mfrow = c(1,2))
hist(wind_mean$u_mean)
hist(wind_max$u_max)
wind_ready <- cbind(wind_mean, wind_max)
wind_ready <- wind_ready[,-3]
## attach to goby_master_2
library(plyr)
goby_master_3 <- left_join(goby_master_2, wind_ready, by = "Year")