update read_enterodata to include bay segment, change column order output, update enterodata
tbep-tech · Aug 13, 2024 · c10efa2 · c10efa2
1 parent 74cd215
commit c10efa2
Show file tree

Hide file tree

Showing 8 changed files with 111 additions and 83 deletions.
diff --git a/R/enterodata.R b/R/enterodata.R
@@ -1,20 +1,24 @@
 #' Enterococcus data from 53 key Enterococcus stations since 1995
 #'
-#' @format A data frame with 6207 rows and 12 columns:
+#' @format A data frame with 6266 rows and 16 columns:
 #' \describe{
-#'  \item{date}{data, sample date}
-#'  \item{station}{character, sample station, as named in the Water Quality Portal}
-#'  \item{ecocci}{numeric, sample's Enterococcus concentration}
-#'  \item{ecocci_censored}{logical, whether sample concentration was censored (below detection limit)}
-#'  \item{Latitude}{numeric, latitude in decimal degrees}
-#'  \item{Longitude}{numeric, longitude in decimal degrees}
-#'  \item{ecocci_units}{character, units of measurement of Enterococcus}
-#'  \item{time}{character, time sample was obtained, in 24hr format}
-#'  \item{time_zone}{character, time zone of sample time}
-#'  \item{MDL}{numeric, minimum detection limit of laboratory for Enterococcus}
-#'  \item{yr}{numeric year of sample date}
-#'  \item{mo}{numeric month of sample date}
-#' }
+#'  \item{\code{date}}{date, sample date}
+#'  \item{\code{yr}}{numeric, year of sample date}
+#'  \item{\code{mo}}{numeric, month of sample date}
+#'  \item{\code{time}}{character, sample time}
+#'  \item{\code{time_zone}}{character, sample time zone}
+#'  \item{\code{long_name}}{character, long name of bay segment subwatershed}
+#'  \item{\code{bay_segment}}{character, short name of bay segment subwatershed}
+#'  \item{\code{station}}{character, sample station}
+#'  \item{\code{ecocci}}{numeric, Enterococcus concentration}
+#'  \item{\code{ecocci_censored}}{logical, whether \code{ecocci} value was at or below the laboratory \code{MDL}, minimum detection limit}
+#'  \item{\code{MDL}}{numeric, minimum detection limit at the time of processing}
+#'  \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}}
+#'  \item{\code{qualifier}}{qualifier codes associated with sample}
+#'  \item{\code{LabComments}}{lab comments on sample}
+#'  \item{\code{Latitude}}{numeric, latitude in decimal degrees}
+#'  \item{\code{Longitude}}{numeric, longitude in decimal degrees}
+#'  }
 #' @details
 #' A sample dataset containing Enterococcus from 53 stations in the TBEP watershed from 1995-2023. Generated by \code{data-raw/enterodata-raw.R} (view on github: \url{https://github.com/tbep-tech/tbeptools/blob/master/data-raw/enterodata-raw.R})
 #' @source Water Quality Portal, \url{https://waterqualitydata.us}

diff --git a/R/read_importentero.R b/R/read_importentero.R
@@ -9,19 +9,21 @@
 #' @return a data frame containing one row for each sample. Columns returned are:
 #' \describe{
 #'  \item{\code{date}}{date, sample date}
+#'  \item{\code{yr}}{numeric, year of sample date}
+#'  \item{\code{mo}}{numeric, month of sample date}
+#'  \item{\code{time}}{character, sample time}
+#'  \item{\code{time_zone}}{character, sample time zone}
+#'  \item{\code{long_name}}{character, long name of bay segment subwatershed}
+#'  \item{\code{bay_segment}}{character, short name of bay segment subwatershed}
 #'  \item{\code{station}}{character, sample station}
 #'  \item{\code{ecocci}}{numeric, Enterococcus concentration}
 #'  \item{\code{ecocci_censored}}{logical, whether \code{ecocci} value was at or below the laboratory \code{MDL}, minimum detection limit}
+#'  \item{\code{MDL}}{numeric, minimum detection limit at the time of processing}
 #'  \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}}
 #'  \item{\code{qualifier}}{qualifier codes associated with sample}
 #'  \item{\code{LabComments}}{lab comments on sample}
 #'  \item{\code{Latitude}}{numeric, latitude in decimal degrees}
 #'  \item{\code{Longitude}}{numeric, longitude in decimal degrees}
-#'  \item{\code{time}}{character, sample time}
-#'  \item{\code{time_zone}}{character, sample time zone}
-#'  \item{\code{MDL}}{numeric, minimum detection limit for the lab and time the sample was analyzed}
-#'  \item{\code{yr}}{numeric, year of sample date}
-#'  \item{\code{mo}}{numeric, month of sample date}
 #'  }
 #'
 #' @importFrom dplyr %>%
@@ -30,12 +32,12 @@
 #' @examples
 #' \dontrun{
 #' # stations to download
-#' stations <- c('21FLHILL_WQX-101',
+#' stas <- c('21FLHILL_WQX-101',
 #' '21FLHILL_WQX-102',
 #' '21FLHILL_WQX-103')
 #'
 #' # download and read the data
-#' entero_in <- read_importentero(stas = stations, startDate = '2023-01-01', endDate = '2023-02-01')
+#' entero_in <- read_importentero(stas = stas, startDate = '2023-01-01', endDate = '2023-02-01')
 #'
 #' head(entero_in)
 #'
@@ -44,21 +46,20 @@ read_importentero <- function(stas = NULL, startDate, endDate){
 
   # default to all stations if not specified
   if(is.null(stas))
-    stations <- unique(catchprecip$station)
+    stas <- unique(catchprecip$station)
 
   entero_names <- c('Enterococci',
                     'Enterococcus')
   startDate <- as.Date(startDate)
   endDate <- as.Date(endDate)
 
   args <- list(
-    siteid = stations,
+    siteid = stas,
     characteristicName = entero_names,
     startDateLo = format(startDate, '%m-%d-%Y'),
     startDateHi = format(endDate, '%m-%d-%Y')
   )
 
-
   # generate the parts
   # a weakness here is building the '&' into everything but siteid -
   # this basically means everything is required in the proper order
@@ -81,33 +82,49 @@ read_importentero <- function(stas = NULL, startDate, endDate){
 
   # download and read in the file
   tmp1 <- tempfile()
-  download.file(url = url_full, destfile = tmp1, method = 'curl')
-  dat <- read.csv(tmp1)
+  download.file(url = url_full, destfile = tmp1, method = 'libcurl', quiet = TRUE)
+  datraw <- suppressWarnings(read.csv(tmp1))
   unlink(tmp1)
 
-
   # select columns
-  dat2 <- dat %>%
-    dplyr::select(station = MonitoringLocationIdentifier,
-                  Latitude = ActivityLocation.LatitudeMeasure,
-                  Longitude = ActivityLocation.LongitudeMeasure,
-                  ecocci = ResultMeasureValue, # - the result (has characters in here too - 'Not Reported')
-                  ecocci_units = ResultMeasure.MeasureUnitCode,
-                  qualifier = MeasureQualifierCode,
-                  date = ActivityStartDate,
-                  time = ActivityStartTime.Time, # local time
-                  time_zone = ActivityStartTime.TimeZoneCode,
-                  MDL = DetectionQuantitationLimitMeasure.MeasureValue,
-                  LabComments = ResultLaboratoryCommentText) %>%
-    dplyr::filter(ecocci != 'Not Reported') %>%
-    dplyr::mutate(ecocci = as.numeric(ecocci),
-                  ecocci_censored = dplyr::case_when(ecocci <= MDL ~ TRUE,
-                                                     .default = FALSE),
-                  date = as.Date(date),
-                  yr = lubridate::year(date),
-                  mo = lubridate::month(date)) %>%
-    dplyr::relocate(date, station, ecocci, ecocci_censored, ecocci_units, qualifier, LabComments) %>%
+  dat <- datraw %>%
+    dplyr::select(
+      station = MonitoringLocationIdentifier,
+      Latitude = ActivityLocation.LatitudeMeasure,
+      Longitude = ActivityLocation.LongitudeMeasure,
+      ecocci = ResultMeasureValue, # - the result (has characters in here too - 'Not Reported')
+      ecocci_units = ResultMeasure.MeasureUnitCode,
+      qualifier = MeasureQualifierCode,
+      date = ActivityStartDate,
+      time = ActivityStartTime.Time, # local time
+      time_zone = ActivityStartTime.TimeZoneCode,
+      MDL = DetectionQuantitationLimitMeasure.MeasureValue,
+      LabComments = ResultLaboratoryCommentText
+    ) %>%
+    dplyr::mutate(
+      ecocci = dplyr::case_when(
+        ecocci %in% c('*Non-detect', '*Not Reported', 'Not Reported', '*Present >QL') ~ NA_character_,
+        TRUE ~ ecocci
+      ),
+      ecocci = as.numeric(ecocci),
+      ecocci_censored = dplyr::case_when(
+        ecocci <= MDL ~ TRUE,
+        .default = FALSE
+      ),
+      date = as.Date(date),
+      yr = lubridate::year(date),
+      mo = lubridate::month(date)
+      ) %>%
     dplyr::arrange(station, date)
 
-  return(dat2)
+  # add subsegment basin
+  out <- dat %>%
+    sf::st_as_sf(coords = c('Longitude', 'Latitude'), crs = sf::st_crs(tbsegshed), remove = FALSE)
+  out <- suppressWarnings(sf::st_intersection(out, tbsegshed))
+  out <- out %>%
+    sf::st_set_geometry(NULL) %>%
+    dplyr::relocate(date, yr, mo, time, time_zone, long_name, bay_segment, station, ecocci, ecocci_censored, MDL, ecocci_units, qualifier, LabComments)
+
+  return(out)
+
 }
diff --git a/data-raw/enterodata-raw.R b/data-raw/enterodata-raw.R
@@ -2,8 +2,6 @@
 # although apparently data collection only started in 2000 at the earliest of these stations
 library(here)
 
-enterodata <- read_importentero(startDate = '1995-01-01', endDate = '2023-12-31') %>%
-  dplyr::select(-qualifier,
-                -LabComments)
+enterodata <- read_importentero(startDate = '1995-01-01', endDate = '2023-12-31')
 
 save(enterodata, file = here('data/enterodata.RData'), compress = 'xz')
diff --git a/data/enterodata.RData b/data/enterodata.RData
diff --git a/data/tbsegshed.RData b/data/tbsegshed.RData
diff --git a/man/enterodata.Rd b/man/enterodata.Rd
diff --git a/man/read_importentero.Rd b/man/read_importentero.Rd
diff --git a/tests/testthat/test-read_importentero.R b/tests/testthat/test-read_importentero.R
@@ -3,8 +3,8 @@ library(mockery)
 # Mock data for read.csv
 mock_data <- data.frame(
   MonitoringLocationIdentifier = c("station1", "station2"),
-  ActivityLocation.LatitudeMeasure = c(27.123, 27.456),
-  ActivityLocation.LongitudeMeasure = c(-82.123, -82.456),
+  ActivityLocation.LatitudeMeasure = c(27.8893, 27.8589),
+  ActivityLocation.LongitudeMeasure = c(-82.4774, -82.4686),
   ResultMeasureValue = c("100", "200"),
   ResultMeasure.MeasureUnitCode = c("cfu/100mL", "cfu/100mL"),
   MeasureQualifierCode = c("", ""),
@@ -25,27 +25,29 @@ test_that("read_importentero works correctly", {
   mock_read_csv <- mock(return_value = mock_data)
   stub(read_importentero, "read.csv", mock_read_csv)
 
-  stations <- c("21FLHILL_WQX-101", "21FLHILL_WQX-102")
+  stas <- c("21FLHILL_WQX-101", "21FLHILL_WQX-102")
 
   # Call the function with mocked dependencies
-  result <- read_importentero(stas = stations, startDate = "2023-01-01", endDate = "2023-12-31")
+  result <- read_importentero(stas = stas, startDate = "2023-01-01", endDate = "2023-12-31")
 
   # Define expected output
   expected_output <- data.frame(
     date = as.Date(c("2023-01-01", "2023-01-02")),
+    yr = c(2023, 2023),
+    mo = c(1, 1),
+    time = c("10:00", "11:00"),
+    time_zone = c("EST", "EST"),
+    long_name = c("Hillsborough Bay", "Hillsborough Bay"),
+    bay_segment = c("HB", "HB"),
     station = c("station1", "station2"),
     ecocci = c(100, 200),
     ecocci_censored = c(FALSE, FALSE),
+    MDL = c(5, 5),
     ecocci_units = c("cfu/100mL", "cfu/100mL"),
     qualifier = c("", ""),
     LabComments = c("No issues", "No issues"),
-    Latitude = c(27.123, 27.456),
-    Longitude = c(-82.123, -82.456),
-    time = c("10:00", "11:00"),
-    time_zone = c("EST", "EST"),
-    MDL = c(5, 5),
-    yr = c(2023, 2023),
-    mo = c(1, 1)
+    Latitude = c(27.8893, 27.8589),
+    Longitude = c(-82.4774, -82.4686)
   )
 
   # Check if the result matches the expected output
@@ -54,4 +56,5 @@ test_that("read_importentero works correctly", {
   # Verify that the mock functions were called as expected
   expect_called(mock_download, 1)
   expect_called(mock_read_csv, 1)
+
 })