diff --git a/R/enterodata.R b/R/enterodata.R index 7cfde702..53f28fa0 100644 --- a/R/enterodata.R +++ b/R/enterodata.R @@ -1,20 +1,24 @@ #' Enterococcus data from 53 key Enterococcus stations since 1995 #' -#' @format A data frame with 6207 rows and 12 columns: +#' @format A data frame with 6266 rows and 16 columns: #' \describe{ -#' \item{date}{data, sample date} -#' \item{station}{character, sample station, as named in the Water Quality Portal} -#' \item{ecocci}{numeric, sample's Enterococcus concentration} -#' \item{ecocci_censored}{logical, whether sample concentration was censored (below detection limit)} -#' \item{Latitude}{numeric, latitude in decimal degrees} -#' \item{Longitude}{numeric, longitude in decimal degrees} -#' \item{ecocci_units}{character, units of measurement of Enterococcus} -#' \item{time}{character, time sample was obtained, in 24hr format} -#' \item{time_zone}{character, time zone of sample time} -#' \item{MDL}{numeric, minimum detection limit of laboratory for Enterococcus} -#' \item{yr}{numeric year of sample date} -#' \item{mo}{numeric month of sample date} -#' } +#' \item{\code{date}}{date, sample date} +#' \item{\code{yr}}{numeric, year of sample date} +#' \item{\code{mo}}{numeric, month of sample date} +#' \item{\code{time}}{character, sample time} +#' \item{\code{time_zone}}{character, sample time zone} +#' \item{\code{long_name}}{character, long name of bay segment subwatershed} +#' \item{\code{bay_segment}}{character, short name of bay segment subwatershed} +#' \item{\code{station}}{character, sample station} +#' \item{\code{ecocci}}{numeric, Enterococcus concentration} +#' \item{\code{ecocci_censored}}{logical, whether \code{ecocci} value was below the laboratory \code{MDL}, minimum detection limit} +#' \item{\code{MDL}}{numeric, minimum detection limit at the time of processing} +#' \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}} +#' \item{\code{qualifier}}{qualifier codes associated with sample} +#' \item{\code{LabComments}}{lab comments on sample} +#' \item{\code{Latitude}}{numeric, latitude in decimal degrees} +#' \item{\code{Longitude}}{numeric, longitude in decimal degrees} +#' } #' @details #' A sample dataset containing Enterococcus from 53 stations in the TBEP watershed from 1995-2023. Generated by \code{data-raw/enterodata-raw.R} (view on github: \url{https://github.com/tbep-tech/tbeptools/blob/master/data-raw/enterodata-raw.R}) #' @source Water Quality Portal, \url{https://waterqualitydata.us} diff --git a/R/read_importentero.R b/R/read_importentero.R index a9672340..5fedcc0f 100644 --- a/R/read_importentero.R +++ b/R/read_importentero.R @@ -9,19 +9,21 @@ #' @return a data frame containing one row for each sample. Columns returned are: #' \describe{ #' \item{\code{date}}{date, sample date} +#' \item{\code{yr}}{numeric, year of sample date} +#' \item{\code{mo}}{numeric, month of sample date} +#' \item{\code{time}}{character, sample time} +#' \item{\code{time_zone}}{character, sample time zone} +#' \item{\code{long_name}}{character, long name of bay segment subwatershed} +#' \item{\code{bay_segment}}{character, short name of bay segment subwatershed} #' \item{\code{station}}{character, sample station} #' \item{\code{ecocci}}{numeric, Enterococcus concentration} #' \item{\code{ecocci_censored}}{logical, whether \code{ecocci} value was below the laboratory \code{MDL}, minimum detection limit} +#' \item{\code{MDL}}{numeric, minimum detection limit at the time of processing} #' \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}} #' \item{\code{qualifier}}{qualifier codes associated with sample} #' \item{\code{LabComments}}{lab comments on sample} #' \item{\code{Latitude}}{numeric, latitude in decimal degrees} #' \item{\code{Longitude}}{numeric, longitude in decimal degrees} -#' \item{\code{time}}{character, sample time} -#' \item{\code{time_zone}}{character, sample time zone} -#' \item{\code{MDL}}{numeric, minimum detection limit for the lab and time the sample was analyzed} -#' \item{\code{yr}}{numeric, year of sample date} -#' \item{\code{mo}}{numeric, month of sample date} #' } #' #' @importFrom dplyr %>% @@ -30,12 +32,12 @@ #' @examples #' \dontrun{ #' # stations to download -#' stations <- c('21FLHILL_WQX-101', +#' stas <- c('21FLHILL_WQX-101', #' '21FLHILL_WQX-102', #' '21FLHILL_WQX-103') #' #' # download and read the data -#' entero_in <- read_importentero(stas = stations, startDate = '2023-01-01', endDate = '2023-02-01') +#' entero_in <- read_importentero(stas = stas, startDate = '2023-01-01', endDate = '2023-02-01') #' #' head(entero_in) #' @@ -44,7 +46,7 @@ read_importentero <- function(stas = NULL, startDate, endDate){ # default to all stations if not specified if(is.null(stas)) - stations <- unique(catchprecip$station) + stas <- unique(catchprecip$station) entero_names <- c('Enterococci', 'Enterococcus') @@ -52,13 +54,12 @@ read_importentero <- function(stas = NULL, startDate, endDate){ endDate <- as.Date(endDate) args <- list( - siteid = stations, + siteid = stas, characteristicName = entero_names, startDateLo = format(startDate, '%m-%d-%Y'), startDateHi = format(endDate, '%m-%d-%Y') ) - # generate the parts # a weakness here is building the '&' into everything but siteid - # this basically means everything is required in the proper order @@ -81,33 +82,49 @@ read_importentero <- function(stas = NULL, startDate, endDate){ # download and read in the file tmp1 <- tempfile() - download.file(url = url_full, destfile = tmp1, method = 'curl') - dat <- read.csv(tmp1) + download.file(url = url_full, destfile = tmp1, method = 'libcurl', quiet = TRUE) + datraw <- suppressWarnings(read.csv(tmp1)) unlink(tmp1) - # select columns - dat2 <- dat %>% - dplyr::select(station = MonitoringLocationIdentifier, - Latitude = ActivityLocation.LatitudeMeasure, - Longitude = ActivityLocation.LongitudeMeasure, - ecocci = ResultMeasureValue, # - the result (has characters in here too - 'Not Reported') - ecocci_units = ResultMeasure.MeasureUnitCode, - qualifier = MeasureQualifierCode, - date = ActivityStartDate, - time = ActivityStartTime.Time, # local time - time_zone = ActivityStartTime.TimeZoneCode, - MDL = DetectionQuantitationLimitMeasure.MeasureValue, - LabComments = ResultLaboratoryCommentText) %>% - dplyr::filter(ecocci != 'Not Reported') %>% - dplyr::mutate(ecocci = as.numeric(ecocci), - ecocci_censored = dplyr::case_when(ecocci <= MDL ~ TRUE, - .default = FALSE), - date = as.Date(date), - yr = lubridate::year(date), - mo = lubridate::month(date)) %>% - dplyr::relocate(date, station, ecocci, ecocci_censored, ecocci_units, qualifier, LabComments) %>% + dat <- datraw %>% + dplyr::select( + station = MonitoringLocationIdentifier, + Latitude = ActivityLocation.LatitudeMeasure, + Longitude = ActivityLocation.LongitudeMeasure, + ecocci = ResultMeasureValue, # - the result (has characters in here too - 'Not Reported') + ecocci_units = ResultMeasure.MeasureUnitCode, + qualifier = MeasureQualifierCode, + date = ActivityStartDate, + time = ActivityStartTime.Time, # local time + time_zone = ActivityStartTime.TimeZoneCode, + MDL = DetectionQuantitationLimitMeasure.MeasureValue, + LabComments = ResultLaboratoryCommentText + ) %>% + dplyr::mutate( + ecocci = dplyr::case_when( + ecocci %in% c('*Non-detect', '*Not Reported', 'Not Reported', '*Present >QL') ~ NA_character_, + TRUE ~ ecocci + ), + ecocci = as.numeric(ecocci), + ecocci_censored = dplyr::case_when( + ecocci <= MDL ~ TRUE, + .default = FALSE + ), + date = as.Date(date), + yr = lubridate::year(date), + mo = lubridate::month(date) + ) %>% dplyr::arrange(station, date) - return(dat2) + # add subsegment basin + out <- dat %>% + sf::st_as_sf(coords = c('Longitude', 'Latitude'), crs = sf::st_crs(tbsegshed), remove = FALSE) + out <- suppressWarnings(sf::st_intersection(out, tbsegshed)) + out <- out %>% + sf::st_set_geometry(NULL) %>% + dplyr::relocate(date, yr, mo, time, time_zone, long_name, bay_segment, station, ecocci, ecocci_censored, MDL, ecocci_units, qualifier, LabComments) + + return(out) + } diff --git a/data-raw/enterodata-raw.R b/data-raw/enterodata-raw.R index d86a2c80..bed35153 100644 --- a/data-raw/enterodata-raw.R +++ b/data-raw/enterodata-raw.R @@ -2,8 +2,6 @@ # although apparently data collection only started in 2000 at the earliest of these stations library(here) -enterodata <- read_importentero(startDate = '1995-01-01', endDate = '2023-12-31') %>% - dplyr::select(-qualifier, - -LabComments) +enterodata <- read_importentero(startDate = '1995-01-01', endDate = '2023-12-31') save(enterodata, file = here('data/enterodata.RData'), compress = 'xz') diff --git a/data/enterodata.RData b/data/enterodata.RData index bc1d3380..cb762cc0 100644 Binary files a/data/enterodata.RData and b/data/enterodata.RData differ diff --git a/data/tbsegshed.RData b/data/tbsegshed.RData index d54ae8a5..80334122 100644 Binary files a/data/tbsegshed.RData and b/data/tbsegshed.RData differ diff --git a/man/enterodata.Rd b/man/enterodata.Rd index f8605246..ec207894 100644 --- a/man/enterodata.Rd +++ b/man/enterodata.Rd @@ -5,21 +5,25 @@ \alias{enterodata} \title{Enterococcus data from 53 key Enterococcus stations since 1995} \format{ -A data frame with 6207 rows and 12 columns: +A data frame with 6266 rows and 16 columns: \describe{ - \item{date}{data, sample date} - \item{station}{character, sample station, as named in the Water Quality Portal} - \item{ecocci}{numeric, sample's Enterococcus concentration} - \item{ecocci_censored}{logical, whether sample concentration was censored (below detection limit)} - \item{Latitude}{numeric, latitude in decimal degrees} - \item{Longitude}{numeric, longitude in decimal degrees} - \item{ecocci_units}{character, units of measurement of Enterococcus} - \item{time}{character, time sample was obtained, in 24hr format} - \item{time_zone}{character, time zone of sample time} - \item{MDL}{numeric, minimum detection limit of laboratory for Enterococcus} - \item{yr}{numeric year of sample date} - \item{mo}{numeric month of sample date} -} + \item{\code{date}}{date, sample date} + \item{\code{yr}}{numeric, year of sample date} + \item{\code{mo}}{numeric, month of sample date} + \item{\code{time}}{character, sample time} + \item{\code{time_zone}}{character, sample time zone} + \item{\code{long_name}}{character, long name of bay segment subwatershed} + \item{\code{bay_segment}}{character, short name of bay segment subwatershed} + \item{\code{station}}{character, sample station} + \item{\code{ecocci}}{numeric, Enterococcus concentration} + \item{\code{ecocci_censored}}{logical, whether \code{ecocci} value was below the laboratory \code{MDL}, minimum detection limit} + \item{\code{MDL}}{numeric, minimum detection limit at the time of processing} + \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}} + \item{\code{qualifier}}{qualifier codes associated with sample} + \item{\code{LabComments}}{lab comments on sample} + \item{\code{Latitude}}{numeric, latitude in decimal degrees} + \item{\code{Longitude}}{numeric, longitude in decimal degrees} + } } \source{ Water Quality Portal, \url{https://waterqualitydata.us} diff --git a/man/read_importentero.Rd b/man/read_importentero.Rd index 80c78d72..ba177e6c 100644 --- a/man/read_importentero.Rd +++ b/man/read_importentero.Rd @@ -17,19 +17,21 @@ read_importentero(stas = NULL, startDate, endDate) a data frame containing one row for each sample. Columns returned are: \describe{ \item{\code{date}}{date, sample date} + \item{\code{yr}}{numeric, year of sample date} + \item{\code{mo}}{numeric, month of sample date} + \item{\code{time}}{character, sample time} + \item{\code{time_zone}}{character, sample time zone} + \item{\code{long_name}}{character, long name of bay segment subwatershed} + \item{\code{bay_segment}}{character, short name of bay segment subwatershed} \item{\code{station}}{character, sample station} \item{\code{ecocci}}{numeric, Enterococcus concentration} \item{\code{ecocci_censored}}{logical, whether \code{ecocci} value was below the laboratory \code{MDL}, minimum detection limit} + \item{\code{MDL}}{numeric, minimum detection limit at the time of processing} \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}} \item{\code{qualifier}}{qualifier codes associated with sample} \item{\code{LabComments}}{lab comments on sample} \item{\code{Latitude}}{numeric, latitude in decimal degrees} \item{\code{Longitude}}{numeric, longitude in decimal degrees} - \item{\code{time}}{character, sample time} - \item{\code{time_zone}}{character, sample time zone} - \item{\code{MDL}}{numeric, minimum detection limit for the lab and time the sample was analyzed} - \item{\code{yr}}{numeric, year of sample date} - \item{\code{mo}}{numeric, month of sample date} } } \description{ @@ -41,12 +43,12 @@ Retrieves Enterococcus sample data from selected stations and date range from th \examples{ \dontrun{ # stations to download -stations <- c('21FLHILL_WQX-101', +stas <- c('21FLHILL_WQX-101', '21FLHILL_WQX-102', '21FLHILL_WQX-103') # download and read the data -entero_in <- read_importentero(stas = stations, startDate = '2023-01-01', endDate = '2023-02-01') +entero_in <- read_importentero(stas = stas, startDate = '2023-01-01', endDate = '2023-02-01') head(entero_in) diff --git a/tests/testthat/test-read_importentero.R b/tests/testthat/test-read_importentero.R index f54aceb6..f062f388 100644 --- a/tests/testthat/test-read_importentero.R +++ b/tests/testthat/test-read_importentero.R @@ -3,8 +3,8 @@ library(mockery) # Mock data for read.csv mock_data <- data.frame( MonitoringLocationIdentifier = c("station1", "station2"), - ActivityLocation.LatitudeMeasure = c(27.123, 27.456), - ActivityLocation.LongitudeMeasure = c(-82.123, -82.456), + ActivityLocation.LatitudeMeasure = c(27.8893, 27.8589), + ActivityLocation.LongitudeMeasure = c(-82.4774, -82.4686), ResultMeasureValue = c("100", "200"), ResultMeasure.MeasureUnitCode = c("cfu/100mL", "cfu/100mL"), MeasureQualifierCode = c("", ""), @@ -25,27 +25,29 @@ test_that("read_importentero works correctly", { mock_read_csv <- mock(return_value = mock_data) stub(read_importentero, "read.csv", mock_read_csv) - stations <- c("21FLHILL_WQX-101", "21FLHILL_WQX-102") + stas <- c("21FLHILL_WQX-101", "21FLHILL_WQX-102") # Call the function with mocked dependencies - result <- read_importentero(stas = stations, startDate = "2023-01-01", endDate = "2023-12-31") + result <- read_importentero(stas = stas, startDate = "2023-01-01", endDate = "2023-12-31") # Define expected output expected_output <- data.frame( date = as.Date(c("2023-01-01", "2023-01-02")), + yr = c(2023, 2023), + mo = c(1, 1), + time = c("10:00", "11:00"), + time_zone = c("EST", "EST"), + long_name = c("Hillsborough Bay", "Hillsborough Bay"), + bay_segment = c("HB", "HB"), station = c("station1", "station2"), ecocci = c(100, 200), ecocci_censored = c(FALSE, FALSE), + MDL = c(5, 5), ecocci_units = c("cfu/100mL", "cfu/100mL"), qualifier = c("", ""), LabComments = c("No issues", "No issues"), - Latitude = c(27.123, 27.456), - Longitude = c(-82.123, -82.456), - time = c("10:00", "11:00"), - time_zone = c("EST", "EST"), - MDL = c(5, 5), - yr = c(2023, 2023), - mo = c(1, 1) + Latitude = c(27.8893, 27.8589), + Longitude = c(-82.4774, -82.4686) ) # Check if the result matches the expected output @@ -54,4 +56,5 @@ test_that("read_importentero works correctly", { # Verify that the mock functions were called as expected expect_called(mock_download, 1) expect_called(mock_read_csv, 1) + })