Skip to content

Commit

Permalink
update read_enterodata to include bay segment, change column order ou…
Browse files Browse the repository at this point in the history
…tput, update enterodata
  • Loading branch information
fawda123 committed Aug 13, 2024
1 parent 74cd215 commit c10efa2
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 83 deletions.
32 changes: 18 additions & 14 deletions R/enterodata.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
#' Enterococcus data from 53 key Enterococcus stations since 1995
#'
#' @format A data frame with 6207 rows and 12 columns:
#' @format A data frame with 6266 rows and 16 columns:
#' \describe{
#' \item{date}{data, sample date}
#' \item{station}{character, sample station, as named in the Water Quality Portal}
#' \item{ecocci}{numeric, sample's Enterococcus concentration}
#' \item{ecocci_censored}{logical, whether sample concentration was censored (below detection limit)}
#' \item{Latitude}{numeric, latitude in decimal degrees}
#' \item{Longitude}{numeric, longitude in decimal degrees}
#' \item{ecocci_units}{character, units of measurement of Enterococcus}
#' \item{time}{character, time sample was obtained, in 24hr format}
#' \item{time_zone}{character, time zone of sample time}
#' \item{MDL}{numeric, minimum detection limit of laboratory for Enterococcus}
#' \item{yr}{numeric year of sample date}
#' \item{mo}{numeric month of sample date}
#' }
#' \item{\code{date}}{date, sample date}
#' \item{\code{yr}}{numeric, year of sample date}
#' \item{\code{mo}}{numeric, month of sample date}
#' \item{\code{time}}{character, sample time}
#' \item{\code{time_zone}}{character, sample time zone}
#' \item{\code{long_name}}{character, long name of bay segment subwatershed}
#' \item{\code{bay_segment}}{character, short name of bay segment subwatershed}
#' \item{\code{station}}{character, sample station}
#' \item{\code{ecocci}}{numeric, Enterococcus concentration}
#' \item{\code{ecocci_censored}}{logical, whether the \code{ecocci} value was at or below the laboratory minimum detection limit (\code{MDL})}
#' \item{\code{MDL}}{numeric, minimum detection limit at the time of processing}
#' \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}}
#' \item{\code{qualifier}}{qualifier codes associated with sample}
#' \item{\code{LabComments}}{lab comments on sample}
#' \item{\code{Latitude}}{numeric, latitude in decimal degrees}
#' \item{\code{Longitude}}{numeric, longitude in decimal degrees}
#' }
#' @details
#' A sample dataset containing Enterococcus data from 53 stations in the TBEP watershed from 1995-2023. Generated by \code{data-raw/enterodata-raw.R} (view on github: \url{https://github.com/tbep-tech/tbeptools/blob/master/data-raw/enterodata-raw.R})
#' @source Water Quality Portal, \url{https://waterqualitydata.us}
Expand Down
85 changes: 51 additions & 34 deletions R/read_importentero.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,21 @@
#' @return a data frame containing one row for each sample. Columns returned are:
#' \describe{
#' \item{\code{date}}{date, sample date}
#' \item{\code{yr}}{numeric, year of sample date}
#' \item{\code{mo}}{numeric, month of sample date}
#' \item{\code{time}}{character, sample time}
#' \item{\code{time_zone}}{character, sample time zone}
#' \item{\code{long_name}}{character, long name of bay segment subwatershed}
#' \item{\code{bay_segment}}{character, short name of bay segment subwatershed}
#' \item{\code{station}}{character, sample station}
#' \item{\code{ecocci}}{numeric, Enterococcus concentration}
#' \item{\code{ecocci_censored}}{logical, whether the \code{ecocci} value was at or below the laboratory minimum detection limit (\code{MDL})}
#' \item{\code{MDL}}{numeric, minimum detection limit at the time of processing}
#' \item{\code{ecocci_units}}{character, units of measurement for \code{ecocci}}
#' \item{\code{qualifier}}{qualifier codes associated with sample}
#' \item{\code{LabComments}}{lab comments on sample}
#' \item{\code{Latitude}}{numeric, latitude in decimal degrees}
#' \item{\code{Longitude}}{numeric, longitude in decimal degrees}
#' \item{\code{time}}{character, sample time}
#' \item{\code{time_zone}}{character, sample time zone}
#' \item{\code{MDL}}{numeric, minimum detection limit for the lab and time the sample was analyzed}
#' \item{\code{yr}}{numeric, year of sample date}
#' \item{\code{mo}}{numeric, month of sample date}
#' }
#'
#' @importFrom dplyr %>%
Expand All @@ -30,12 +32,12 @@
#' @examples
#' \dontrun{
#' # stations to download
#' stations <- c('21FLHILL_WQX-101',
#' stas <- c('21FLHILL_WQX-101',
#' '21FLHILL_WQX-102',
#' '21FLHILL_WQX-103')
#'
#' # download and read the data
#' entero_in <- read_importentero(stas = stations, startDate = '2023-01-01', endDate = '2023-02-01')
#' entero_in <- read_importentero(stas = stas, startDate = '2023-01-01', endDate = '2023-02-01')
#'
#' head(entero_in)
#'
Expand All @@ -44,21 +46,20 @@ read_importentero <- function(stas = NULL, startDate, endDate){

# default to all stations if not specified
if(is.null(stas))
stations <- unique(catchprecip$station)
stas <- unique(catchprecip$station)

entero_names <- c('Enterococci',
'Enterococcus')
startDate <- as.Date(startDate)
endDate <- as.Date(endDate)

args <- list(
siteid = stations,
siteid = stas,
characteristicName = entero_names,
startDateLo = format(startDate, '%m-%d-%Y'),
startDateHi = format(endDate, '%m-%d-%Y')
)


# generate the parts
# a weakness here is building the '&' into everything but siteid -
# this basically means everything is required in the proper order
Expand All @@ -81,33 +82,49 @@ read_importentero <- function(stas = NULL, startDate, endDate){

# download and read in the file
tmp1 <- tempfile()
download.file(url = url_full, destfile = tmp1, method = 'curl')
dat <- read.csv(tmp1)
download.file(url = url_full, destfile = tmp1, method = 'libcurl', quiet = TRUE)
datraw <- suppressWarnings(read.csv(tmp1))
unlink(tmp1)


# select columns
dat2 <- dat %>%
dplyr::select(station = MonitoringLocationIdentifier,
Latitude = ActivityLocation.LatitudeMeasure,
Longitude = ActivityLocation.LongitudeMeasure,
ecocci = ResultMeasureValue, # - the result (has characters in here too - 'Not Reported')
ecocci_units = ResultMeasure.MeasureUnitCode,
qualifier = MeasureQualifierCode,
date = ActivityStartDate,
time = ActivityStartTime.Time, # local time
time_zone = ActivityStartTime.TimeZoneCode,
MDL = DetectionQuantitationLimitMeasure.MeasureValue,
LabComments = ResultLaboratoryCommentText) %>%
dplyr::filter(ecocci != 'Not Reported') %>%
dplyr::mutate(ecocci = as.numeric(ecocci),
ecocci_censored = dplyr::case_when(ecocci <= MDL ~ TRUE,
.default = FALSE),
date = as.Date(date),
yr = lubridate::year(date),
mo = lubridate::month(date)) %>%
dplyr::relocate(date, station, ecocci, ecocci_censored, ecocci_units, qualifier, LabComments) %>%
dat <- datraw %>%
dplyr::select(
station = MonitoringLocationIdentifier,
Latitude = ActivityLocation.LatitudeMeasure,
Longitude = ActivityLocation.LongitudeMeasure,
ecocci = ResultMeasureValue, # - the result (has characters in here too - 'Not Reported')
ecocci_units = ResultMeasure.MeasureUnitCode,
qualifier = MeasureQualifierCode,
date = ActivityStartDate,
time = ActivityStartTime.Time, # local time
time_zone = ActivityStartTime.TimeZoneCode,
MDL = DetectionQuantitationLimitMeasure.MeasureValue,
LabComments = ResultLaboratoryCommentText
) %>%
dplyr::mutate(
ecocci = dplyr::case_when(
ecocci %in% c('*Non-detect', '*Not Reported', 'Not Reported', '*Present >QL') ~ NA_character_,
TRUE ~ ecocci
),
ecocci = as.numeric(ecocci),
ecocci_censored = dplyr::case_when(
ecocci <= MDL ~ TRUE,
.default = FALSE
),
date = as.Date(date),
yr = lubridate::year(date),
mo = lubridate::month(date)
) %>%
dplyr::arrange(station, date)

return(dat2)
# add subsegment basin
out <- dat %>%
sf::st_as_sf(coords = c('Longitude', 'Latitude'), crs = sf::st_crs(tbsegshed), remove = FALSE)
out <- suppressWarnings(sf::st_intersection(out, tbsegshed))
out <- out %>%
sf::st_set_geometry(NULL) %>%
dplyr::relocate(date, yr, mo, time, time_zone, long_name, bay_segment, station, ecocci, ecocci_censored, MDL, ecocci_units, qualifier, LabComments)

return(out)

}
4 changes: 1 addition & 3 deletions data-raw/enterodata-raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
# although apparently data collection only started in 2000 at the earliest of these stations
library(here)

enterodata <- read_importentero(startDate = '1995-01-01', endDate = '2023-12-31') %>%
dplyr::select(-qualifier,
-LabComments)
enterodata <- read_importentero(startDate = '1995-01-01', endDate = '2023-12-31')

save(enterodata, file = here('data/enterodata.RData'), compress = 'xz')
Binary file modified data/enterodata.RData
Binary file not shown.
Binary file modified data/tbsegshed.RData
Binary file not shown.
32 changes: 18 additions & 14 deletions man/enterodata.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions man/read_importentero.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 14 additions & 11 deletions tests/testthat/test-read_importentero.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ library(mockery)
# Mock data for read.csv
mock_data <- data.frame(
MonitoringLocationIdentifier = c("station1", "station2"),
ActivityLocation.LatitudeMeasure = c(27.123, 27.456),
ActivityLocation.LongitudeMeasure = c(-82.123, -82.456),
ActivityLocation.LatitudeMeasure = c(27.8893, 27.8589),
ActivityLocation.LongitudeMeasure = c(-82.4774, -82.4686),
ResultMeasureValue = c("100", "200"),
ResultMeasure.MeasureUnitCode = c("cfu/100mL", "cfu/100mL"),
MeasureQualifierCode = c("", ""),
Expand All @@ -25,27 +25,29 @@ test_that("read_importentero works correctly", {
mock_read_csv <- mock(return_value = mock_data)
stub(read_importentero, "read.csv", mock_read_csv)

stations <- c("21FLHILL_WQX-101", "21FLHILL_WQX-102")
stas <- c("21FLHILL_WQX-101", "21FLHILL_WQX-102")

# Call the function with mocked dependencies
result <- read_importentero(stas = stations, startDate = "2023-01-01", endDate = "2023-12-31")
result <- read_importentero(stas = stas, startDate = "2023-01-01", endDate = "2023-12-31")

# Define expected output
expected_output <- data.frame(
date = as.Date(c("2023-01-01", "2023-01-02")),
yr = c(2023, 2023),
mo = c(1, 1),
time = c("10:00", "11:00"),
time_zone = c("EST", "EST"),
long_name = c("Hillsborough Bay", "Hillsborough Bay"),
bay_segment = c("HB", "HB"),
station = c("station1", "station2"),
ecocci = c(100, 200),
ecocci_censored = c(FALSE, FALSE),
MDL = c(5, 5),
ecocci_units = c("cfu/100mL", "cfu/100mL"),
qualifier = c("", ""),
LabComments = c("No issues", "No issues"),
Latitude = c(27.123, 27.456),
Longitude = c(-82.123, -82.456),
time = c("10:00", "11:00"),
time_zone = c("EST", "EST"),
MDL = c(5, 5),
yr = c(2023, 2023),
mo = c(1, 1)
Latitude = c(27.8893, 27.8589),
Longitude = c(-82.4774, -82.4686)
)

# Check if the result matches the expected output
Expand All @@ -54,4 +56,5 @@ test_that("read_importentero works correctly", {
# Verify that the mock functions were called as expected
expect_called(mock_download, 1)
expect_called(mock_read_csv, 1)

})

0 comments on commit c10efa2

Please sign in to comment.