Skip to content

Commit

Permalink
Closes #5, split out locations into different events
Browse files Browse the repository at this point in the history
  • Loading branch information
smwoodman committed Mar 12, 2024
1 parent 5a65af7 commit 2d8af79
Show file tree
Hide file tree
Showing 4 changed files with 14,497 additions and 14,278 deletions.
140 changes: 91 additions & 49 deletions R/data_to_dwca.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Convert CS-PHOC data set to Darwin Core Archive records
# Convert CS-PHOC event/count data to Darwin Core Archive records

library(dplyr)
library(readr)
Expand All @@ -12,7 +12,7 @@ x.events <- read.csv(here("data", "manuscript", "cs-phoc-events.csv"))
x.counts <- read.csv(here("data", "manuscript", "cs-phoc-counts.csv"))


# Prep WORRMS names-------------------------------------------------------------
# Prep WoRMS names--------------------------------------------------------------
# match taxa using the list of unique scientific names
matched_taxa_tibbles <- wm_records_names(unique(x.counts$species))

Expand All @@ -23,41 +23,53 @@ matched_taxa <- bind_rows(matched_taxa_tibbles) %>%


## Create Event table-----------------------------------------------------------
# Long values, pulled out for readability
geo <- "Antarctica | South Shetland Islands | Cape Shirreff, Livingston Island"
geo.id <- "https://data.aad.gov.au/aadc/gaz/scar/display_name.cfm?gaz_id=131551"

event <- x.events %>%
mutate(
# eventDate is mandatory
eventDate = if_else(census_date_start == census_date_end, census_date_start,
paste(census_date_start, census_date_end, sep = "/")),
# whatever cannot be mapped to Darwin Core terms goes to dynamicProperties
dynamicProperties = sprintf(
'{"research_program": "%s", "surveyed_pst": %s}',
research_program, if_else(surveyed_pst, "true", "false")
),
# add recommended Darwin Core terms: https://dwc.tdwg.org/terms/#event
decimalLongitude = "-60.77",
decimalLatitude = "-62.47",
coordinateUncertaintyInMeters = "2650",
locality = "Cape Shirreff, Livingston Island",
higherGeography = geo,
higherGeographyID = geo.id,
continent = "Antarctica",
countryCode = "AQ",
sampleSizeValue = census_days,
sampleSizeUnit = ifelse(census_days <= 1, "day", "days"),
geodeticDatum = "EPSG:4326",
samplingProtocol = paste(
"The count of a given species, age class, and sex",
"at the given location(s), made by a trained observer ",
"using binoculars during the specified sampling event.")
) %>%
rename(eventID = event_id) %>%
# Split Core and PST locations into their own events with their own IDs
event1 <- x.events %>%
rename(eventID = event_id) %>%
mutate(eventID = paste0(eventID, "-1"),
locality = "Cape Shirreff, Livingston Island",
locationRemarks = "CS-PHOC core census locations, on Cape Shirreff")

event2 <- x.events %>%
rename(eventID = event_id) %>%
filter(surveyed_pst) %>%
mutate(eventID = paste0(eventID, "-2"),
locality = "Punta San Telmo, Cape Shirreff, Livingston Island",
locationRemarks = "CS-PHOC location Punta San Telmo, on Cape Shirreff")

# Bind Core and PST rows together, and add all other DwC info
event <- bind_rows(event1, event2) %>%
mutate(eventDate = census_date_start,
eventRemarks = if_else(
census_days == 1, "",
paste("The census event spanned multiple days.",
"The reported date is the start of the census event window")),
sampleSizeValue = census_days,
sampleSizeUnit = ifelse(census_days <= 1, "day", "days"),
# whatever cannot be mapped to DwC terms goes to dynamicProperties
dynamicProperties = sprintf(
'{"research_program": "%s"}', research_program
),
# add recommended Darwin Core terms: https://dwc.tdwg.org/terms/#event
decimalLongitude = "-60.77",
decimalLatitude = "-62.47",
coordinateUncertaintyInMeters = "2650",
higherGeography = paste("Antarctica | South Shetland Islands |",
"Cape Shirreff, Livingston Island"),
higherGeographyID = paste0("https://data.aad.gov.au/aadc/gaz/scar/",
"display_name.cfm?gaz_id=131551"),
continent = "Antarctica",
countryCode = "AQ",
geodeticDatum = "EPSG:4326",
samplingProtocol = paste(
"The count of a given species, age class, and sex",
"made at the given locality by a trained observer ",
"using binoculars during the specified sampling event.")
) %>%
arrange(eventDate, eventID) %>%
# fields that cannot be mapped to Darwin Core
select(-c(season_name, census_days, census_date_start, census_date_end,
surveyed_pst, research_program))
research_program))

stopifnot(
!any(is.na(event)),
Expand All @@ -73,19 +85,22 @@ occ <- x.counts %>%
rename(scientificName = species) %>%
left_join(matched_taxa, by = "scientificName") %>%
# rename finished columns to Darwin Core terms
rename(
eventID = event_id,
vernacularName = species_common,
scientificNameID = lsid,
taxonRank = rank
rename(eventID = event_id,
vernacularName = species_common,
scientificNameID = lsid,
taxonRank = rank
) %>%
# add recommended Darwin Core terms: https://dwc.tdwg.org/terms/#occurrence
mutate(
basisOfRecord = "HumanObservation",
identificationReferences = "https://doi.org/10.1016/C2012-0-06919-0"
)
mutate(basisOfRecord = "HumanObservation",
identificationReferences = "https://doi.org/10.1016/C2012-0-06919-0",
eventID = case_when(
location == "Core census locations" ~ paste0(eventID, "-1"),
location == "Punta San Telmo" ~ paste0(eventID, "-2"),
.default = NA_character_
))


# create the long table
# create the long occurrence table
occ.long <- occ %>%
select(-total_count) %>%
rename(pup_unk_count = pup_count) %>%
Expand Down Expand Up @@ -115,24 +130,51 @@ occ.long <- occ %>%
sex = case_when(
grepl("female", sex) ~ "female",
grepl("male", sex) ~ "male",
grepl("unk", sex) ~ "unknown"
grepl("unk", sex) ~ "indeterminate"
),
occurrenceID = paste(count_id, lifeStage_id, sex_id, sep = "-")) %>%
left_join(select(event, eventID, dateIdentified = eventDate),
by = "eventID") %>%
select(-c(count_id, lifeStage_id, sex_id)) %>%
relocate(occurrenceID, .before = eventID)

# # Visual sanity checks

# ## Visual sanity checks
# occ.long %>%
# select(-c(eventID, occurrenceID, individualCount, dateIdentified)) %>%
# lapply(table, useNA = "ifany")
# table(occ.long$individualCount, occ.long$occurrenceStatus, useNA = "ifany")

## Sanity checks
stopifnot(
!any(is.na(occ.long)),
sum(duplicated(occ.long$occurrenceID)) == 0
sum(duplicated(occ.long$occurrenceID)) == 0,
all(occ$eventID %in% event$eventID),
all(event$eventID %in% occ$eventID),
# Confirm in occurrence that there are 4 records for each eventID
all.equal(
unique(occ %>% group_by(eventID) %>% summarise(n_id = n()) %>% pull(n_id)),
4),
# Confirm that occurrenceID prefix (from manuscript_data.R) matches eventID
all.equal(
occ.long$eventID,
purrr::map_chr(
stringr::str_split(occ.long$occurrenceID, "-"),
function(i) paste(i[1], i[2], sep = "-"))
),
# Confirm total counts are still the same, in general and by eventID
all.equal(sum(x.counts$total_count), sum(occ.long$individualCount)),
all.equal(
x.counts %>%
group_by(as.character(event_id), location) %>%
summarise(count_sum = sum(total_count), .groups = "drop") %>%
pull(count_sum),
occ.long %>%
group_by(eventID) %>%
summarise(count_sum = sum(individualCount), .groups = "drop") %>%
pull(count_sum)
)
)


# write to file
write_tsv(occ.long, here("data", "dwca", "occurrence.txt"), na = "")
8 changes: 4 additions & 4 deletions R/manuscript_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,22 @@ cs.header.orig <- tbl(con, "vCensus_Phocid_Header") %>%
select(header_id, census_phocid_header_id, season_name,
census_date_start, census_date_end, census_days,
surveyed_pst, research_program) %>%
# TODO: temporary to avoid including half of 2023/24 data
# TODO: temporary to avoid including 2023/24 data
filter(census_date_start < as.Date("2023-07-01"))

cs.header <- cs.header.orig %>% select(-census_phocid_header_id)

stopifnot(
nrow(cs.header) == nrow(collect(tbl(con, "census_phocid_header")) %>%
# TODO: temporary to avoid including half of 2023/24 data
# TODO: temporary to avoid including 2023/24 data
filter(census_date_start < as.Date("2023-07-01")))
)


cs.wide <- tbl(con, "vCensus_Phocid") %>%
arrange(census_date, species, location_group) %>%
rename(header_id = census_phocid_header_id) %>%
# TODO: temporary to avoid including half of 2023/24 data
# TODO: temporary to avoid including 2023/24 data
filter(census_date < as.Date("2023-07-01")) %>%
collect() %>%
select(header_id, observer, census_date, location_group, species,
Expand Down Expand Up @@ -174,7 +174,7 @@ cs.counts <- cs.core.pst %>% rename(event_id = header_id)

#-------------------------------------------------------------------------------
# Save data
write_csv(cs.events, file = here("data", "manuscript", "cs-phoc-events.csv"), na = "")
write_csv(cs.events, here("data", "manuscript", "cs-phoc-events.csv"), na = "")
write_csv(cs.counts, here("data", "manuscript", "cs-phoc-counts.csv"), na = "")


Expand Down
Loading

0 comments on commit 2d8af79

Please sign in to comment.