Closes #5, split out locations into different events

us-amlr · Mar 12, 2024 · 2d8af79 · 2d8af79
1 parent 5a65af7
commit 2d8af79
Show file tree

Hide file tree

Showing 4 changed files with 14,497 additions and 14,278 deletions.
diff --git a/R/data_to_dwca.R b/R/data_to_dwca.R
@@ -1,4 +1,4 @@
-# Convert CS-PHOC data set to Darwin Core Archive records
+# Convert CS-PHOC event/count data to Darwin Core Archive records
 
 library(dplyr)
 library(readr)
@@ -12,7 +12,7 @@ x.events <- read.csv(here("data", "manuscript", "cs-phoc-events.csv"))
 x.counts <- read.csv(here("data", "manuscript", "cs-phoc-counts.csv"))
 
 
-# Prep WORRMS names-------------------------------------------------------------
+# Prep WoRMS names--------------------------------------------------------------
 # match taxa using the list of unique scientific names
 matched_taxa_tibbles <- wm_records_names(unique(x.counts$species))
 
@@ -23,41 +23,53 @@ matched_taxa <- bind_rows(matched_taxa_tibbles) %>%
 
 
 ## Create Event table-----------------------------------------------------------
-# Long values, pulled out for readability
-geo <- "Antarctica | South Shetland Islands | Cape Shirreff, Livingston Island"
-geo.id <- "https://data.aad.gov.au/aadc/gaz/scar/display_name.cfm?gaz_id=131551"
-
-event <- x.events %>%
-  mutate(
-    # eventDate is mandatory
-    eventDate = if_else(census_date_start == census_date_end, census_date_start, 
-                        paste(census_date_start, census_date_end, sep = "/")),  
-    # whatever cannot be mapped to Darwin Core terms goes to dynamicProperties
-    dynamicProperties = sprintf(
-      '{"research_program": "%s", "surveyed_pst": %s}', 
-      research_program, if_else(surveyed_pst, "true", "false")
-    ),
-    # add recommended Darwin Core terms: https://dwc.tdwg.org/terms/#event
-    decimalLongitude = "-60.77",
-    decimalLatitude = "-62.47",
-    coordinateUncertaintyInMeters = "2650",
-    locality = "Cape Shirreff, Livingston Island",  
-    higherGeography = geo, 
-    higherGeographyID = geo.id, 
-    continent = "Antarctica",
-    countryCode = "AQ",
-    sampleSizeValue = census_days,
-    sampleSizeUnit = ifelse(census_days <= 1, "day", "days"),
-    geodeticDatum = "EPSG:4326",
-    samplingProtocol = paste(
-      "The count of a given species, age class, and sex", 
-      "at the given location(s), made by a trained observer ", 
-      "using binoculars during the specified sampling event.")
-  ) %>%
-  rename(eventID = event_id) %>%
+# Split Core and PST locations into their own events with their own IDs
+event1 <- x.events %>%
+  rename(eventID = event_id) %>% 
+  mutate(eventID = paste0(eventID, "-1"), 
+         locality = "Cape Shirreff, Livingston Island",
+         locationRemarks = "CS-PHOC core census locations, on Cape Shirreff")
+
+event2 <- x.events %>%
+  rename(eventID = event_id) %>% 
+  filter(surveyed_pst) %>% 
+  mutate(eventID = paste0(eventID, "-2"), 
+         locality = "Punta San Telmo, Cape Shirreff, Livingston Island",
+         locationRemarks = "CS-PHOC location Punta San Telmo, on Cape Shirreff")
+
+# Bind Core and PST rows together, and add all other DwC info
+event <- bind_rows(event1, event2) %>%
+  mutate(eventDate = census_date_start, 
+         eventRemarks  = if_else(
+           census_days == 1, "", 
+           paste("The census event spanned multiple days.", 
+                 "The reported date is the start of the census event window")),  
+         sampleSizeValue = census_days,
+         sampleSizeUnit = ifelse(census_days <= 1, "day", "days"),
+         # whatever cannot be mapped to DwC terms goes to dynamicProperties
+         dynamicProperties = sprintf(
+           '{"research_program": "%s"}', research_program
+         ),
+         # add recommended Darwin Core terms: https://dwc.tdwg.org/terms/#event
+         decimalLongitude = "-60.77",
+         decimalLatitude = "-62.47",
+         coordinateUncertaintyInMeters = "2650",
+         higherGeography = paste("Antarctica | South Shetland Islands |", 
+                                 "Cape Shirreff, Livingston Island"),
+         higherGeographyID = paste0("https://data.aad.gov.au/aadc/gaz/scar/", 
+                                    "display_name.cfm?gaz_id=131551"),
+         continent = "Antarctica",
+         countryCode = "AQ",
+         geodeticDatum = "EPSG:4326",
+         samplingProtocol = paste(
+           "The count of a given species, age class, and sex", 
+           "made at the given locality by a trained observer ", 
+           "using binoculars during the specified sampling event.")
+  ) %>% 
+  arrange(eventDate, eventID) %>%
   # fields that cannot be mapped to Darwin Core
   select(-c(season_name, census_days, census_date_start, census_date_end, 
-            surveyed_pst, research_program))
+            research_program))
 
 stopifnot(
   !any(is.na(event)), 
@@ -73,19 +85,22 @@ occ <- x.counts %>%
   rename(scientificName = species) %>% 
   left_join(matched_taxa, by = "scientificName") %>%
   # rename finished columns to Darwin Core terms
-  rename(
-    eventID = event_id,
-    vernacularName = species_common,
-    scientificNameID = lsid,
-    taxonRank = rank
+  rename(eventID = event_id,
+         vernacularName = species_common,
+         scientificNameID = lsid,
+         taxonRank = rank
   ) %>%
   # add recommended Darwin Core terms: https://dwc.tdwg.org/terms/#occurrence
-  mutate(
-    basisOfRecord = "HumanObservation",
-    identificationReferences = "https://doi.org/10.1016/C2012-0-06919-0"
-  )
+  mutate(basisOfRecord = "HumanObservation",
+         identificationReferences = "https://doi.org/10.1016/C2012-0-06919-0", 
+         eventID = case_when(
+           location == "Core census locations" ~ paste0(eventID, "-1"), 
+           location == "Punta San Telmo" ~ paste0(eventID, "-2"), 
+           .default = NA_character_
+         ))
+
 
-# create the long table
+# create the long occurrence table
 occ.long <- occ %>%
   select(-total_count) %>% 
   rename(pup_unk_count = pup_count) %>% 
@@ -115,24 +130,51 @@ occ.long <- occ %>%
          sex = case_when(
            grepl("female", sex) ~ "female",
            grepl("male", sex) ~ "male",
-           grepl("unk", sex) ~ "unknown"
+           grepl("unk", sex) ~ "indeterminate"
          ), 
          occurrenceID = paste(count_id, lifeStage_id, sex_id, sep = "-")) %>%
   left_join(select(event, eventID, dateIdentified = eventDate), 
             by = "eventID") %>% 
   select(-c(count_id, lifeStage_id, sex_id)) %>% 
   relocate(occurrenceID, .before = eventID)
 
-# # Visual sanity checks
+
+# ## Visual sanity checks
 # occ.long %>%
 #   select(-c(eventID, occurrenceID, individualCount, dateIdentified)) %>%
 #   lapply(table, useNA = "ifany")
-# table(occ.long$individualCount, occ.long$occurrenceStatus, useNA = "ifany")
 
+## Sanity checks
 stopifnot(
   !any(is.na(occ.long)), 
-  sum(duplicated(occ.long$occurrenceID)) == 0
+  sum(duplicated(occ.long$occurrenceID)) == 0, 
+  all(occ$eventID %in% event$eventID), 
+  all(event$eventID %in% occ$eventID), 
+  # Confirm in occurrence that there are 4 records for each eventID
+  all.equal(
+    unique(occ %>% group_by(eventID) %>% summarise(n_id = n()) %>% pull(n_id)), 
+    4), 
+  # Confirm that occurrenceID prefix (from manuscript_data.R) matches eventID
+  all.equal(
+    occ.long$eventID, 
+    purrr::map_chr(
+      stringr::str_split(occ.long$occurrenceID, "-"), 
+      function(i) paste(i[1], i[2], sep = "-"))
+  ), 
+  # Confirm total counts are still the same, in general and by eventID
+  all.equal(sum(x.counts$total_count), sum(occ.long$individualCount)), 
+  all.equal(
+    x.counts %>% 
+      group_by(as.character(event_id), location) %>% 
+      summarise(count_sum = sum(total_count), .groups = "drop") %>% 
+      pull(count_sum), 
+    occ.long %>% 
+      group_by(eventID) %>% 
+      summarise(count_sum = sum(individualCount), .groups = "drop") %>% 
+      pull(count_sum)
+  )
 )
 
+
 # write to file
 write_tsv(occ.long, here("data", "dwca", "occurrence.txt"), na = "")
diff --git a/R/manuscript_data.R b/R/manuscript_data.R
@@ -24,22 +24,22 @@ cs.header.orig <- tbl(con, "vCensus_Phocid_Header") %>%
   select(header_id, census_phocid_header_id, season_name, 
          census_date_start, census_date_end, census_days, 
          surveyed_pst, research_program) %>% 
-  # TODO: temporary to avoid including half of 2023/24 data
+  # TODO: temporary to avoid including 2023/24 data
   filter(census_date_start < as.Date("2023-07-01"))
 
 cs.header <- cs.header.orig %>% select(-census_phocid_header_id)
 
 stopifnot(
   nrow(cs.header) == nrow(collect(tbl(con, "census_phocid_header")) %>% 
-                            # TODO: temporary to avoid including half of 2023/24 data
+                            # TODO: temporary to avoid including 2023/24 data
                             filter(census_date_start < as.Date("2023-07-01")))
 )
 
 
 cs.wide <- tbl(con, "vCensus_Phocid") %>% 
   arrange(census_date, species, location_group) %>% 
   rename(header_id = census_phocid_header_id) %>% 
-  # TODO: temporary to avoid including half of 2023/24 data
+  # TODO: temporary to avoid including 2023/24 data
   filter(census_date < as.Date("2023-07-01")) %>% 
   collect() %>% 
   select(header_id, observer, census_date, location_group, species, 
@@ -174,7 +174,7 @@ cs.counts <- cs.core.pst %>% rename(event_id = header_id)
 
 #-------------------------------------------------------------------------------
 # Save data
-write_csv(cs.events, file = here("data", "manuscript", "cs-phoc-events.csv"), na = "")
+write_csv(cs.events, here("data", "manuscript", "cs-phoc-events.csv"), na = "")
 write_csv(cs.counts, here("data", "manuscript", "cs-phoc-counts.csv"), na = "")