README.Rmd

---
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r, include = FALSE}
# Chunk defaults applied to every later code chunk in this document:
#   comment  - prefix put in front of each line of printed output
#   collapse - merge a chunk's source and output into a single block
#   echo     - FALSE keeps the R source out of the rendered README
chunk_defaults <- list(
  comment = "#>",
  collapse = TRUE,
  echo = FALSE
)
do.call(knitr::opts_chunk$set, chunk_defaults)
```

# Teaching datasets for Lecture Series on Open Reproducible Science in R

<!-- badges: start -->
[![License for data](https://img.shields.io/badge/license%20(for%20data)-CC0-blue.svg)](https://creativecommons.org/publicdomain/zero/1.0/)
<!-- badges: end -->

This repository contains a collection of datasets for teaching [R](https://cran.r-project.org). Although it was created specifically for the [Open Reproducible Science in R](https://oxford-ihtm.io/open-reproducible-science) lecture series, the data made available here are open for access and use by anyone and are distributed under a Creative Commons CC0 1.0 Universal license.

## Dataset description

```{r data-description}
# Entries of the working directory whose name matches this pattern are kept
# out of the dataset table.
excluded_pattern <- "README|teaching_datasets|Base|End"

# list.files() with no arguments lists the current working directory;
# invert = TRUE keeps only the names that do NOT match the pattern.
file_names <- grep(
  pattern = excluded_pattern,
  x = list.files(),
  value = TRUE,
  invert = TRUE
)

# Upper-cased extension of each listed file. tools::file_ext() reads the text
# after the LAST dot and yields "" when there is none, so names with several
# dots, names without a dot, and an empty listing are all handled (taking the
# second piece after splitting on "." fails in those cases).
file_types <- toupper(tools::file_ext(file_names))

# Description text for each known dataset, keyed by file name.
# NOTE(review): "nut.dat" has a usage entry further down but no description
# here, so its description cell renders empty if that file is listed - add an
# entry once the wording is confirmed.
description_by_file <- c(
  "ba.dat" = "The dataset from Bland JM, Altman DG. Statistical Methods for Assessing Agreement Between Two Methods of Clinical Measurement. The Lancet. 1986;1: 307–310.",
  "bateman.dat" = "On Saturday, 21st April 1990, a luncheon was held in the home of Jean Bateman. There was a total of forty-five guests which included thirty-five members of the Department of Epidemiology and Population Sciences at the London School of Hygiene and Tropical Medicine. On Sunday morning, 22nd April 1990, Jean awoke with symptoms of gastrointestinal illness; her husband awoke with similar symptoms. The possibility of an outbreak related to the luncheon was strengthened when several of the guests telephoned Jean on Sunday and reported illness. On Monday, 23rd April 1990, there was an unusually large number of department members absent from work and reporting illness. Data from this outbreak is stored in this dataset.",
  "ca.dat" = "A dataset on the survival of cancer patients in two different treatment groups",
  "cover.dat" = "The dataset contains data from a coverage survey for a therapeutic feeding program (TFP) in central Malawi undertaken in March 2003. Data were collected using the centric systematic area sampling method to define sampling locations: A number of communities located closest to the centres of thirty 10 x 10 kilometre grid squares were sampled using active (investigative) case-finding.",
  "diets.dat" = "The dataset contains data from a trial of two different diets undertaken at an adult therapeutic feeding centre in Somalia.",
  "fem.dat" = "A dataset from 118 female psychiatric patients",
  "fem.xlsx" = "A dataset from 118 female psychiatric patients",
  "gudhiv.dat" = "This data is from a cross-sectional study of 435 male patients who presented with sexually transmitted infections at an outpatient clinic in The Gambia between August 1988 and June 1990.",
  "malaria.dat" = "A dataset that contains data on rainfall (in mm) and the number of cases of malaria reported from health centres in an administrative district of Ethiopia between July 1997 and July 1999",
  "octe.dat" = "This data is from a matched case-control study investigating the association between oral contraceptive use and thromboembolism. The cases are 175 women aged between 15 and 44 years admitted to hospital for thromboembolism and discharged alive. The controls are female patients admitted for conditions believed to be unrelated to oral contraceptive use. Cases and controls were matched on age, ethnic group, marital status, parity, income, place of residence, and date of hospitalisation.",
  "pop.dat" = "A dataset that contains data on the age (in months) and sex of 438 children aged between six and sixty months collected as part of a nutritional anthropometry survey of the Khosh Valley in Northeast Afghanistan.",
  "salex.dat" = "This data comes from a food-borne outbreak. On Saturday 17th October 1992, eighty-two people attended a buffet meal at a sports club. Within fourteen to twenty-four hours, fifty-one of the participants developed diarrhoea, with nausea, vomiting, abdominal pain and fever.",
  "school_nutrition.csv" = "A dataset from a nutrition survey of school children 10 years and older from Pakistan.",
  "school_nutrition.xlsx" = "A dataset from a nutrition survey of school children 10 years and older from Pakistan.",
  "south_wollo_coverage.csv" = "A dataset from a Community-based Management of Acute Malnutrition (CMAM) programme in South Wollo Zone, Ethiopia",
  "sssw.dat" = "This dataset contains data on the marital status, home circumstances, and ethnic group of 152 persons recruited into a study into the levels of stress experienced by student social workers in the United Kingdom.",
  "tsstamp.dat" = "This data is from a matched case-control study investigating the association between the use of different brands of tampon and toxic shock syndrome undertaken during an outbreak. Only a subset of the original dataset is used here.",
  "waste.dat" = "The dataset contains the location of twenty-three recent cases of childhood cancer in 5 by 5 km square surrounding an industrial waste disposal site.",
  "koko_plus_coverage.csv" = "Dataset from a coverage survey of Koko+ in Eastern Ghana"
)

# One description per listed file, in the same order as file_names. Indexing
# by name gives NA for files without an entry; those become "" so the table
# shows an empty cell rather than "NA".
file_description <- unname(description_by_file[file_names])
file_description[is.na(file_description)] <- ""

# Suggested teaching use for each known dataset, keyed by file name.
usage_by_file <- c(
  "ba.dat" = "Useful for learning and practicing how to perform statistical methods to compare diagnostic tests using the Bland and Altman approach and the Bland and Altman plot",
  "bateman.dat" = "Useful for learning and practicing how to perform logistic regression",
  "ca.dat" = "Useful for learning and practicing how to perform survival analysis",
  "cover.dat" = "Useful for learning and practicing how to analyse survey data and how to perform basic spatial analysis",
  "diets.dat" = "Useful for learning and practicing statistical tests to show difference in mean between two groups",
  "fem.dat" = "Useful for learning and practicing various statistical tests, linear regression, logistic regression, and linear modelling",
  "fem.xlsx" = "Useful for learning and practicing various statistical tests, linear regression, logistic regression, and linear modelling",
  "gudhiv.dat" = "Useful for learning and practicing logistic regression",
  "malaria.dat" = "Useful for learning and practicing time series analysis and plotting",
  "nut.dat" = "Useful for learning and practicing how to analyse survey data",
  "octe.dat" = "Useful for learning and practicing how to perform analysis for a matched case-control study",
  "pop.dat" = "Useful for learning and practicing how to create various plots including a population pyramid plot",
  "salex.dat" = "Useful for learning and practicing how to perform analysis for relative risk and odds ratios",
  "school_nutrition.csv" = "Useful for learning how to analyse survey data",
  "school_nutrition.xlsx" = "Useful for learning how to analyse survey data",
  "south_wollo_coverage.csv" = "Useful for learning how to analyse survey data and perform basic spatial analysis",
  "sssw.dat" = "Useful for learning how to analyse survey data and using various plots for exploratory data analysis",
  "tsstamp.dat" = "Useful for learning and practicing how to perform logistic regression and stratified analysis",
  "waste.dat" = "Useful for learning and practicing computer simulation to test spatial clustering",
  "koko_plus_coverage.csv" = "Useful for learning how to analyse survey data, perform basic spatial analysis, and perform comparative analysis for evaluating programme performance"
)

# One usage note per listed file, in the same order as file_names. Indexing by
# name gives NA for files without an entry; those become "" so the table shows
# an empty cell rather than "NA".
file_use <- unname(usage_by_file[file_names])
file_use[is.na(file_use)] <- ""

# Assemble one row per listed file from the four parallel vectors.
dataset_table <- data.frame(file_names, file_types, file_description, file_use)

# Column headings shown in the rendered table, in column order.
table_headers <- c(
  "File Name",
  "File Type",
  "File Description",
  "Epidemiology/Statistics Usage"
)

# Left as the last top-level expression so the table is printed by the chunk.
knitr::kable(dataset_table, row.names = FALSE, col.names = table_headers)

```