From 0226a6c36a07648a381fe6d29be29b8f41ae798b Mon Sep 17 00:00:00 2001
From: Anthony Damico
Date: Thu, 12 Dec 2024 10:59:52 -0500
Subject: [PATCH] nis

---
 docs/national-immunization-survey-nis.html | 22 +++++++++++-----------
 docs/search_index.json                     |  2 +-
 metadata/nis.txt                           | 22 +++++++++++-----------
 nis.Rmd                                    | 22 +++++++++++-----------
 4 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/docs/national-immunization-survey-nis.html b/docs/national-immunization-survey-nis.html
index e3bcbcce..51fc600e 100644
--- a/docs/national-immunization-survey-nis.html
+++ b/docs/national-immunization-survey-nis.html
@@ -1231,10 +1231,10 @@

Recommended Reading

Download, Import, Preparation

-Download the fixed-width file:
+Download the 2023 fixed-width file:

dat_tf <- tempfile()
 
-dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.DAT"
+dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.DAT"
 
 download.file( dat_url , dat_tf , mode = 'wb' )

Edit then execute the import script provided by the CDC:

@@ -1242,7 +1242,7 @@

 Download, Import, Preparation

 r_tf <- tempfile()

-r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.R"
+r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.R"

 r_input_lines <- readLines( r_script_url )

@@ -1259,7 +1259,7 @@

 Download, Import, Preparation

 source( r_tf , echo = TRUE )

 # rename the resultant data.frame object
-nis_df <- NISPUF21
+nis_df <- NISPUF23

 names( nis_df ) <- tolower( names( nis_df ) )

@@ -1446,7 +1446,7 @@

Regression Models and Tests of Association

Replication Example

-This example matches the statistics and standard errors from Data User’s Guide Table 4:
+This example matches the statistics and standard errors from Data User’s Guide Table 4:

results <-
     svyby( 
         ~ p_utd431h314_rout_s , 
@@ -1459,12 +1459,12 @@ 

 Replication Example

 standard_errors <- results[ , "se.p_utd431h314_rout_sUTD" , drop = FALSE ]

-stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .711 )
-stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .742 )
-stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .647 )
-stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .015 )
-stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .009 )
-stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .022 )
+stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .674 )
+stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .716 )
+stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .666 )
+stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .017 )
+stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .008 )
+stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .023 )
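The hunks above never show the line that builds the coefficients object checked by these assertions; by analogy with the standard_errors extraction, it is presumably the same subset without the se. prefix. A sketch of that elided line (this reconstruction is an assumption, not part of the patch):

# hypothetical reconstruction of the elided extraction of point estimates
coefficients <- results[ , "p_utd431h314_rout_sUTD" , drop = FALSE ]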
diff --git a/docs/search_index.json b/docs/search_index.json
index bfb98668..0d6a65c7 100644
--- a/docs/search_index.json
+++ b/docs/search_index.json
@@ -1 +1 @@
-[["index.html", "Analyze Survey Data for Free Forty-Nine Public Microdatasets, One Easy To Type Website", " Analyze Survey Data for Free Forty-Nine Public Microdatasets, One Easy To Type Website Please ask questions about this book on cross validated for survey statistics or stackoverflow for R. This textbook replaces my archived blog, prior code, and the no longer maintained lodown package. A work of R is never finished, merely abandoned. - Anthony Damico "],["trend-analysis-of-complex-survey-data.html", "Trend Analysis of Complex Survey Data Download, Import, Preparation Append Polynomials to Each Year Unadjusted Analysis Examples Calculate Joinpoints Needed Calculate Predicted Marginals Identify Joinpoint(s) or Breakpoint(s) Interpret and Conclude", " Trend Analysis of Complex Survey Data The purpose of this analysis is to make statistically valid statements such as, “there was a significant linear decrease in the prevalence of high school aged americans who have ever smoked a cigarette across the period 1999-2011” with complex sample survey data. This step-by-step walkthrough exactly reproduces the statistics presented in the Centers for Disease Control and Prevention’s (CDC) linear trend analysis. This analysis may complement qualitative evaluation on prevalence changes observed from surveillance data by providing quantitative evidence, such as when a joinpoint (also called breakpoint or changepoint) occurred; however, this analysis does not explain why or how changes in trends occur. Download, Import, Preparation Download and import the multi-year stacked file: library(SAScii) library(readr) sas_url <- "https://www.cdc.gov/healthyyouth/data/yrbs/sadc_2019/2019-SADC-SAS-Input-Program.sas" dat_url <- "https://www.cdc.gov/healthyyouth/data/yrbs/sadc_2019/sadc_2019_national.dat" sas_positions <- parse.SAScii( sas_url ) sas_positions[ , 'varname' ] <- tolower( sas_positions[ , 'varname' ] ) variables_to_keep <- c( "sex" , "grade" , "race4" , "q30" , "year" , "psu" , "stratum" , "weight" ) sas_positions[ , 'column_types' ] <- ifelse( !( sas_positions[ , 'varname' ] %in% variables_to_keep ) , "_" , ifelse( sas_positions[ , 'char' ] , "c" , "d" ) ) yrbss_tbl <- read_fwf( dat_url , fwf_widths( abs( sas_positions[ , 'width' ] ) , col_names = sas_positions[ , 'varname' ] ) , col_types = paste0( sas_positions[ , 'column_types' ] , collapse = "" ) , na = c( "" , "." ) ) yrbss_df <- data.frame( yrbss_tbl ) Restrict the dataset to only years shown in the original analysis and rename the main variable: yrbss_df <- subset( yrbss_df , year %in% seq( 1991 , 2011 , 2 ) ) yrbss_df[ , 'ever_smoked' ] <- as.numeric( yrbss_df[ , 'q30' ] == 1 ) yrbss_df[ , 'q30' ] <- NULL Recode each categorical variable to factor class: yrbss_df[ , 'sex' ] <- relevel( factor( yrbss_df[ , 'sex' ] ) , ref = "2" ) for ( i in c( 'race4' , 'grade' ) ){ yrbss_df[ , i ] <- relevel( factor( yrbss_df[ , i ] ) , ref = "1" ) } Append Polynomials to Each Year “The polynomials we have used as predictors to this point are natural polynomials, generated from the linear predictor by centering and then powering the linear predictor.” For more detail on this subject, see page 216 of Applied Multiple Regression/Correlation Analysis for the Behavioral Sciences By Jacob Cohen, Patricia Cohen, Stephen G. West, Leona S.
Aiken distinct_years_available <- length( seq( 1991 , 2011 , 2 ) ) # store the linear polynomials c11l <- contr.poly( distinct_years_available )[ , ".L" ] # store the quadratic polynomials c11q <- contr.poly( distinct_years_available )[ , ".Q" ] # store the cubic polynomials c11c <- contr.poly( distinct_years_available )[ , ".C" ] For each record in the data set, tack on the linear, quadratic, and cubic contrast value; these contrast values will serve as replacements for the linear year variable in any regression: # year^1 term (linear) yrbss_df[ , "t11l" ] <- c11l[ match( yrbss_df[ , "year" ] , seq( 1991 , 2011 , 2 ) ) ] # year^2 term (quadratic) yrbss_df[ , "t11q" ] <- c11q[ match( yrbss_df[ , "year" ] , seq( 1991 , 2011 , 2 ) ) ] # year^3 term (cubic) yrbss_df[ , "t11c" ] <- c11c[ match( yrbss_df[ , "year" ] , seq( 1991 , 2011 , 2 ) ) ] Unadjusted Analysis Examples Construct a complex sample survey design and match the published unadjusted prevalence rates: options( survey.lonely.psu = "adjust" ) library(survey) des <- svydesign( id = ~psu , strata = ~interaction( stratum , year ) , data = yrbss_df , weights = ~weight , nest = TRUE ) prevalence_over_time <- svyby( ~ ever_smoked , ~ year , des , svymean , na.rm = TRUE ) # confirm prevalence rates match published estimates # of high school students that ever smoked stopifnot( all.equal( round( coef( prevalence_over_time ) , 3 ) , c( .701 , .695 , .713 , .702 , .704 , .639 , .584 , .543 , .503 , .463 , .447 ) , check.attributes = FALSE ) ) Calculate Joinpoints Needed Using the orthogonal coefficients (linear, quadratic, cubic terms) that we previously added to our yrbss_df object before constructing the multi-year stacked survey design, determine how many joinpoints will be needed for a trend analysis. Epidemiological models typically control for possible confounding variables such as age, sex, and race/ethnicity, so those have been included alongside the linear, cubic, and quadratic year terms. Calculate the “ever smoked” regression, adjusted by sex, grade, race/ethnicity, and linear year contrast: linyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l , design = des , family = quasibinomial ) summary( linyear ) The linear year-contrast variable t11l is significant. Therefore, there is probably going to be some sort of trend. A linear trend by itself does not need joinpoints. Not one, just zero joinpoints. If the linear term were the only significant term (out of linear, quadratic, cubic), then we would not need to calculate a joinpoint. In other words, we would not need to figure out where to best break our time trend into two, three, or even four segments. Since the linear trend is significant, we know there is at least one change across the entire 1991 to 2011 period. Interpretation note about segments of time: The linear term t11l was significant, so we probably have a significant linear trend somewhere to report. Now we need to figure out when that significant linear trend started and when it ended. It might be semantically true that there was a significant linear decrease in high school aged smoking over the entire period of our data 1991-2011; however, it’s inexact to end this analysis after only detecting a linear trend. The purpose of the following few steps is to cordon off different time points from one another. As you’ll see later, there actually was not any detectable decrease from 1991-1999. The entirety of the decline in smoking occurred over the period from 1999-2011.
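Each of the joinpoint decisions below hinges on the significance of the newest year-contrast term, so it helps to pull that row directly out of the regression output. A minimal sketch, assuming the linyear fit above (the same pattern works for the quadratic and cubic fits below):

# extract the estimate, standard error, test statistic, and p-value for the linear contrast
summary( linyear )$coefficients[ 't11l' , , drop = FALSE ]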
So these next (methodologically tricky) steps serve to provide you and your audience with a more careful statement of statistical significance. It’s not technically wrong to conclude that smoking declined over the period of 1991-2011, it’s just verbose. Think of it as the difference between “humans first walked on the moon in the sixties” and “humans first walked on the moon in 1969” - both statements are correct, but the latter exhibits greater scientific precision. Calculate the “ever smoked” binomial regression, adjusted by sex, grade, race/ethnicity, and both linear and quadratic year contrasts. Notice the addition of t11q: quadyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l + t11q , design = des , family = quasibinomial ) summary( quadyear ) The linear year-contrast variable is significant but the quadratic year-contrast variable is also significant. Therefore, we should use joinpoint software (the segmented package) for this analysis. A significant quadratic trend needs one joinpoint. Since both linear and quadratic terms are significant, we can also move ahead and test whether the cubic term is also significant. Calculate the “ever smoked” binomial regression, adjusted by sex, grade, race/ethnicity, and linear, quadratic, and cubic year contrasts. Notice the addition of t11c: cubyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l + t11q + t11c , design = des , family = quasibinomial ) summary( cubyear ) The cubic year-contrast term is also significant in this model. Therefore, we might potentially evaluate this trend using two joinpoints. In other words, a significant result for all linear, quadratic, and cubic year contrasts at this point means we might be able to evaluate three distinct trends (separated by our two joinpoints) across the broader 1991 - 2011 time period of analysis. Although we might now have the statistical ability to analyze three distinct time periods (separated by two joinpoints) across our data, the utility of this depends on the circumstances. Cubic and higher polynomials account for not only the direction of change but also the pace of that change, allowing statistical statements that might not be of interest to an audience: While it might be an exercise in precision to conclude that smoking rates dropped quickest across 1999-2003 and less quickly across 2003-2011, that scientific pair of findings may not be as compelling as the simpler (quadratic but not cubic) statement that smoking rates have dropped across the period of 1999-2011. Calculate Predicted Marginals Calculate the survey-year-independent predictor effects and store these results: marginals <- svyglm( formula = ever_smoked ~ sex + race4 + grade , design = des , family = quasibinomial ) Run these marginals through the svypredmeans function. For archaeology fans out there, this function emulates the PREDMARG statement in the ancient language of SUDAAN: ( means_for_joinpoint <- svypredmeans( marginals , ~factor( year ) ) ) Clean up these results a bit in preparation for a joinpoint analysis: # coerce the results to a data.frame object means_for_joinpoint <- as.data.frame( means_for_joinpoint ) # extract the row names as the survey year means_for_joinpoint[ , "year" ] <- as.numeric( rownames( means_for_joinpoint ) ) # must be sorted, just in case it's not already means_for_joinpoint <- means_for_joinpoint[ order( means_for_joinpoint[ , "year" ] ) , ] Identify Joinpoint(s) or Breakpoint(s) Let’s take a look at how confident we are in the value at each adjusted timepoint. 
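A minimal sketch of that inspection, assuming the means_for_joinpoint data.frame built above (the relative_se column name is an invention for illustration):

# compare each survey year's adjusted estimate against its precision
means_for_joinpoint[ , 'relative_se' ] <- with( means_for_joinpoint , SE / mean )

means_for_joinpoint[ , c( 'year' , 'mean' , 'SE' , 'relative_se' ) ]

Timepoints estimated with more precision will carry more weight in the piecewise regression below, where each year's analytic weight is defined as ( mean / SE ) ^ 2.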
Carrying out a trend analysis requires creating new weights to fit a piecewise linear regression. First, create that weight variable: means_for_joinpoint[ , "wgt" ] <- with( means_for_joinpoint, ( mean / SE ) ^ 2 ) Second, fit a piecewise linear regression, estimating the ‘starting’ linear model with the usual lm function using the log values and the weights: o <- lm( log( mean ) ~ year , weights = wgt , data = means_for_joinpoint ) Now that the regression has been structured correctly, estimate the year that our complex survey trend should be broken into two or more segments: library(segmented) # find only one joinpoint os <- segmented( o , ~year ) summary( os ) Look for the Estimated Break-Point(s) in that result - that’s the critical number from this joinpoint analysis. The segmented package uses an iterative procedure (described in the article below); between-year solutions are returned and should be rounded to the nearest time point in the analysis. The joinpoint software implements two estimating algorithms: the grid-search and the Hudson algorithm. For more detail about these methods, see Muggeo V. (2003) Estimating regression models with unknown break-points. Statistics in Medicine, 22: 3055-3071. Obtain the annual percent change estimates for each time point: slope( os , APC = TRUE ) The confidence intervals for the annual percent change (APC) may be different from the ones returned by NCI’s Joinpoint Software; for further details, check out Muggeo V. (2010) A Comment on ‘Estimating average annual per cent change in trend analysis’ by Clegg et al., Statistics in Medicine; 28, 3670-3682. Statistics in Medicine, 29, 1958-1960. This analysis returned similar results to the NCI’s Joinpoint Regression Program, estimating a joinpoint at year=1999 - and, more precisely, finding that the decreasing trend in smoking prevalence beginning at that point proceeded at an APC of -3.92 percent. That is, slope2 from the output above. Remember that the cubic-year model above had significant terms as well. Therefore, it would be statistically defensible to calculate two joinpoints rather than only one. However, for this analysis, breaking the 1999-2011 trend into two separate downward trends might not be of interest to the audience. Looking at the slope2 and slope3 estimates and confidence intervals, we might be able to conclude that “ever smoking” decreased across 1999-2003 and also decreased (albeit less rapidly) across 2003-2011. However, communicating two consecutive downward trends might not be of much interest to a lay audience. Forgoing a second possible joinpoint makes sense when the direction of change is more compelling than the pace of change: # find two joinpoints rather than only one os2 <- segmented( o , ~year , npsi = 2 ) summary( os2 ) slope( os2 , APC = TRUE ) Interpret and Conclude After identifying the joinpoint for smoking prevalence, we can create two regression models (one for each time segment - if we had two joinpoints, we would need three regression models). The first model covers the years leading up to (and including) the joinpoint (i.e., 1991 to 1999). The second model includes the years from the joinpoint forward (i.e., 1999 to 2011).
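Because segmented's iterative estimator usually lands between survey years, one reasonable preliminary is to snap the Estimated Break-Point(s) to the nearest year in the data. A minimal sketch, assuming the fitted os object above and that segmented stores its estimates in the psi component:

# pull the estimated break-point(s) from the segmented fit
estimated_breakpoints <- os$psi[ , 'Est.' ]

# round each one to the nearest year actually present in the analysis
survey_years <- seq( 1991 , 2011 , 2 )

sapply( estimated_breakpoints , function( bp ) survey_years[ which.min( abs( survey_years - bp ) ) ] )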
So start with 1991, 1993, 1995, 1997, 1999, the five year-points before (and including) 1999: # calculate a five-timepoint linear contrast vector c5l <- contr.poly( 5 )[ , 1 ] # tack the five-timepoint linear contrast vectors onto the current survey design object des <- update( des , t5l = c5l[ match( year , seq( 1991 , 1999 , 2 ) ) ] ) pre_91_99 <- svyglm( ever_smoked ~ sex + race4 + grade + t5l , design = subset( des , year <= 1999 ) , family = quasibinomial ) summary( pre_91_99 ) # confirm 1991-1999 trend coefficient matches published estimates stopifnot( round( pre_91_99$coefficients['t5l'] , 5 ) == .03704 ) This reproduces the calculations behind the sentence on pdf page 6 of the original document: In this example, T5L_L had a p-value=0.52261 and beta=0.03704. Therefore, there was “no significant change in the prevalence of ever smoking a cigarette during 1991-1999.” Then move on to 1999, 2001, 2003, 2005, 2007, 2009, and 2011, the seven year-points after (and including) 1999: # calculate a seven-timepoint linear contrast vector c7l <- contr.poly( 7 )[ , 1 ] # tack the seven-timepoint linear contrast vectors onto the current survey design object des <- update( des , t7l = c7l[ match( year , seq( 1999 , 2011 , 2 ) ) ] ) post_99_11 <- svyglm( ever_smoked ~ sex + race4 + grade + t7l , design = subset( des , year >= 1999 ) , family = quasibinomial ) summary( post_99_11 ) # confirm 1999-2011 trend coefficient matches published estimates stopifnot( round( post_99_11$coefficients['t7l'] , 5 ) == -0.99165 ) This reproduces the calculations behind the sentence on pdf page 6 of the original document: In this example, T7L_R had a p-value<0.0001 and beta=-0.99165. Therefore, there was a “significant linear decrease in the prevalence of ever smoking a cigarette during 1999-2011.” "],["american-community-survey-acs.html", "American Community Survey (ACS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " American Community Survey (ACS) The US Census Bureau’s annual replacement for the long-form decennial census. Two tables per state, the first with one row per household and the second with one row per individual. The civilian population of the United States. Released annually since 2005. Administered and financed by the US Census Bureau. Recommended Reading Four Example Strengths & Limitations: ✔️ Large sample size and sub-national geographies ✔️ Large userbase and supporting projects ❌ Short questionnaire ❌ Program participation undercount Three Example Findings: Life expectancy in adulthood fell between 1990 and 2018 for those without a university degree. In 2021, 8 million families lived with non-relatives and were not renters nor homeowners themselves. More than half of rural physicians were at least 50 years old in 2017, more than a quarter at least 60. Two Methodology Documents: Guidance for Data Users Wikipedia Entry One Haiku: # one percent sample # the decennial census # in miniature Download, Import, Preparation Choose either the entire United States with sas_hus.zip, or use a state’s abbreviation like sas_hal.zip for Alabama or sas_hak.zip for Alaska. 
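Since every state follows the same naming pattern, a hypothetical helper (the function name and the assumption that all states fit the sas_h??.zip / sas_p??.zip pattern are mine, not the Census Bureau's) to build these links from a lowercased two-letter abbreviation:

# construct the household ( 'h' ) or person ( 'p' ) file url for the 2023 1-Year pums
acs_pums_url <-
    function( record_type , st ){
        paste0(
            "https://www2.census.gov/programs-surveys/acs/data/pums/2023/1-Year/sas_" ,
            record_type , st , ".zip"
        )
    }

acs_pums_url( 'h' , 'ak' ) # household file for alaska
acs_pums_url( 'p' , 'us' ) # person file for the entire united states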
This imports the Alabama household file: library(haven) tf_household <- tempfile() this_url_household <- "https://www2.census.gov/programs-surveys/acs/data/pums/2023/1-Year/sas_hal.zip" download.file( this_url_household , tf_household , mode = 'wb' ) unzipped_files_household <- unzip( tf_household , exdir = tempdir() ) acs_sas_household <- grep( '\\\\.sas7bdat$' , unzipped_files_household , value = TRUE ) acs_df_household <- read_sas( acs_sas_household ) names( acs_df_household ) <- tolower( names( acs_df_household ) ) Choose either the entire United States with sas_pus.zip, or use a state’s abbreviation like sas_pal.zip for Alabama or sas_pak.zip for Alaska. This imports the Alabama person file: tf_person <- tempfile() this_url_person <- "https://www2.census.gov/programs-surveys/acs/data/pums/2023/1-Year/sas_pal.zip" download.file( this_url_person , tf_person , mode = 'wb' ) unzipped_files_person <- unzip( tf_person , exdir = tempdir() ) acs_sas_person <- grep( '\\\\.sas7bdat$' , unzipped_files_person , value = TRUE ) acs_df_person <- read_sas( acs_sas_person ) names( acs_df_person ) <- tolower( names( acs_df_person ) ) Remove overlapping column and merge household + person files: acs_df_household[ , 'rt' ] <- NULL acs_df_person[ , 'rt' ] <- NULL acs_df <- merge( acs_df_household , acs_df_person ) stopifnot( nrow( acs_df ) == nrow( acs_df_person ) ) acs_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # acs_fn <- file.path( path.expand( "~" ) , "ACS" , "this_file.rds" ) # saveRDS( acs_df , file = acs_fn , compress = FALSE ) Load the same object: # acs_df <- readRDS( acs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) acs_design <- svrepdesign( weight = ~pwgtp , repweights = 'pwgtp[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = acs_df ) Variable Recoding Add new columns to the data set: acs_design <- update( acs_design , state_name = factor( as.numeric( state ) , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L, 72L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico") ) , cit = factor( cit , levels = 1:5 , labels = c( 'born in the u.s.' 
, 'born in the territories' , 'born abroad to american parents' , 'naturalized citizen' , 'non-citizen' ) ) , poverty_level = as.numeric( povpip ) , married = as.numeric( mar %in% 1 ) , sex = factor( sex , labels = c( 'male' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( acs_design , "sampling" ) != 0 ) svyby( ~ one , ~ cit , acs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , acs_design ) svyby( ~ one , ~ cit , acs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ poverty_level , acs_design , na.rm = TRUE ) svyby( ~ poverty_level , ~ cit , acs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ sex , acs_design ) svyby( ~ sex , ~ cit , acs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ poverty_level , acs_design , na.rm = TRUE ) svyby( ~ poverty_level , ~ cit , acs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ sex , acs_design ) svyby( ~ sex , ~ cit , acs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ poverty_level , acs_design , 0.5 , na.rm = TRUE ) svyby( ~ poverty_level , ~ cit , acs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ ssip , denominator = ~ pincp , acs_design , na.rm = TRUE ) Subsetting Restrict the survey design to senior citizens: sub_acs_design <- subset( acs_design , agep >= 65 ) Calculate the mean (average) of this subset: svymean( ~ poverty_level , sub_acs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ poverty_level , acs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ poverty_level , ~ cit , acs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( acs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ poverty_level , acs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ poverty_level , acs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ poverty_level , acs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ married , acs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( poverty_level ~ married , acs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ married + sex , acs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( poverty_level ~ married + sex , acs_design ) summary( glm_result ) Replication Example This matches statistics, standard errors, and margin of errors from Alabama’s 2023 PUMS tallies: Match the sum of the weights: stopifnot( round( coef( svytotal( ~ one , acs_design ) ) , 0 ) == 5108468 ) Compute the population by age: pums_estimate <- c(287689L, 306458L, 325713L, 355557L, 334520L, 640995L, 649985L, 621783L, 307747L, 344812L, 553817L, 289119L, 90273L) pums_standard_error <- c(2698L, 5964L, 5865L, 5081L, 4427L, 5202L, 4615L, 4804L, 4947L, 4804L, 2166L, 3600L, 3080L) pums_margin_of_error <- c(4439L, 9811L, 9647L, 8358L, 7282L, 8557L, 7592L, 7903L, 8137L, 7902L, 3563L, 5922L, 5067L) results <- svytotal( ~ as.numeric( agep %in% 0:4 ) + as.numeric( agep %in% 5:9 ) + as.numeric( agep %in% 10:14 ) + as.numeric( agep %in% 15:19 ) + as.numeric( agep %in% 20:24 ) + as.numeric( agep %in% 25:34 ) + as.numeric( agep %in% 35:44 ) + as.numeric( agep %in% 45:54 ) + as.numeric( agep %in% 55:59 ) + as.numeric( agep %in% 60:64 ) + as.numeric( agep %in% 65:74 ) + as.numeric( agep %in% 75:84 ) + as.numeric( agep %in% 85:100 ) , acs_design ) stopifnot( all( round( coef( results ) , 0 ) == pums_estimate ) ) stopifnot( all( round( SE( results ) , 0 ) == pums_standard_error ) ) stopifnot( all( round( SE( results ) * 1.645 , 0 ) == pums_margin_of_error ) ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for ACS users, this code calculates the gini coefficient on complex sample survey data: library(convey) acs_design <- convey_prep( acs_design ) svygini( ~ hincp , acs_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for ACS users, this code replicates previously-presented examples: library(srvyr) acs_srvyr_design <- as_survey( acs_design ) Calculate the mean (average) of a linear variable, overall and by groups: acs_srvyr_design %>% summarize( mean = survey_mean( poverty_level , na.rm = TRUE ) ) acs_srvyr_design %>% group_by( cit ) %>% summarize( mean = survey_mean( poverty_level , na.rm = TRUE ) ) "],["area-health-resources-files-ahrf.html", "Area Health Resources Files (AHRF) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Area Health Resources Files (AHRF) National, state, and county-level data on health care professions, health facilities, population characteristics, health workforce training, hospital utilization and expenditure, and the environment. 
One table with one row per county and a second table with one row per state. Replaced annually with the latest available county- and state-level statistics. Compiled by the Bureau of Health Workforce at the Health Services and Resources Administration. Recommended Reading Two Methodology Documents: User Documentation for the County Area Health Resources File (AHRF) 2021-2022 Release Frequently Asked Questions One Haiku: # local aggregates # to spread merge join spline regress # like fresh buttered bread Download, Import, Preparation Download and import the most current county-level file: library(haven) tf <- tempfile() ahrf_url <- "https://data.hrsa.gov//DataDownload/AHRF/AHRF_2021-2022_SAS.zip" download.file( ahrf_url , tf , mode = 'wb' ) unzipped_files <- unzip( tf , exdir = tempdir() ) sas_fn <- grep( "\\\\.sas7bdat$" , unzipped_files , value = TRUE ) ahrf_tbl <- read_sas( sas_fn ) ahrf_df <- data.frame( ahrf_tbl ) names( ahrf_df ) <- tolower( names( ahrf_df ) ) Save Locally   Save the object at any point: # ahrf_fn <- file.path( path.expand( "~" ) , "AHRF" , "this_file.rds" ) # saveRDS( ahrf_df , file = ahrf_fn , compress = FALSE ) Load the same object: # ahrf_df <- readRDS( ahrf_fn ) Variable Recoding Add new columns to the data set: ahrf_df <- transform( ahrf_df , cbsa_indicator_code = factor( as.numeric( f1406720 ) , levels = 0:2 , labels = c( "not metro" , "metro" , "micro" ) ) , mhi_2020 = f1322620 , whole_county_hpsa_2022 = as.numeric( f0978722 ) == 1 , census_region = factor( as.numeric( f04439 ) , levels = 1:4 , labels = c( "northeast" , "midwest" , "south" , "west" ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( ahrf_df ) table( ahrf_df[ , "cbsa_indicator_code" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( ahrf_df[ , "census_region" ] ) ) prop.table( table( ahrf_df[ , c( "census_region" , "cbsa_indicator_code" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( ahrf_df[ , "mhi_2020" ] , 0.5 , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to California: sub_ahrf_df <- subset( ahrf_df , f12424 == "CA" ) Calculate the mean (average) of this subset: mean( sub_ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( mhi_2020 ~ whole_county_hpsa_2022 , ahrf_df ) Perform a chi-squared test of association: this_table <- table( ahrf_df[ , c( "whole_county_hpsa_2022" , "census_region" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( mhi_2020 ~ whole_county_hpsa_2022 + census_region , data = ahrf_df ) summary( glm_result ) Replication 
Example Match the record count in row number 8,543 of AHRF 2021-2022 Technical Documentation.xlsx: stopifnot( nrow( ahrf_df ) == 3232 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for AHRF users, this code replicates previously-presented examples: library(dplyr) ahrf_tbl <- as_tibble( ahrf_df ) Calculate the mean (average) of a linear variable, overall and by groups: ahrf_tbl %>% summarize( mean = mean( mhi_2020 , na.rm = TRUE ) ) ahrf_tbl %>% group_by( cbsa_indicator_code ) %>% summarize( mean = mean( mhi_2020 , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for AHRF users, this code replicates previously-presented examples: library(data.table) ahrf_dt <- data.table( ahrf_df ) Calculate the mean (average) of a linear variable, overall and by groups: ahrf_dt[ , mean( mhi_2020 , na.rm = TRUE ) ] ahrf_dt[ , mean( mhi_2020 , na.rm = TRUE ) , by = cbsa_indicator_code ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for AHRF users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'ahrf' , ahrf_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( mhi_2020 ) FROM ahrf' ) dbGetQuery( con , 'SELECT cbsa_indicator_code , AVG( mhi_2020 ) FROM ahrf GROUP BY cbsa_indicator_code' ) "],["american-housing-survey-ahs.html", "American Housing Survey (AHS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " American Housing Survey (AHS) The nationwide assessment of housing stock, with information on physical condition and neighborhood, costs of financing and maintenance, owner and renter characteristics, and changes over time. Nationally-representative and metropolitan flat files with one row per household, plus relational files. A complex sample survey of occupied and vacant housing units designed to generalize to all structures in the United States, both nationally and also for about thirty-five metropolitan areas. Released more or less biennially since 1973, with longitudinal samples redrawn in 1985 and 2015. Sponsored by the Department of Housing and Urban Development, run by the Census Bureau. 
Recommended Reading Four Example Strengths & Limitations: ✔️ National, state, and metro area geographies ✔️ Housing unit-focused questionnaire provides greater detail on housing stock ❌ Housing unit-focused questionnaire asks fewer detailed questions of occupants on some topics ❌ Underreported estimate of adjustable rate mortgages Three Example Findings: In 2017, 21% of residences nationwide did not have adequate space for COVID-19 isolation. From 1991 to 2017, single men earned 1.5% higher housing investment returns vs. women. More than a quarter of a million households receiving HUD rental assistance lacked accessibility features but had a member using a mobility device (like a wheelchair or walker) in 2019. Two Methodology Documents: Getting Started with the Public Use File: 2015 to 2023 Wikipedia Entry One Haiku: # real estate supply # half bath addition, raised roof # vent, rent too damn high Download, Import, Preparation Download and import the national 2023 flat file: library(haven) library(httr) tf <- tempfile() this_url <- paste0( "https://www2.census.gov/programs-surveys/ahs/" , "2023/AHS%202023%20National%20PUF%20v1.0%20Flat%20SAS.zip" ) GET( this_url , write_disk( tf ) , progress() ) ahs_tbl <- read_sas( tf ) ahs_df <- data.frame( ahs_tbl ) names( ahs_df ) <- tolower( names( ahs_df ) ) Save Locally   Save the object at any point: # ahs_fn <- file.path( path.expand( "~" ) , "AHS" , "this_file.rds" ) # saveRDS( ahs_df , file = ahs_fn , compress = FALSE ) Load the same object: # ahs_df <- readRDS( ahs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) ahs_design <- svrepdesign( weights = ~ weight , repweights = "repweight[1-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , mse = TRUE , data = ahs_df ) Variable Recoding Add new columns to the data set: ahs_design <- update( ahs_design , one = 1 , tenure = factor( ifelse( tenure %in% c( -6 , 'N' ) , 4 , tenure ) , levels = 1:4 , labels = c( 'Owned or being bought' , 'Rented for cash rent' , 'Occupied without payment of cash rent' , 'Not occupied' ) ) , lotsize = factor( lotsize , levels = 1:7 , labels = c( "Less than 1/8 acre" , "1/8 up to 1/4 acre" , "1/4 up to 1/2 acre" , "1/2 up to 1 acre" , "1 up to 5 acres" , "5 up to 10 acres" , "10 acres or more" ) ) , below_poverty = as.numeric( perpovlvl < 100 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( ahs_design , "sampling" ) != 0 ) svyby( ~ one , ~ tenure , ahs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , ahs_design ) svyby( ~ one , ~ tenure , ahs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ totrooms , ahs_design , na.rm = TRUE ) svyby( ~ totrooms , ~ tenure , ahs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ lotsize , ahs_design , na.rm = TRUE ) svyby( ~ lotsize , ~ tenure , ahs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ totrooms , ahs_design , na.rm = TRUE ) svyby( ~ totrooms , ~ tenure , ahs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ lotsize , ahs_design , na.rm = TRUE ) svyby( ~ lotsize , ~ tenure , ahs_design , svytotal , na.rm = TRUE )
Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ totrooms , ahs_design , 0.5 , na.rm = TRUE ) svyby( ~ totrooms , ~ tenure , ahs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ totrooms , denominator = ~ rent , ahs_design , na.rm = TRUE ) Subsetting Restrict the survey design to homes with a garage or carport: sub_ahs_design <- subset( ahs_design , garage == 1 ) Calculate the mean (average) of this subset: svymean( ~ totrooms , sub_ahs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ totrooms , ahs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ totrooms , ~ tenure , ahs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ahs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ totrooms , ahs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ totrooms , ahs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ totrooms , ahs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ below_poverty , ahs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( totrooms ~ below_poverty , ahs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ below_poverty + lotsize , ahs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( totrooms ~ below_poverty + lotsize , ahs_design ) summary( glm_result ) Replication Example This example matches the estimate and margin of error of the Total row of the General Housing tab from the AHS 2023 Table Specifications and PUF Estimates for User Verification: result <- svytotal( ~ as.numeric( intstatus == 1 ) , ahs_design ) stopifnot( round( coef( result ) / 1000 , 0 ) == 133231 ) ci_results <- confint( result , level = 0.9 ) stopifnot( round( ( ci_results[ 2 ] - coef( result ) ) / 1000 , 0 ) == 381 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for AHS users, this code replicates previously-presented examples: library(srvyr) ahs_srvyr_design <- as_survey( ahs_design ) Calculate the mean (average) of a linear variable, overall and by groups: ahs_srvyr_design %>% summarize( mean = survey_mean( totrooms , na.rm = TRUE ) ) ahs_srvyr_design %>% group_by( tenure ) %>% summarize( mean = survey_mean( totrooms , na.rm = TRUE ) ) "],["american-national-election-studies-anes.html", "American National Election Studies (ANES) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " American National Election Studies (ANES) A time series recording belief, public opinion, and political participation back to Dewey vs. Truman. Most tables contain one row per sampled eligible voter, varying weights like pre- and post-election. A complex sample generalizing to eligible voters in the U.S. with some panels to follow individuals. Core studies released quadrennially (presidential elections), plus pilot studies (often at midterms). Administered by a consortium of universities and funded by the National Science Foundation. Recommended Reading Four Example Strengths & Limitations: ✔️ Time series studies interview both before and after quadrennial elections ✔️ Instrument design tested in smaller study prior to inclusion ❌ Turnout errors are part of a long-standing problem of turnout over-estimation in surveys ❌ Prior survey questions not always asked again Three Example Findings: Younger Americans were less politically polarized than older Americans in 2020. In 2020, 90% of Biden and Trump voters also opted for a congressional candidate of the same party. Between 1996 and 2016, demographic groups least likely to use the Internet and social media experienced larger changes in political polarization than those more likely to use the Internet. 
Two Methodology Documents: ANES 2020 Time Series Study Full Release: User Guide and Codebook How to Analyze ANES Survey Data One Haiku: # chez sacrificed queen # quadrennial bloodless coup # knight churchill's least worst Function Definitions Define a function to import a stata file as a data.frame: library(haven) anes_import_dta <- function( this_fn ){ this_tbl <- read_dta( this_fn ) this_tbl <- zap_labels( this_tbl ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Register for the ANES Data Center at https://electionstudies.org/ Choose 2020 Time Series Study Download the STATA version of the February 10, 2022 file: library(haven) anes_fn <- file.path( path.expand( "~" ) , "anes_timeseries_2020_stata_20220210.dta" ) anes_df <- anes_import_dta( anes_fn ) Save Locally   Save the object at any point: # anes_fn <- file.path( path.expand( "~" ) , "ANES" , "this_file.rds" ) # saveRDS( anes_df , file = anes_fn , compress = FALSE ) Load the same object: # anes_df <- readRDS( anes_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) anes_design <- svydesign( ids = ~ v200010c , strata = ~ v200010d , weights = ~ v200010a , data = subset( anes_df , v200010a > 0 ) , nest = TRUE ) Variable Recoding Add new columns to the data set: anes_design <- update( anes_design , one = 1 , democratic_party_rating = ifelse( v201156 %in% 0:100 , v201156 , NA ) , republican_party_rating = ifelse( v201157 %in% 0:100 , v201157 , NA ) , primary_voter = ifelse( v201020 %in% 1:2 , as.numeric( v201020 == 1 ) , NA ) , think_gov_spend_least = factor( v201645 , levels = 1:4 , labels = c( 'foreign aid (correct)' , 'medicare' , 'national defense' , 'social security' ) ) , undoc_kids = factor( v201423x , levels = 1:6 , labels = c( 'should sent back - favor a great deal' , 'should sent back - favor a moderate amount' , 'should sent back - favor a little' , 'should allow to stay - favor a little' , 'should allow to stay - favor a moderate amount' , 'should allow to stay - favor a great deal' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( anes_design , "sampling" ) != 0 ) svyby( ~ one , ~ undoc_kids , anes_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , anes_design ) svyby( ~ one , ~ undoc_kids , anes_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ republican_party_rating , anes_design , na.rm = TRUE ) svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ think_gov_spend_least , anes_design , na.rm = TRUE ) svyby( ~ think_gov_spend_least , ~ undoc_kids , anes_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ republican_party_rating , anes_design , na.rm = TRUE ) svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ think_gov_spend_least , anes_design , na.rm = TRUE ) svyby( ~ think_gov_spend_least , ~ undoc_kids , anes_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: 
svyquantile( ~ republican_party_rating , anes_design , 0.5 , na.rm = TRUE ) svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ republican_party_rating , denominator = ~ democratic_party_rating , anes_design , na.rm = TRUE ) Subsetting Restrict the survey design to party id: independent: sub_anes_design <- subset( anes_design , v201231x == 4 ) Calculate the mean (average) of this subset: svymean( ~ republican_party_rating , sub_anes_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ republican_party_rating , anes_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( anes_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ republican_party_rating , anes_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ republican_party_rating , anes_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ republican_party_rating , anes_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ primary_voter , anes_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( republican_party_rating ~ primary_voter , anes_design ) Perform a chi-squared test of association for survey data: svychisq( ~ primary_voter + think_gov_spend_least , anes_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( republican_party_rating ~ primary_voter + think_gov_spend_least , anes_design ) summary( glm_result ) Replication Example This example matches statistics and standard errors in the Age rows of the ANES respondents (weighted) column of Table 1A from Benchmark and Attrition Report for the ANES 2016 Time Series Study: Log in to the ANES Data Center at https://electionstudies.org/ Choose 2016 Time Series Study. 
Download the DTA version of the September 4, 2019 file Download the DTA version of the Methodology File December 10, 2018 anes2016_fn <- file.path( path.expand( "~" ) , "anes_timeseries_2016.dta" ) anes2016_df <- anes_import_dta( anes2016_fn ) method2016_fn <- file.path( path.expand( "~" ) , "anes_timeseries_2016_methodology_dta.dta" ) method2016_df <- anes_import_dta( method2016_fn ) before_nrow <- nrow( anes2016_df ) anes2016_df <- merge( anes2016_df , method2016_df , by = 'v160001' ) stopifnot( nrow( anes2016_df ) == before_nrow ) anes2016_df[ , 'age_categories' ] <- factor( findInterval( anes2016_df[ , 'v161267' ] , c( 18 , seq( 30 , 70 , 10 ) ) ) , levels = 1:6 , labels = c( '18-29' , '30-39' , '40-49' , '50-59' , '60-69' , '70 or older' ) ) anes2016_design <- svrepdesign( data = subset( anes2016_df , v160101f > 0 ) , weights = ~ v160101f , repweights = 'weight_ftf_rkwt([0-9]+)' , type = 'JK1' , scale = 32 / 33 ) ( results <- svymean( ~ age_categories , anes2016_design , na.rm = TRUE ) ) published_results <- c( 0.21 , 0.158 , 0.156 , 0.2 , 0.147 , 0.129 ) published_standard_errors <- c( 0.0091 , 0.009 , 0.0094 , 0.0122 , 0.0069 , 0.0083 ) stopifnot( all( round( coef( results ) , 3 ) == published_results ) ) stopifnot( all( round( SE( results ) , 4 ) == published_standard_errors ) ) This example matches statistics and standard errors in the Age rows of the Design-consistent, with published strata column of Table 1 from How to Analyze ANES Survey Data: Log in to the ANES Data Center at https://electionstudies.org/ Choose 2004 Time Series Study Download the DTA version of the Full Release August 16, 2005 file Choose 2006 Pilot Study Download the DTA version of the April 26, 2007 file anes2004_fn <- file.path( path.expand( "~" ) , "anes2004TS.dta" ) anes2004_df <- anes_import_dta( anes2004_fn ) pilot2006_fn <- file.path( path.expand( "~" ) , "anes2006pilot.dta" ) pilot2006_df <- anes_import_dta( pilot2006_fn ) before_nrow <- nrow( pilot2006_df ) pilot2006_df <- merge( pilot2006_df , anes2004_df , by.x = 'v06p001' , by.y = 'v040001' ) stopifnot( nrow( pilot2006_df ) == before_nrow ) pilot2006_df[ , 'age_categories' ] <- factor( findInterval( pilot2006_df[ , 'v043250' ] , c( 18 , seq( 30 , 70 , 10 ) ) ) , levels = 1:6 , labels = c( '18-29' , '30-39' , '40-49' , '50-59' , '60-69' , '70 or older' ) ) pilot2006_design <- svydesign( id = ~v06p007b , strata = ~v06p007a , data = pilot2006_df , weights = ~v06p002 , nest = TRUE ) ( results <- svymean( ~ age_categories , pilot2006_design , na.rm = TRUE ) ) published_results <- c( 0.207 , 0.162 , 0.218 , 0.175 , 0.111 , 0.126 ) published_standard_errors <- c( 0.0251 , 0.024 , 0.022 , 0.0149 , 0.0125 , 0.0287 ) stopifnot( all( round( coef( results ) , 3 ) == published_results ) ) stopifnot( all( round( SE( results ) , 4 ) == published_standard_errors ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features.
As a starting point for ANES users, this code replicates previously-presented examples: library(srvyr) anes_srvyr_design <- as_survey( anes_design ) Calculate the mean (average) of a linear variable, overall and by groups: anes_srvyr_design %>% summarize( mean = survey_mean( republican_party_rating , na.rm = TRUE ) ) anes_srvyr_design %>% group_by( undoc_kids ) %>% summarize( mean = survey_mean( republican_party_rating , na.rm = TRUE ) ) "],["american-time-use-survey-atus.html", "American Time Use Survey (ATUS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " American Time Use Survey (ATUS) Sampled individuals write down everything they do for a single twenty-four hour period, in ten minute intervals. Time use data allows for the study of uncompensated work like cooking, chores, childcare. Many tables with structures described in the user guide, linkable to the Current Population Survey. A complex survey generalizing to person-hours among civilian non-institutional americans aged 15+. Released annually since 2003. Administered by the Bureau of Labor Statistics. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed respondent activity information ✔️ Network of international time use researchers ❌ Each individual respondent contributes only 24 hours of activity on “diary day” ❌ Limited sample sizes do not represent smaller geographic areas Three Example Findings: On average during 2021 and 2022, 37.1 million people in the US provided unpaid eldercare. Approximately 15% of working hours were performed at home in the US from 2011 to 2018. Low physical activity during 2014-2016 cannot be broadly attributed to limited leisure time. 
Two Methodology Documents: American Time Use Survey User’s Guide Wikipedia Entry One Haiku: # don't judge me bruno # eat one hour, sleep the rest # it's my lazy day Function Definitions Define a function to download, unzip, and import each comma-separated value dat file: library(httr) atus_csv_import <- function( this_url ){ this_tf <- tempfile() this_dl <- GET( this_url , user_agent( "email@address.com") ) writeBin( content( this_dl ) , this_tf ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) this_dat <- grep( '\\\\.dat$' , unzipped_files , value = TRUE ) this_df <- read.csv( this_dat ) file.remove( c( this_tf , unzipped_files ) ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the activity, respondent, roster, and weights tables: act_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusact-2023.zip" ) resp_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusresp-2023.zip" ) rost_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusrost-2023.zip" ) wgts_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atuswgts-2023.zip" ) Specify which variables to keep in each of the data.frame objects: act_df <- act_df[ c( 'tucaseid' , 'tutier1code' , 'tutier2code' , 'tuactdur24' ) ] resp_df <- resp_df[ c( 'tucaseid' , 'tufinlwgt' , 'tulineno' ) ] rost_df <- rost_df[ , c( 'tucaseid' , 'tulineno' , 'teage' , 'tesex' ) ] Distribute travel-related activities (tutier1code == 18 from the lexicon) based on their second tier code: act_df[ act_df[ , 'tutier1code' ] == 18 & act_df[ , 'tutier2code' ] == 99 , 'tutier1code' ] <- 50 act_df[ act_df[ , 'tutier1code' ] == 18 , 'tutier1code' ] <- act_df[ act_df[ , 'tutier1code' ] == 18 , 'tutier2code' ] Sum up all durations at the (respondent x major activity category)-level: act_long_df <- aggregate( tuactdur24 ~ tucaseid + tutier1code , data = act_df , sum ) act_wide_df <- reshape( act_long_df , idvar = 'tucaseid' , timevar = 'tutier1code' , direction = 'wide' ) # for individuals not engaging in an activity category, replace missings with zero minutes act_wide_df[ is.na( act_wide_df ) ] <- 0 # for all columns except the respondent identifier, convert minutes to hours act_wide_df[ , -1 ] <- act_wide_df[ , -1 ] / 60 Merge the respondent and summed activity tables, then the roster table, and finally the replicate weights: resp_act_df <- merge( resp_df , act_wide_df ) stopifnot( nrow( resp_act_df ) == nrow( resp_df ) ) resp_act_rost_df <- merge( resp_act_df , rost_df ) stopifnot( nrow( resp_act_rost_df ) == nrow( resp_df ) ) atus_df <- merge( resp_act_rost_df , wgts_df ) stopifnot( nrow( atus_df ) == nrow( resp_df ) ) # remove dots from column names names( atus_df ) <- gsub( "\\\\." 
, "_" , names( atus_df ) ) atus_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # atus_fn <- file.path( path.expand( "~" ) , "ATUS" , "this_file.rds" ) # saveRDS( atus_df , file = atus_fn , compress = FALSE ) Load the same object: # atus_df <- readRDS( atus_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) atus_design <- svrepdesign( weights = ~ tufinlwgt , repweights = "finlwgt[0-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , mse = TRUE , data = atus_df ) Variable Recoding Add new columns to the data set: # caring for and helping household members is top level 03 from the lexicon # https://www.bls.gov/tus/lexicons/lexiconwex2023.pdf atus_design <- update( atus_design , any_care = as.numeric( tuactdur24_3 > 0 ) , tesex = factor( tesex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , age_category = factor( 1 + findInterval( teage , c( 18 , 35 , 65 ) ) , labels = c( "under 18" , "18 - 34" , "35 - 64" , "65 or older" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( atus_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_category , atus_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , atus_design ) svyby( ~ one , ~ age_category , atus_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ tuactdur24_1 , atus_design ) svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ tesex , atus_design ) svyby( ~ tesex , ~ age_category , atus_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ tuactdur24_1 , atus_design ) svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ tesex , atus_design ) svyby( ~ tesex , ~ age_category , atus_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ tuactdur24_1 , atus_design , 0.5 ) svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ tuactdur24_5 , denominator = ~ tuactdur24_12 , atus_design ) Subsetting Restrict the survey design to any time volunteering: sub_atus_design <- subset( atus_design , tuactdur24_15 > 0 ) Calculate the mean (average) of this subset: svymean( ~ tuactdur24_1 , sub_atus_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ tuactdur24_1 , atus_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( atus_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ tuactdur24_1 , atus_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ tuactdur24_1 , atus_design , deff = TRUE ) # SRS with replacement 
svymean( ~ tuactdur24_1 , atus_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ any_care , atus_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( tuactdur24_1 ~ any_care , atus_design ) Perform a chi-squared test of association for survey data: svychisq( ~ any_care + tesex , atus_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( tuactdur24_1 ~ any_care + tesex , atus_design ) summary( glm_result ) Replication Example This example matches the “Caring for and helping household members” row of Table A-1: hours_per_day_civilian_population <- svymean( ~ tuactdur24_3 , atus_design ) stopifnot( round( coef( hours_per_day_civilian_population ) , 2 ) == 0.5 ) percent_engaged_per_day <- svymean( ~ any_care , atus_design ) stopifnot( round( coef( percent_engaged_per_day ) , 3 ) == 0.22 ) hours_per_day_among_engaged <- svymean( ~ tuactdur24_3 , subset( atus_design , any_care ) ) stopifnot( round( coef( hours_per_day_among_engaged ) , 2 ) == 2.29 ) This example matches the average hours and SE from Section 7.5 of the User’s Guide: Download and import the activity, activity summary, respondent, and weights tables: actsum07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atussum_2007.zip" ) resp07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusresp_2007.zip" ) act07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusact_2007.zip" ) wgts07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atuswgts_2007.zip" ) Option 1. Sum the two television fields from the activity summary file, removing zeroes: television_per_person <- data.frame( tucaseid = actsum07_df[ , 'tucaseid' ] , tuactdur24 = rowSums( actsum07_df[ , c( 't120303' , 't120304' ) ] ) ) television_per_person <- television_per_person[ television_per_person[ , 'tuactdur24' ] > 0 , ] Option 2. 
Limit the activity file to television watching records according to the 2007 Lexicon: television_activity <- subset( act07_df , tutier1code == 12 & tutier2code == 3 & tutier3code %in% 3:4 ) television_activity_summed <- aggregate( tuactdur24 ~ tucaseid , data = television_activity , sum ) Confirm both aggregation options yield the same results: stopifnot( all( television_per_person[ , 'tucaseid' ] == television_activity_summed[ , 'tucaseid' ] ) ) stopifnot( all( television_per_person[ , 'tuactdur24' ] == television_activity_summed[ , 'tuactdur24' ] ) ) Merge the respondent and summed activity tables, then the replicate weights: resp07_tpp_df <- merge( resp07_df[ , c( 'tucaseid' , 'tufinlwgt' ) ] , television_per_person , all.x = TRUE ) stopifnot( nrow( resp07_tpp_df ) == nrow( resp07_df ) ) # for individuals without television time, replace missings with zero minutes resp07_tpp_df[ is.na( resp07_tpp_df[ , 'tuactdur24' ] ) , 'tuactdur24' ] <- 0 # convert minutes to hours resp07_tpp_df[ , 'tuactdur24_hour' ] <- resp07_tpp_df[ , 'tuactdur24' ] / 60 atus07_df <- merge( resp07_tpp_df , wgts07_df ) stopifnot( nrow( atus07_df ) == nrow( resp07_df ) ) Construct a complex sample survey design: atus07_design <- svrepdesign( weights = ~ tufinlwgt , repweights = "finlwgt[0-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , data = atus07_df ) Match the statistic and SE of the number of hours daily that americans older than 14 watch tv: result <- svymean( ~ tuactdur24_hour , atus07_design ) stopifnot( round( coef( result ) , 2 ) == 2.62 ) stopifnot( round( SE( result ) , 4 ) == 0.0293 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for ATUS users, this code replicates previously-presented examples: library(srvyr) atus_srvyr_design <- as_survey( atus_design ) Calculate the mean (average) of a linear variable, overall and by groups: atus_srvyr_design %>% summarize( mean = survey_mean( tuactdur24_1 ) ) atus_srvyr_design %>% group_by( age_category ) %>% summarize( mean = survey_mean( tuactdur24_1 ) ) "],["behavioral-risk-factor-surveillance-system-brfss.html", "Behavioral Risk Factor Surveillance System (BRFSS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Behavioral Risk Factor Surveillance System (BRFSS) A health behavior telephone interview survey with enough sample size to examine all fifty states. One table with one row per telephone respondent. A complex survey designed to generalize to the civilian non-institutional adult population of the U.S. Released annually since 1984 but all states did not participate until 1994. Administered by the Centers for Disease Control and Prevention. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Wide variety of disease surveillance utilities across every state ✔️ Sufficient sample to examine selected cities and counties in addition to all states ❌ Not every topical module asked in every state ❌ Kentucky and Pennsylvania were unable to collect enough data for the public use file in 2023 Three Example Findings: Among adults in 2000, 52% of ever smokers had quit smoking, and this ratio rose to 61% by 2019. By 2030, 49% of US adults will have obesity, and in every state this rate will be above 35%. Disabled Iowan adults in 2019 were more than three times more likely to indicate having depression. Two Methodology Documents: BRFSS Data User Guide Wikipedia Entry One Haiku: # a cellphone vibrates # it's the cdc! asking # if you ate veggies Download, Import, Preparation Download and import the national file: library(haven) zip_tf <- tempfile() zip_url <- "https://www.cdc.gov/brfss/annual_data/2023/files/LLCP2023XPT.zip" download.file( zip_url , zip_tf , mode = 'wb' ) brfss_tbl <- read_xpt( zip_tf ) brfss_df <- data.frame( brfss_tbl ) names( brfss_df ) <- tolower( names( brfss_df ) ) brfss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # brfss_fn <- file.path( path.expand( "~" ) , "BRFSS" , "this_file.rds" ) # saveRDS( brfss_df , file = brfss_fn , compress = FALSE ) Load the same object: # brfss_df <- readRDS( brfss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) variables_to_keep <- c( 'one' , 'x_psu' , 'x_ststr' , 'x_llcpwt' , 'genhlth' , 'medcost1' , 'x_state' , 'x_age80' , 'physhlth' , 'menthlth' , 'x_hlthpl1' ) brfss_df <- brfss_df[ variables_to_keep ] brfss_national_design <- svydesign( id = ~ x_psu , strata = ~ x_ststr , data = brfss_df , weight = ~ x_llcpwt , nest = TRUE ) Since large linearized survey designs execute slowly, a replication design might be preferable for exploratory analysis. Coefficients (such as means and medians) do not change; standard errors and confidence intervals differ slightly.
The initial conversion with as.svrepdesign requires an extended period of processing time (perhaps run once overnight), subsequent analyses will finish much faster: # brfss_replication_design <- # as.svrepdesign( # brfss_national_design , # type = 'bootstrap' # ) # system.time( print( svymean( ~ x_age80 , brfss_national_design ) ) ) # system.time( print( svymean( ~ x_age80 , brfss_replication_design ) ) ) In this example, limit the national design to only Alaska for quicker processing: brfss_design <- subset( brfss_national_design , x_state %in% 2 ) Variable Recoding Add new columns to the data set: brfss_design <- update( brfss_design , fair_or_poor_health = ifelse( genhlth %in% 1:5 , as.numeric( genhlth > 3 ) , NA ) , no_doc_visit_due_to_cost = factor( medcost1 , levels = c( 1 , 2 , 7 , 9 ) , labels = c( "yes" , "no" , "dk" , "rf" ) ) , physhlth_days_not_good = ifelse( physhlth <= 30 , physhlth , ifelse( physhlth == 88 , 0 , NA ) ) , menthlth_days_not_good = ifelse( menthlth <= 30 , menthlth , ifelse( menthlth == 88 , 0 , NA ) ) , state_name = factor( x_state , levels = c(1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 66, 72, 78) , labels = c("ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING", "GUAM", "PUERTO RICO", "U.S. 
VIRGIN ISLANDS") ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( brfss_design , "sampling" ) != 0 ) svyby( ~ one , ~ state_name , brfss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , brfss_design ) svyby( ~ one , ~ state_name , brfss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ x_age80 , brfss_design ) svyby( ~ x_age80 , ~ state_name , brfss_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ no_doc_visit_due_to_cost , brfss_design , na.rm = TRUE ) svyby( ~ no_doc_visit_due_to_cost , ~ state_name , brfss_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ x_age80 , brfss_design ) svyby( ~ x_age80 , ~ state_name , brfss_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ no_doc_visit_due_to_cost , brfss_design , na.rm = TRUE ) svyby( ~ no_doc_visit_due_to_cost , ~ state_name , brfss_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ x_age80 , brfss_design , 0.5 ) svyby( ~ x_age80 , ~ state_name , brfss_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ physhlth_days_not_good , denominator = ~ menthlth_days_not_good , brfss_design , na.rm = TRUE ) Subsetting Restrict the survey design to persons without health insurance: sub_brfss_design <- subset( brfss_design , x_hlthpl1 == 2 ) Calculate the mean (average) of this subset: svymean( ~ x_age80 , sub_brfss_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ x_age80 , brfss_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ x_age80 , ~ state_name , brfss_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( brfss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ x_age80 , brfss_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ x_age80 , brfss_design , deff = TRUE ) # SRS with replacement svymean( ~ x_age80 , brfss_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ fair_or_poor_health , brfss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( x_age80 ~ fair_or_poor_health , brfss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ fair_or_poor_health + no_doc_visit_due_to_cost , brfss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( x_age80 ~ fair_or_poor_health + no_doc_visit_due_to_cost , brfss_design ) summary( glm_result ) Replication Example This example matches Alaska’s confidence intervals from the BRFSS Prevalence & Trends Data: result <- svymean( ~ no_doc_visit_due_to_cost , subset( brfss_design , no_doc_visit_due_to_cost %in% c( 'yes' , 'no' ) ) , na.rm = TRUE ) stopifnot( round( coef( result )[1] , 3 ) == 0.111 ) stopifnot( round( confint( result )[ 1 , 1 ] , 3 ) == 0.098 ) stopifnot( round( confint( result )[ 1 , 2 ] , 3 ) == 0.123 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for BRFSS users, this code replicates previously-presented examples: library(srvyr) brfss_srvyr_design <- as_survey( brfss_design ) Calculate the mean (average) of a linear variable, overall and by groups: brfss_srvyr_design %>% summarize( mean = survey_mean( x_age80 ) ) brfss_srvyr_design %>% group_by( state_name ) %>% summarize( mean = survey_mean( x_age80 ) ) "],["consumer-expenditure-survey-ces.html", "Consumer Expenditure Survey (CES) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Consumer Expenditure Survey (CES) A household budget survey designed to guide major economic indicators like the Consumer Price Index. One table of survey responses per quarter with one row per sampled household (consumer unit). Additional tables containing one record per expenditure. A complex sample survey designed to generalize to the civilian non-institutional U.S. population. Released annually since 1996. Administered by the Bureau of Labor Statistics. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed expenditure categories ✔️ Respondents keep spending diaries for two consecutive 1-week periods ❌ Measures purchases but not consumption ❌ Consumer unit definition differs from households or families in other surveys Three Example Findings: In 2022, one third of total nationwide expenditures were attributed to housing-related expenses. Between 2015 and early 2022, male household heads consumed a greater proportion of resources (33%) compared to female household heads (28%), who, in turn, consumed more than children (23%). In 2020, if income increased by $100, spending on all food and alcohol increased by $14 on average. Two Methodology Documents: Consumer Expenditure Surveys Public Use Microdata Getting Started Guide Wikipedia Entry One Haiku: # price indices and # you spent how much on beans, jack?
# pocketbook issues Download, Import, Preparation Download both the prior and current year of interview microdata: library(httr) tf_prior_year <- tempfile() this_url_prior_year <- "https://www.bls.gov/cex/pumd/data/stata/intrvw22.zip" dl_prior_year <- GET( this_url_prior_year , user_agent( "email@address.com" ) ) writeBin( content( dl_prior_year ) , tf_prior_year ) unzipped_files_prior_year <- unzip( tf_prior_year , exdir = tempdir() ) tf_current_year <- tempfile() this_url_current_year <- "https://www.bls.gov/cex/pumd/data/stata/intrvw23.zip" dl_current_year <- GET( this_url_current_year , user_agent( "email@address.com" ) ) writeBin( content( dl_current_year ) , tf_current_year ) unzipped_files_current_year <- unzip( tf_current_year , exdir = tempdir() ) unzipped_files <- c( unzipped_files_current_year , unzipped_files_prior_year ) Import and stack all 2023 quarterly files plus 2024’s first quarter: library(haven) fmli_files <- grep( "fmli2[3-4]" , unzipped_files , value = TRUE ) fmli_tbls <- lapply( fmli_files , read_dta ) fmli_dfs <- lapply( fmli_tbls , data.frame ) fmli_dfs <- lapply( fmli_dfs , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) fmli_cols <- lapply( fmli_dfs , names ) intersecting_cols <- Reduce( intersect , fmli_cols ) fmli_dfs <- lapply( fmli_dfs , function( w ) w[ intersecting_cols ] ) ces_df <- do.call( rbind , fmli_dfs ) Scale the weight columns based on the number of months in 2023: ces_df[ , c( 'qintrvyr' , 'qintrvmo' ) ] <- sapply( ces_df[ , c( 'qintrvyr' , 'qintrvmo' ) ] , as.numeric ) weight_columns <- grep( 'wt' , names( ces_df ) , value = TRUE ) ces_df <- transform( ces_df , mo_scope = ifelse( qintrvyr %in% 2023 & qintrvmo %in% 1:3 , qintrvmo - 1 , ifelse( qintrvyr %in% 2024 , 4 - qintrvmo , 3 ) ) ) for ( this_column in weight_columns ){ ces_df[ is.na( ces_df[ , this_column ] ) , this_column ] <- 0 ces_df[ , paste0( 'popwt_' , this_column ) ] <- ( ces_df[ , this_column ] * ces_df[ , 'mo_scope' ] / 12 ) } Combine previous quarter and current quarter variables into a single variable: expenditure_variables <- gsub( "pq$" , "" , grep( "pq$" , names( ces_df ) , value = TRUE ) ) # confirm that for every variable ending in pq, # there's the same variable ending in cq stopifnot( all( paste0( expenditure_variables , 'cq' ) %in% names( ces_df ) ) ) # confirm none of the variables without the pq or cq suffix exist if( any( expenditure_variables %in% names( ces_df ) ) ) stop( "variable conflict" ) for( this_column in expenditure_variables ){ ces_df[ , this_column ] <- rowSums( ces_df[ , paste0( this_column , c( 'pq' , 'cq' ) ) ] , na.rm = TRUE ) # annualize the quarterly spending ces_df[ , this_column ] <- 4 * ces_df[ , this_column ] ces_df[ is.na( ces_df[ , this_column ] ) , this_column ] <- 0 } Append any interview survey UCC found at https://www.bls.gov/cex/ce_source_integrate.xlsx: ucc_exp <- c( "450110" , "450210" ) mtbi_files <- grep( "mtbi2[3-4]" , unzipped_files , value = TRUE ) mtbi_tbls <- lapply( mtbi_files , read_dta ) mtbi_dfs <- lapply( mtbi_tbls , data.frame ) mtbi_dfs <- lapply( mtbi_dfs , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) mtbi_dfs <- lapply( mtbi_dfs , function( w ) w[ c( 'newid' , 'cost' , 'ucc' , 'ref_yr' ) ] ) mtbi_df <- do.call( rbind , mtbi_dfs ) mtbi_df <- subset( mtbi_df , ( ref_yr %in% 2023 ) & ( ucc %in% ucc_exp ) ) mtbi_agg <- aggregate( cost ~ newid , data = mtbi_df , sum ) names( mtbi_agg ) <- c( 'newid' , 'new_car_truck_exp' ) before_nrow <- nrow( ces_df ) ces_df <- merge( ces_df , mtbi_agg , all.x = TRUE 
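# all.x = TRUE keeps every consumer unit, including those with no new car or truck purchases; their missing costs are zeroed out below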
) stopifnot( nrow( ces_df ) == before_nrow ) ces_df[ is.na( ces_df[ , 'new_car_truck_exp' ] ) , 'new_car_truck_exp' ] <- 0 Save Locally   Save the object at any point: # ces_fn <- file.path( path.expand( "~" ) , "CES" , "this_file.rds" ) # saveRDS( ces_df , file = ces_fn , compress = FALSE ) Load the same object: # ces_df <- readRDS( ces_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Separate the ces_df data.frame into five implicates, each differing from the others only in the multiply-imputed variables: library(survey) library(mitools) # create a vector containing all of the multiply-imputed variables # (leaving the numbers off the end) mi_vars <- gsub( "5$" , "" , grep( "[a-z]5$" , names( ces_df ) , value = TRUE ) ) # loop through each of the five implicates.. for ( i in 1:5 ){ # copy the 'ces_df' table over to a new temporary data frame 'x' x <- ces_df # loop through each of the multiply-imputed variables.. for ( j in mi_vars ){ # copy the contents of the current column (for example 'welfare1') # over to a new column ending in 'mi' (for example 'welfaremi') x[ , paste0( j , 'mi' ) ] <- x[ , paste0( j , i ) ] # delete all five of the imputed variable columns x <- x[ , !( names( x ) %in% paste0( j , 1:5 ) ) ] } assign( paste0( 'imp' , i ) , x ) } ces_design <- svrepdesign( weights = ~ finlwt21 , repweights = "^wtrep[0-9][0-9]$" , data = imputationList( list( imp1 , imp2 , imp3 , imp4 , imp5 ) ) , type = "BRR" , combined.weights = TRUE , mse = TRUE ) Variable Recoding Add new columns to the data set: ces_design <- update( ces_design , one = 1 , any_food_stamp = as.numeric( jfs_amtmi > 0 ) , bls_urbn = factor( bls_urbn , levels = 1:2 , labels = c( 'urban' , 'rural' ) ) , sex_ref = factor( sex_ref , levels = 1:2 , labels = c( 'male' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( ces_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( ces_design , svyby( ~ one , ~ bls_urbn , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: MIcombine( with( ces_design , svytotal( ~ one ) ) ) MIcombine( with( ces_design , svyby( ~ one , ~ bls_urbn , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( ces_design , svymean( ~ totexp ) ) ) MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( ces_design , svymean( ~ sex_ref ) ) ) MIcombine( with( ces_design , svyby( ~ sex_ref , ~ bls_urbn , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( ces_design , svytotal( ~ totexp ) ) ) MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( ces_design , svytotal( ~ sex_ref ) ) ) MIcombine( with( ces_design , svyby( ~ sex_ref , ~ bls_urbn , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: MIcombine( with( ces_design , svyquantile( ~ totexp , 0.5 , se = TRUE ) ) ) MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: MIcombine( with( ces_design , svyratio( numerator = ~ totexp , denominator = ~ fincbtxmi )
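# with() runs the estimate on each of the five implicates; MIcombine() pools the results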
) ) Subsetting Restrict the survey design to california residents: sub_ces_design <- subset( ces_design , state == '06' ) Calculate the mean (average) of this subset: MIcombine( with( sub_ces_design , svymean( ~ totexp ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( ces_design , svymean( ~ totexp ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ces_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: MIcombine( with( ces_design , svyvar( ~ totexp ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( ces_design , svymean( ~ totexp , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( ces_design , svymean( ~ totexp , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ any_food_stamp , ces_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( totexp ~ any_food_stamp , ces_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ any_food_stamp + sex_ref , ces_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( ces_design , svyglm( totexp ~ any_food_stamp + sex_ref ) ) ) summary( glm_result ) Replication Example This example matches the number of consumer units and the Cars and trucks, new rows of Table R-1: result <- MIcombine( with( ces_design , svytotal( ~ as.numeric( popwt_finlwt21 / finlwt21 ) ) ) ) stopifnot( round( coef( result ) , -3 ) == 134556000 ) results <- sapply( weight_columns , function( this_column ){ sum( ces_df[ , 'new_car_truck_exp' ] * ces_df[ , this_column ] ) / sum( ces_df[ , paste0( 'popwt_' , this_column ) ] ) } ) stopifnot( round( results[1] , 2 ) == 2896.03 ) standard_error <- sqrt( ( 1 / 44 ) * sum( ( results[-1] - results[1] )^2 ) ) stopifnot( round( standard_error , 2 ) == 225.64 ) # note the minor differences MIcombine( with( ces_design , svymean( ~ cartkn ) ) ) "],["california-health-interview-survey-chis.html", "California Health Interview Survey (CHIS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " California Health Interview Survey (CHIS) California’s National Health Interview Survey (NHIS), a healthcare survey for the nation’s largest state. One adult, one teenage (12-17), and one child table, each with one row per sampled respondent. A complex survey designed to generalize to the civilian non-institutionalized population of California. Released annually since 2011, and biennially since 2001. Administered by the UCLA Center for Health Policy Research. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Neighborhood-level estimates ✔️ Oversamples allow targeted research questions ❌ Low response rates compared to nationwide surveys ❌ Two-year data periods reduce the precision of trend analyses Three Example Findings: In 2021, adults with limited English proficiency were less likely to use video or telephone telehealth. The share of non-citizen kids reporting excellent health increased from 2013-2015 to 2017-2019. Adults working from home had worse health behaviors and mental health than other workers in 2021. Two Methodology Documents: CHIS 2021-2022 Methodology Report Series, Report 1: Sample Design CHIS 2021-2022 Methodology Report Series, Report 5: Weighting and Variance Estimation One Haiku: # strike gold, movie star # play, third wish cali genie # statewide health survey Function Definitions Define a function to unzip and import each Stata file: library(haven) chis_import <- function( this_fn ){ these_files <- unzip( this_fn , exdir = tempdir() ) stata_fn <- grep( "ADULT\\\\.|CHILD\\\\.|TEEN\\\\." , these_files , value = TRUE ) this_tbl <- read_stata( stata_fn ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) # remove labelled classes labelled_cols <- sapply( this_df , function( w ) class( w )[1] == 'haven_labelled' ) this_df[ labelled_cols ] <- sapply( this_df[ labelled_cols ] , as.numeric ) this_df } Download, Import, Preparation Register at the UCLA Center for Health Policy Research at https://healthpolicy.ucla.edu/user/register. Choose Year: 2022, Age Group: Adult and Teen and Child, File Type: Stata. Download the 2022 Adult, Teen, and Child Stata files (version Oct 2023). Import the adult, teen, and child Stata tables into data.frame objects: chis_adult_df <- chis_import( file.path( path.expand( "~" ) , "adult_stata_2022.zip" ) ) chis_teen_df <- chis_import( file.path( path.expand( "~" ) , "teen_stata_2022.zip" ) ) chis_child_df <- chis_import( file.path( path.expand( "~" ) , "child_stata_2022.zip" ) ) Harmonize the general health condition variable across the three data.frame objects: chis_adult_df[ , 'general_health' ] <- c( 1 , 2 , 3 , 4 , 4 )[ chis_adult_df[ , 'ab1' ] ] chis_teen_df[ , 'general_health' ] <- chis_teen_df[ , 'tb1_p1' ] chis_child_df[ , 'general_health' ] <- c( 1 , 2 , 3 , 4 , 4 )[ chis_child_df[ , 'ca6' ] ] Add four age categories across the three data.frame objects: chis_adult_df[ , 'age_categories' ] <- ifelse( chis_adult_df[ , 'srage_p1' ] >= 65 , 4 , 3 ) chis_teen_df[ , 'age_categories' ] <- 2 chis_child_df[ , 'age_categories' ] <- 1 Harmonize the usual source of care variable across the three data.frame objects: chis_adult_df[ , 'no_usual_source_of_care' ] <- as.numeric( chis_adult_df[ , 'ah1v2' ] == 2 ) chis_teen_df[ , 'no_usual_source_of_care' ] <- as.numeric( chis_teen_df[ , 'tf1v2' ] == 2 ) chis_child_df[ , 'no_usual_source_of_care' ] <- as.numeric( chis_child_df[ , 'cd1v2' ] == 2 ) Add monthly fruit and vegetable counts to the adult data.frame object, blanking the other two: chis_adult_df[ , 'adult_fruits_past_month' ] <- chis_adult_df[ , 'ae2' ] chis_adult_df[ , 'adult_veggies_past_month' ] <- chis_adult_df[ , 'ae7' ] chis_teen_df[ , c( 'adult_fruits_past_month' , 'adult_veggies_past_month' ) ] <- NA chis_child_df[ , c( 'adult_fruits_past_month' , 'adult_veggies_past_month' ) ] <- NA Specify which variables to keep in each of the data.frame objects, then stack them: variables_to_keep <- c( grep( '^rakedw' , names( chis_adult_df ) , value = TRUE ) ,
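# the '^rakedw' pattern keeps the main weight rakedw0 along with every replicate weight column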
'general_health' , 'age_categories' , 'adult_fruits_past_month' , 'adult_veggies_past_month' , 'srsex' , 'povll2_p1v2' , 'no_usual_source_of_care' ) chis_df <- rbind( chis_child_df[ variables_to_keep ] , chis_teen_df[ variables_to_keep ] , chis_adult_df[ variables_to_keep ] ) Save Locally   Save the object at any point: # chis_fn <- file.path( path.expand( "~" ) , "CHIS" , "this_file.rds" ) # saveRDS( chis_df , file = chis_fn , compress = FALSE ) Load the same object: # chis_df <- readRDS( chis_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) chis_design <- svrepdesign( data = chis_df , weights = ~ rakedw0 , repweights = "rakedw[1-9]" , type = "other" , scale = 1 , rscales = 1 , mse = TRUE ) Variable Recoding Add new columns to the data set: chis_design <- update( chis_design , one = 1 , gender = factor( srsex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , age_categories = factor( age_categories , levels = 1:4 , labels = c( 'children under 12' , 'teens age 12-17' , 'adults age 18-64' , 'seniors' ) ) , general_health = factor( general_health , levels = 1:4 , labels = c( 'Excellent' , 'Very good' , 'Good' , 'Fair/Poor' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( chis_design , "sampling" ) != 0 ) svyby( ~ one , ~ general_health , chis_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , chis_design ) svyby( ~ one , ~ general_health , chis_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ povll2_p1v2 , chis_design ) svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ gender , chis_design ) svyby( ~ gender , ~ general_health , chis_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ povll2_p1v2 , chis_design ) svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ gender , chis_design ) svyby( ~ gender , ~ general_health , chis_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ povll2_p1v2 , chis_design , 0.5 ) svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ adult_fruits_past_month , denominator = ~ adult_veggies_past_month , chis_design , na.rm = TRUE ) Subsetting Restrict the survey design to seniors: sub_chis_design <- subset( chis_design , age_categories == 'seniors' ) Calculate the mean (average) of this subset: svymean( ~ povll2_p1v2 , sub_chis_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ povll2_p1v2 , chis_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( chis_design ) Calculate the complex sample survey-adjusted variance of any 
statistic: svyvar( ~ povll2_p1v2 , chis_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ povll2_p1v2 , chis_design , deff = TRUE ) # SRS with replacement svymean( ~ povll2_p1v2 , chis_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ no_usual_source_of_care , chis_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( povll2_p1v2 ~ no_usual_source_of_care , chis_design ) Perform a chi-squared test of association for survey data: svychisq( ~ no_usual_source_of_care + gender , chis_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( povll2_p1v2 ~ no_usual_source_of_care + gender , chis_design ) summary( glm_result ) Replication Example This matches the proportions and counts from AskCHIS. The standard errors do not match precisely, but the team at UCLA confirmed this survey design definition to be correct, and that the minor standard error and confidence interval differences should not impact any analyses from a statistical perspective: chis_adult_design <- svrepdesign( data = chis_adult_df , weights = ~ rakedw0 , repweights = "rakedw[1-9]" , type = "other" , scale = 1 , rscales = 1 , mse = TRUE ) chis_adult_design <- update( chis_adult_design , ab1 = factor( ab1 , levels = 1:5 , labels = c( 'Excellent' , 'Very good' , 'Good' , 'Fair' , 'Poor' ) ) ) this_proportion <- svymean( ~ ab1 , chis_adult_design ) stopifnot( round( coef( this_proportion ) , 3 ) == c( 0.183 , 0.340 , 0.309 , 0.139 , 0.029 ) ) this_count <- svytotal( ~ ab1 , chis_adult_design ) stopifnot( round( coef( this_count ) , -3 ) == c( 5414000 , 10047000 , 9138000 , 4106000 , 855000 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for CHIS users, this code replicates previously-presented examples: library(srvyr) chis_srvyr_design <- as_survey( chis_design ) Calculate the mean (average) of a linear variable, overall and by groups: chis_srvyr_design %>% summarize( mean = survey_mean( povll2_p1v2 ) ) chis_srvyr_design %>% group_by( general_health ) %>% summarize( mean = survey_mean( povll2_p1v2 ) ) "],["census-of-governments-cog.html", "Census of Governments (COG) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Census of Governments (COG) Location, employment, and payroll for state and local (but not federal) government agencies in the U.S. One record per agency, one per agency function, plus the government units master address file. Complete enumeration of civilian employment in state and local governments in the 50 states + D.C. The Annual Survey of Public Employment & Payroll becomes a census in years ending with 2 and 7. Administered and financed by the US Census Bureau. 
Recommended Reading Two Methodology Documents: 2022 Census of Governments, Survey of Public Employment & Payroll Methodology Government Units Survey Methodology One Haiku: # courthouse steps wedding # schools police fire water # no fed mail invite Download, Import, Preparation Download, import, and stack the government units listing file: library(readxl) tf_gus <- tempfile() gus_url <- "https://www2.census.gov/programs-surveys/gus/datasets/2022/govt_units_2022.ZIP" download.file( gus_url , tf_gus , mode = 'wb' ) unzipped_files_gus <- unzip( tf_gus , exdir = tempdir() ) xlsx_gus_fn <- grep( "\\\\.xlsx$" , unzipped_files_gus , value = TRUE ) xlsx_sheets <- excel_sheets( xlsx_gus_fn ) # read all sheets into a list of tibbles gus_tbl_list <- lapply( xlsx_sheets , function( w ) read_excel( xlsx_gus_fn , sheet = w ) ) # convert all tibbles to data.frame objects gus_df_list <- lapply( gus_tbl_list , data.frame ) # lowercase all column names gus_df_list <- lapply( gus_df_list , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) # add the excel tab source to each data.frame for( i in seq( xlsx_sheets ) ) gus_df_list[[ i ]][ , 'source_tab' ] <- xlsx_sheets[ i ] # determine which columns are in all tables column_intersect <- Reduce( intersect , lapply( gus_df_list , names ) ) # determine which columns are in some but not all tables column_union <- unique( unlist( lapply( gus_df_list , names ) ) ) # these columns will be discarded by stacking: unique( unlist( lapply( lapply( gus_df_list , names ) , function( w ) column_union[ !column_union %in% w ] ) ) ) # stack all excel sheets, keeping only the columns that all tables have in common gus_df <- Reduce( rbind , lapply( gus_df_list , function( w ) w[ column_intersect ] ) ) Download and import the survey of public employment & payroll, one record per function (not per unit): tf_apes <- tempfile() apes_url <- paste0( "https://www2.census.gov/programs-surveys/apes/datasets/" , "2022/2022%20COG-E%20Individual%20Unit%20Files.zip" ) download.file( apes_url , tf_apes , mode = 'wb' ) unzipped_files_apes <- unzip( tf_apes , exdir = tempdir() ) xlsx_apes_fn <- grep( "\\\\.xlsx$" , unzipped_files_apes , value = TRUE ) apes_tbl <- read_excel( xlsx_apes_fn ) apes_df <- data.frame( apes_tbl ) names( apes_df ) <- tolower( names( apes_df ) ) Review the non-matching records between these two tables, then merge: # all DEP School Districts and a third of Special Districts are not in the `apes_df` table( gus_df[ , 'census_id_gidid' ] %in% apes_df[ , 'individual.unit.id' ] , gus_df[ , 'source_tab' ] , useNA = 'always' ) # state governments are not in the `gus_df` table( apes_df[ , 'individual.unit.id' ] %in% gus_df[ , 'census_id_gidid' ] , apes_df[ , 'type.of.government' ] , useNA = 'always' ) # check for overlapping field names: ( overlapping_names <- intersect( names( apes_df ) , names( gus_df ) ) ) # rename the state column in `gus_df` to state abbreviation names( gus_df )[ names( gus_df ) == 'state' ] <- 'stateab' double_df <- merge( apes_df , gus_df , by.x = 'individual.unit.id' , by.y = 'census_id_gidid' , all.x = TRUE ) stopifnot( nrow( double_df ) == nrow( apes_df ) ) # replace dots with underscores names( double_df ) <- gsub( "\\\\." 
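# excel-derived names such as individual.unit.id become individual_unit_id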
, "_" , names( double_df ) ) Keep either the one record per agency rows or the one record per function rows: # `Total - All Government Employment Functions` records sum to the same as all other records: with( double_df , tapply( full_time_employees , grepl( "Total" , government_function ) , sum ) ) with( double_df , tapply( part_time_payroll , grepl( "Total" , government_function ) , sum ) ) # keep one record per government function (multiple records per agency): cog_df <- subset( double_df , !grepl( "Total" , government_function ) ) # keep one record per government agency: # cog_df <- subset( double_df , grepl( "Total" , government_function ) ) Save Locally   Save the object at any point: # cog_fn <- file.path( path.expand( "~" ) , "COG" , "this_file.rds" ) # saveRDS( cog_df , file = cog_fn , compress = FALSE ) Load the same object: # cog_df <- readRDS( cog_fn ) Variable Recoding Add new columns to the data set: cog_df <- transform( cog_df , one = 1 , total_payroll = full_time_payroll + part_time_payroll , total_employees = full_time_employees + part_time_employees , any_full_time_employees = full_time_employees > 0 ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( cog_df ) table( cog_df[ , "type_of_government" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( cog_df[ , "full_time_employees" ] ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , mean ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( cog_df[ , "census_region" ] ) ) prop.table( table( cog_df[ , c( "census_region" , "type_of_government" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( cog_df[ , "full_time_employees" ] ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , sum ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( cog_df[ , "full_time_employees" ] , 0.5 ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , quantile , 0.5 ) Subsetting Limit your data.frame to Elementary, Secondary, Higher, and Other Educational Government Agencies: sub_cog_df <- subset( cog_df , grepl( 'Education' , government_function ) ) Calculate the mean (average) of this subset: mean( sub_cog_df[ , "full_time_employees" ] ) Measures of Uncertainty Calculate the variance, overall and by groups: var( cog_df[ , "full_time_employees" ] ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , var ) Regression Models and Tests of Association Perform a t-test: t.test( full_time_employees ~ any_full_time_employees , cog_df ) Perform a chi-squared test of association: this_table <- table( cog_df[ , c( "any_full_time_employees" , "census_region" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( full_time_employees ~ any_full_time_employees + census_region , data = cog_df ) summary( glm_result ) Replication Example This example matches excel cell “C17” of Employment & Payroll Data by State and by Function: financial_admin_df <- subset( cog_df , government_function == 'Financial Administration' ) stopifnot( sum( financial_admin_df[ , 'full_time_employees' ] ) == 401394 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. 
dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for COG users, this code replicates previously-presented examples: library(dplyr) cog_tbl <- as_tibble( cog_df ) Calculate the mean (average) of a linear variable, overall and by groups: cog_tbl %>% summarize( mean = mean( full_time_employees ) ) cog_tbl %>% group_by( type_of_government ) %>% summarize( mean = mean( full_time_employees ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for COG users, this code replicates previously-presented examples: library(data.table) cog_dt <- data.table( cog_df ) Calculate the mean (average) of a linear variable, overall and by groups: cog_dt[ , mean( full_time_employees ) ] cog_dt[ , mean( full_time_employees ) , by = type_of_government ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for COG users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'cog' , cog_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( full_time_employees ) FROM cog' ) dbGetQuery( con , 'SELECT type_of_government , AVG( full_time_employees ) FROM cog GROUP BY type_of_government' ) "],["current-population-survey-cps.html", "Current Population Survey (CPS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Current Population Survey (CPS) The principal labor force survey, providing income, poverty, and health insurance coverage estimates. One table with one row per sampled household, a second table with one row per family within each sampled household, and a third table with one row per individual within each of those families. A complex sample designed to generalize to the civilian non-institutional population of the US. Released annually since 1998, linkable to the Basic Monthly releases. Administered jointly by the US Census Bureau and the Bureau of Labor Statistics. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed labor force categorizations ✔️ Transparent methodological changes ❌ Retirement and investment income undercount ❌ Informal worker undercount Three Example Findings: In 2024, 57% of 18 to 24 year olds and 16% of 25 to 34 year olds lived in their parental home. The ratio of working poor to all individuals in the labor force for at least 27 weeks was 4% in 2022. Between 2022 and 2023, the share of children without health coverage rose from 5.4% to 5.8%. 
Two Methodology Documents: Current Population Survey 2024 Annual Social and Economic (ASEC) Supplement Wikipedia Entry One Haiku: # jobs robbed by robot # luddite rebellion looming # blue, due to red pill Download, Import, Preparation Download and unzip the 2024 file: library(httr) tf <- tempfile() this_url <- "https://www2.census.gov/programs-surveys/cps/datasets/2024/march/asecpub24sas.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import all four files: library(haven) four_tbl <- lapply( unzipped_files , read_sas ) four_df <- lapply( four_tbl , data.frame ) four_df <- lapply( four_df , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) household_df <- four_df[[ grep( 'hhpub' , basename( unzipped_files ) ) ]] family_df <- four_df[[ grep( 'ffpub' , basename( unzipped_files ) ) ]] person_df <- four_df[[ grep( 'pppub' , basename( unzipped_files ) ) ]] repwgts_df <- four_df[[ grep( 'repwgt' , basename( unzipped_files ) ) ]] Divide weights: household_df[ , 'hsup_wgt' ] <- household_df[ , 'hsup_wgt' ] / 100 family_df[ , 'fsup_wgt' ] <- family_df[ , 'fsup_wgt' ] / 100 for ( j in c( 'marsupwt' , 'a_ernlwt' , 'a_fnlwgt' ) ) person_df[ , j ] <- person_df[ , j ] / 100 Merge these four files: names( family_df )[ names( family_df ) == 'fh_seq' ] <- 'h_seq' names( person_df )[ names( person_df ) == 'ph_seq' ] <- 'h_seq' names( person_df )[ names( person_df ) == 'phf_seq' ] <- 'ffpos' hh_fm_df <- merge( household_df , family_df ) hh_fm_pr_df <- merge( hh_fm_df , person_df ) cps_df <- merge( hh_fm_pr_df , repwgts_df ) stopifnot( nrow( cps_df ) == nrow( person_df ) ) Save Locally   Save the object at any point: # cps_fn <- file.path( path.expand( "~" ) , "CPS" , "this_file.rds" ) # saveRDS( cps_df , file = cps_fn , compress = FALSE ) Load the same object: # cps_df <- readRDS( cps_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) cps_design <- svrepdesign( weights = ~ marsupwt , repweights = "pwwgt[1-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , data = cps_df , combined.weights = TRUE , mse = TRUE ) Variable Recoding Add new columns to the data set: cps_design <- update( cps_design , one = 1 , a_maritl = factor( a_maritl , labels = c( "married - civilian spouse present" , "married - AF spouse present" , "married - spouse absent" , "widowed" , "divorced" , "separated" , "never married" ) ) , state_name = factor( gestfips , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming") ) , male = as.numeric( a_sex == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by 
groups: sum( weights( cps_design , "sampling" ) != 0 ) svyby( ~ one , ~ state_name , cps_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , cps_design ) svyby( ~ one , ~ state_name , cps_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ ptotval , cps_design ) svyby( ~ ptotval , ~ state_name , cps_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ a_maritl , cps_design ) svyby( ~ a_maritl , ~ state_name , cps_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ ptotval , cps_design ) svyby( ~ ptotval , ~ state_name , cps_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ a_maritl , cps_design ) svyby( ~ a_maritl , ~ state_name , cps_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ ptotval , cps_design , 0.5 ) svyby( ~ ptotval , ~ state_name , cps_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ moop , denominator = ~ ptotval , cps_design ) Subsetting Restrict the survey design to persons aged 18-64: sub_cps_design <- subset( cps_design , a_age %in% 18:64 ) Calculate the mean (average) of this subset: svymean( ~ ptotval , sub_cps_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ ptotval , cps_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ ptotval , ~ state_name , cps_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( cps_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ ptotval , cps_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ ptotval , cps_design , deff = TRUE ) # SRS with replacement svymean( ~ ptotval , cps_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ male , cps_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( ptotval ~ male , cps_design ) Perform a chi-squared test of association for survey data: svychisq( ~ male + a_maritl , cps_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( ptotval ~ male + a_maritl , cps_design ) summary( glm_result ) Replication Example This example matches the count and share of individuals with health insurance in Table H-01: count_covered <- svytotal( ~ as.numeric( cov == 1 ) , cps_design ) stopifnot( round( coef( count_covered ) , -5 ) == 305200000 ) stopifnot( round( coef( count_covered ) - confint( count_covered , level = 0.9 )[1] , -3 ) == 704000 ) share_covered <- svymean( ~ as.numeric( cov == 1 ) , subset( cps_design , cov > 0 ) ) stopifnot( round( coef( share_covered ) , 3 ) == 0.920 ) stopifnot( round( coef( share_covered ) - confint( share_covered , level = 0.9 )[1] , 3 ) == 0.002 ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for CPS users, this code calculates the gini coefficient on complex sample survey data: library(convey) cps_design <- convey_prep( cps_design ) cps_household_design <- subset( cps_design , a_exprrp %in% 1:2 ) svygini( ~ htotval , cps_household_design ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for CPS users, this code replicates previously-presented examples: library(srvyr) cps_srvyr_design <- as_survey( cps_design ) Calculate the mean (average) of a linear variable, overall and by groups: cps_srvyr_design %>% summarize( mean = survey_mean( ptotval ) ) cps_srvyr_design %>% group_by( state_name ) %>% summarize( mean = survey_mean( ptotval ) ) "],["exame-nacional-de-desempenho-de-estudantes-enade.html", "Exame Nacional de Desempenho de Estudantes (ENADE) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Exame Nacional de Desempenho de Estudantes (ENADE) The nationwide mandatory examination of college graduates. One table with one row per individual undergraduate student in Brazil. An enumeration of undergraduate students in Brazil. Released annually since 2004. Compiled by the Instituto Nacional de Estudos e Pesquisas Educacionais Anísio Teixeira (INEP). 
Recommended Reading Two Methodology Documents: Cálculo da nota final do Exame Nacional de Desempenho dos Estudantes Wikipedia Entry One Haiku: # undergraduates # sit for standardized testing # exit interview Download, Import, Preparation Download, import, and merge two of the 2021 files: library(httr) library(archive) tf <- tempfile() this_url <- "https://download.inep.gov.br/microdados/microdados_enade_2021.zip" GET( this_url , write_disk( tf ) , progress() ) archive_extract( tf , dir = tempdir() ) read_enade_archive <- function( this_regular_expression , this_directory ){ this_filename <- grep( this_regular_expression , list.files( this_directory , recursive = TRUE , full.names = TRUE ) , value = TRUE ) this_df <- read.table( this_filename , header = TRUE , sep = ";" , na.strings = "" ) names( this_df ) <- tolower( names( this_df ) ) this_df } arq1_df <- read_enade_archive( 'arq1\\\\.txt$' , tempdir() ) arq1_df <- unique( arq1_df[ c( 'co_curso' , 'co_uf_curso' , 'co_categad' , 'co_grupo' ) ] ) arq3_df <- read_enade_archive( 'arq3\\\\.txt$' , tempdir() ) enade_df <- merge( arq3_df , arq1_df ) stopifnot( nrow( enade_df ) == nrow( arq3_df ) ) Save Locally   Save the object at any point: # enade_fn <- file.path( path.expand( "~" ) , "ENADE" , "this_file.rds" ) # saveRDS( enade_df , file = enade_fn , compress = FALSE ) Load the same object: # enade_df <- readRDS( enade_fn ) Variable Recoding Add new columns to the data set: enade_df <- transform( enade_df , # qual foi o tempo gasto por voce para concluir a prova? (how much time did you take to finish the exam?) less_than_two_hours = as.numeric( co_rs_i9 %in% c( 'A' , 'B' ) ) , administrative_category = factor( co_categad , levels = c( 1:5 , 7 ) , labels = c( '1. Pública Federal' , '2. Pública Estadual' , '3. Pública Municipal' , '4. Privada com fins lucrativos' , '5. Privada sem fins lucrativos' , '7.
Especial' ) ) , state_name = factor( co_uf_curso , levels = c( 11:17 , 21:29 , 31:33 , 35 , 41:43 , 50:53 ) , labels = c( "Rondonia" , "Acre" , "Amazonas" , "Roraima" , "Para" , "Amapa" , "Tocantins" , "Maranhao" , "Piaui" , "Ceara" , "Rio Grande do Norte" , "Paraiba" , "Pernambuco" , "Alagoas" , "Sergipe" , "Bahia" , "Minas Gerais" , "Espirito Santo" , "Rio de Janeiro" , "Sao Paulo" , "Parana" , "Santa Catarina" , "Rio Grande do Sul" , "Mato Grosso do Sul" , "Mato Grosso" , "Goias" , "Distrito Federal" ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( enade_df ) table( enade_df[ , "administrative_category" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( enade_df[ , "state_name" ] ) ) prop.table( table( enade_df[ , c( "state_name" , "administrative_category" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( enade_df[ , "nt_obj_fg" ] , 0.5 , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to students reporting that the general training section was easy or very easy: sub_enade_df <- subset( enade_df , co_rs_i1 %in% c( "A" , "B" ) ) Calculate the mean (average) of this subset: mean( sub_enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( nt_obj_fg ~ less_than_two_hours , enade_df ) Perform a chi-squared test of association: this_table <- table( enade_df[ , c( "less_than_two_hours" , "state_name" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( nt_obj_fg ~ less_than_two_hours + state_name , data = enade_df ) summary( glm_result ) Replication Example This example matches the tecnologia em gestão da tecnologia da informação test scores on PDF page 48 of the 2021 final results document: it_students <- subset( enade_df , co_grupo %in% 6409 ) results <- sapply( it_students[ c( 'nt_fg' , 'nt_ce' , 'nt_ger' ) ] , mean , na.rm = TRUE ) stopifnot( round( results[ 'nt_fg' ] , 1 ) == 30.4 ) stopifnot( round( results[ 'nt_ce' ] , 1 ) == 38.2 ) stopifnot( round( results[ 'nt_ger' ] , 1 ) == 36.3 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. 
As a starting point for ENADE users, this code replicates previously-presented examples: library(dplyr) enade_tbl <- as_tibble( enade_df ) Calculate the mean (average) of a linear variable, overall and by groups: enade_tbl %>% summarize( mean = mean( nt_obj_fg , na.rm = TRUE ) ) enade_tbl %>% group_by( administrative_category ) %>% summarize( mean = mean( nt_obj_fg , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for ENADE users, this code replicates previously-presented examples: library(data.table) enade_dt <- data.table( enade_df ) Calculate the mean (average) of a linear variable, overall and by groups: enade_dt[ , mean( nt_obj_fg , na.rm = TRUE ) ] enade_dt[ , mean( nt_obj_fg , na.rm = TRUE ) , by = administrative_category ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for ENADE users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'enade' , enade_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( nt_obj_fg ) FROM enade' ) dbGetQuery( con , 'SELECT administrative_category , AVG( nt_obj_fg ) FROM enade GROUP BY administrative_category' ) "],["exame-nacional-do-ensino-medio-enem.html", "Exame Nacional do Ensino Medio (ENEM) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Exame Nacional do Ensino Medio (ENEM) The national student aptitude test, used to assess high school completion and university admission. One table with one row per test-taking student, a second of study habit questionnaire respondents. Updated annually since 1998. Maintained by Brazil’s Instituto Nacional de Estudos e Pesquisas Educacionais Anisio Teixeira Recommended Reading Two Methodology Documents: Leia_Me_Enem included in each annual zipped file Wikipedia Entry One Haiku: # graduation stage # shake hands, toss cap, unroll scroll, # mais um exame? 
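Because the 2022 microdata contain roughly 3.5 million rows, importing only the columns an analysis actually uses can substantially reduce memory pressure. A minimal sketch with readr's col_select argument, assuming the enem_fn path built in the import step below and that these uppercase column names match the 2022 data dictionary (names are lowercased after import, as in this chapter): # hypothetical trimmed import: keep only the variables used in this chapter enem_tbl <- read_csv2( enem_fn , locale = locale( encoding = 'latin1' ) , col_select = c( NU_NOTA_MT , Q002 , Q007 , TP_DEPENDENCIA_ADM_ESC , CO_UF_ESC ) )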
Download, Import, Preparation Download and unzip the 2022 file: library(httr) library(archive) tf <- tempfile() this_url <- "https://download.inep.gov.br/microdados/microdados_enem_2022.zip" GET( this_url , write_disk( tf ) , progress() ) archive_extract( tf , dir = tempdir() ) Import the 2022 file: library(readr) enem_fns <- list.files( tempdir() , recursive = TRUE , full.names = TRUE ) enem_fn <- grep( "MICRODADOS_ENEM_([0-9][0-9][0-9][0-9])\\\\.csv$" , enem_fns , value = TRUE ) enem_tbl <- read_csv2( enem_fn , locale = locale( encoding = 'latin1' ) ) enem_df <- data.frame( enem_tbl ) names( enem_df ) <- tolower( names( enem_df ) ) Save Locally   Save the object at any point: # enem_fn <- file.path( path.expand( "~" ) , "ENEM" , "this_file.rds" ) # saveRDS( enem_df , file = enem_fn , compress = FALSE ) Load the same object: # enem_df <- readRDS( enem_fn ) Variable Recoding Add new columns to the data set: enem_df <- transform( enem_df , domestic_worker = as.numeric( q007 %in% c( 'B' , 'C' , 'D' ) ) , administrative_category = factor( tp_dependencia_adm_esc , levels = 1:4 , labels = c( 'Federal' , 'Estadual' , 'Municipal' , 'Privada' ) ) , state_name = factor( co_uf_esc , levels = c( 11:17 , 21:29 , 31:33 , 35 , 41:43 , 50:53 ) , labels = c( "Rondonia" , "Acre" , "Amazonas" , "Roraima" , "Para" , "Amapa" , "Tocantins" , "Maranhao" , "Piaui" , "Ceara" , "Rio Grande do Norte" , "Paraiba" , "Pernambuco" , "Alagoas" , "Sergipe" , "Bahia" , "Minas Gerais" , "Espirito Santo" , "Rio de Janeiro" , "Sao Paulo" , "Parana" , "Santa Catarina" , "Rio Grande do Sul" , "Mato Grosso do Sul" , "Mato Grosso" , "Goias" , "Distrito Federal" ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( enem_df ) table( enem_df[ , "administrative_category" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( enem_df[ , "state_name" ] ) ) prop.table( table( enem_df[ , c( "state_name" , "administrative_category" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( enem_df[ , "nu_nota_mt" ] , 0.5 , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to students whose mothers graduated from high school: sub_enem_df <- subset( enem_df , q002 %in% c( 'E' , 'F' , 'G' ) ) Calculate the mean (average) of this subset: mean( sub_enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( nu_nota_mt ~ domestic_worker , enem_df ) Perform a chi-squared test of association: this_table <- table( enem_df[ , c( "domestic_worker" , "state_name" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm(
nu_nota_mt ~ domestic_worker + state_name , data = enem_df ) summary( glm_result ) Replication Example This example matches the registration counts in the Sinopse ENEM 2022 Excel table: stopifnot( nrow( enem_df ) == 3476105 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for ENEM users, this code replicates previously-presented examples: library(dplyr) enem_tbl <- as_tibble( enem_df ) Calculate the mean (average) of a linear variable, overall and by groups: enem_tbl %>% summarize( mean = mean( nu_nota_mt , na.rm = TRUE ) ) enem_tbl %>% group_by( administrative_category ) %>% summarize( mean = mean( nu_nota_mt , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for ENEM users, this code replicates previously-presented examples: library(data.table) enem_dt <- data.table( enem_df ) Calculate the mean (average) of a linear variable, overall and by groups: enem_dt[ , mean( nu_nota_mt , na.rm = TRUE ) ] enem_dt[ , mean( nu_nota_mt , na.rm = TRUE ) , by = administrative_category ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for ENEM users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'enem' , enem_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( nu_nota_mt ) FROM enem' ) dbGetQuery( con , 'SELECT administrative_category , AVG( nu_nota_mt ) FROM enem GROUP BY administrative_category' ) "],["european-social-survey-ess.html", "European Social Survey (ESS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " European Social Survey (ESS) The barometer of political opinion and behavior across the continent. One table per country with one row per sampled respondent. A complex sample designed to generalize to residents aged 15 and older in participating nations. Released biennially since 2002. Headquartered at City, University of London and governed by a scientific team across Europe. Recommended Reading Four Example Strengths & Limitations: ✔️ Rotating modules allow external researchers to propose new questions ✔️ Sub-national geographies available ❌ Country-specific differences in methodology ❌ Questionnaires only translated into languages spoken by at least 5% of each country’s population Three Example Findings: Childless adults aged 65 to 74 in 2002 were not more socially excluded than those in 2018. 
Between 2002-2003 and 2016-2017, there was little change overall in the extent to which Europeans felt that their countries were made a better or worse place to live as a result of migration. The 2022 Russian invasion of Ukraine reduced authoritarian attitudes across eight nations. Two Methodology Documents: Findings from the European Social Survey Wikipedia Entry One Haiku: # pent up belief gauge # open border monarchists # survey for your thoughts Download, Import, Preparation Register at the ESS Data Portal at https://ess-search.nsd.no/. Choose ESS round 8 - 2016. Welfare attitudes, Attitudes to climate change. Download the integrated file and also the sample design (SDDF) files as SAV (SPSS) files: library(foreign) ess_int_df <- read.spss( file.path( path.expand( "~" ) , "ESS8e02_2.sav" ) , to.data.frame = TRUE , use.value.labels = FALSE ) ess_sddf_df <- read.spss( file.path( path.expand( "~" ) , "ESS8SDDFe01_1.sav" ) , to.data.frame = TRUE , use.value.labels = FALSE ) ess_df <- merge( ess_int_df , ess_sddf_df , by = c( 'cntry' , 'idno' ) ) stopifnot( nrow( ess_df ) == nrow( ess_int_df ) ) Save Locally   Save the object at any point: # ess_fn <- file.path( path.expand( "~" ) , "ESS" , "this_file.rds" ) # saveRDS( ess_df , file = ess_fn , compress = FALSE ) Load the same object: # ess_df <- readRDS( ess_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) ess_df[ , 'anweight' ] <- ess_df[ , 'pspwght' ] * ess_df[ , 'pweight' ] * 10000 ess_design <- svydesign( ids = ~psu , strata = ~stratum , weights = ~anweight , data = ess_df , nest = TRUE ) Variable Recoding Add new columns to the data set: ess_design <- update( ess_design , one = 1 , gndr = factor( gndr , labels = c( 'male' , 'female' ) ) , netusoft = factor( netusoft , levels = 1:5 , labels = c( 'Never' , 'Only occasionally' , 'A few times a week' , 'Most days' , 'Every day' ) ) , belonging_to_particular_religion = as.numeric( rlgblg == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( ess_design , "sampling" ) != 0 ) svyby( ~ one , ~ cntry , ess_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , ess_design ) svyby( ~ one , ~ cntry , ess_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ ppltrst , ess_design , na.rm = TRUE ) svyby( ~ ppltrst , ~ cntry , ess_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ gndr , ess_design , na.rm = TRUE ) svyby( ~ gndr , ~ cntry , ess_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ ppltrst , ess_design , na.rm = TRUE ) svyby( ~ ppltrst , ~ cntry , ess_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ gndr , ess_design , na.rm = TRUE ) svyby( ~ gndr , ~ cntry , ess_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ ppltrst , ess_design , 0.5 , na.rm = TRUE ) svyby( ~ ppltrst , ~ cntry , ess_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ ppltrst , denominator = ~ pplfair , ess_design , na.rm = TRUE ) Subsetting 
Restrict the survey design to voters: sub_ess_design <- subset( ess_design , vote == 1 ) Calculate the mean (average) of this subset: svymean( ~ ppltrst , sub_ess_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ ppltrst , ess_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ ppltrst , ~ cntry , ess_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ess_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ ppltrst , ess_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ ppltrst , ess_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ ppltrst , ess_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ belonging_to_particular_religion , ess_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( ppltrst ~ belonging_to_particular_religion , ess_design ) Perform a chi-squared test of association for survey data: svychisq( ~ belonging_to_particular_religion + gndr , ess_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( ppltrst ~ belonging_to_particular_religion + gndr , ess_design ) summary( glm_result ) Replication Example This example matches statistics and confidence intervals within 0.1% from the Guide to Using Weights and Sample Design Indicators with ESS Data: published_proportions <- c( 0.166 , 0.055 , 0.085 , 0.115 , 0.578 ) published_lb <- c( 0.146 , 0.045 , 0.072 , 0.099 , 0.550 ) published_ub <- c( 0.188 , 0.068 , 0.100 , 0.134 , 0.605 ) austrians <- subset( ess_design , cntry == 'AT' ) ( results <- svymean( ~ netusoft , austrians , na.rm = TRUE ) ) stopifnot( all( round( coef( results ) , 3 ) == published_proportions ) ) ( ci_results <- confint( results ) ) stopifnot( all( abs( ci_results[ , 1 ] - published_lb ) < 0.0015 ) ) stopifnot( all( abs( ci_results[ , 2 ] - published_ub ) < 0.0015 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for ESS users, this code replicates previously-presented examples: library(srvyr) ess_srvyr_design <- as_survey( ess_design ) Calculate the mean (average) of a linear variable, overall and by groups: ess_srvyr_design %>% summarize( mean = survey_mean( ppltrst , na.rm = TRUE ) ) ess_srvyr_design %>% group_by( cntry ) %>% summarize( mean = survey_mean( ppltrst , na.rm = TRUE ) ) "],["fda-adverse-event-reporting-system-faers.html", "FDA Adverse Event Reporting System (FAERS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " FDA Adverse Event Reporting System (FAERS) The post-marketing safety surveillance program for drug and therapeutic biological products. Multiple tables linked by primaryid including demographics, outcomes, drug start and end dates. Voluntary reports from practitioners and patients, not representative, no verification of causality. Published quarterly since 2004, file structure revisions at 2012Q4 and 2014Q3. Maintained by the United States Food and Drug Administration (FDA). Recommended Reading Two Methodology Documents: ASC_NTS.DOC included in each quarterly zipped file, especially the Entity Relationship Diagram Questions and Answers on FDA’s Adverse Event Reporting System (FAERS) One Haiku: # side effect guestbook # violet you're turning violet # vi'lent dose response Function Definitions Define a function to import each text file: read_faers <- function( this_fn ){ read.table( this_fn , sep = "$" , header = TRUE , comment.char = "" , quote = "" ) } Download, Import, Preparation Download the quarterly file: library(httr) tf <- tempfile() this_url <- "https://fis.fda.gov/content/Exports/faers_ascii_2023q1.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import multiple tables from the downloaded quarter of microdata: # one record per report faers_demo_df <- read_faers( grep( 'DEMO23Q1\\\\.txt$' , unzipped_files , value = TRUE ) ) # one or more record per report faers_drug_df <- read_faers( grep( 'DRUG23Q1\\\\.txt$' , unzipped_files , value = TRUE ) ) # zero or more records per report faers_outcome_df <- read_faers( grep( 'OUTC23Q1\\\\.txt$' , unzipped_files , value = TRUE ) ) Construct an analysis file limited to reported deaths: # limit the outcome file to deaths faers_deaths_df <- subset( faers_outcome_df , outc_cod == 'DE' ) # merge demographics with each reported death faers_df <- merge( faers_demo_df , faers_deaths_df ) # confirm that the analysis file matches the number of death outcomes stopifnot( nrow( faers_deaths_df ) == nrow( faers_df ) ) # confirm zero reports include multiple deaths from the same reported adverse event stopifnot( nrow( faers_df ) == length( unique( faers_df[ , 'primaryid' ] ) ) ) Save Locally   Save the object at any point: # faers_fn <- file.path( path.expand( "~" ) , "FAERS" , "this_file.rds" ) # saveRDS( faers_df , file = faers_fn , compress = FALSE ) Load the same object: # faers_df <- readRDS( faers_fn ) Variable Recoding Add new columns to the data set: faers_df <- transform( faers_df , physician_reported = as.numeric( occp_cod == "MD" ) , reporter_country_categories = ifelse( reporter_country == 'US' , 'USA' , ifelse( reporter_country == 'COUNTRY NOT SPECIFIED' , 'missing' , ifelse( reporter_country == 'JP' , 'Japan' , ifelse( reporter_country == 'UK' , 'UK' , ifelse( reporter_country == 
'CA' , 'Canada' , ifelse( reporter_country == 'FR' , 'France' , 'Other' ) ) ) ) ) ) , init_fda_year = as.numeric( substr( init_fda_dt , 1 , 4 ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( faers_df ) table( faers_df[ , "reporter_country_categories" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( faers_df[ , "init_fda_year" ] , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( faers_df[ , "sex" ] ) ) prop.table( table( faers_df[ , c( "sex" , "reporter_country_categories" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( faers_df[ , "init_fda_year" ] , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( faers_df[ , "init_fda_year" ] , 0.5 , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to elderly persons: sub_faers_df <- subset( faers_df , age_grp == "E" ) Calculate the mean (average) of this subset: mean( sub_faers_df[ , "init_fda_year" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( faers_df[ , "init_fda_year" ] , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( init_fda_year ~ physician_reported , faers_df ) Perform a chi-squared test of association: this_table <- table( faers_df[ , c( "physician_reported" , "sex" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( init_fda_year ~ physician_reported + sex , data = faers_df ) summary( glm_result ) Replication Example This example matches the death frequency counts in the OUTC23Q1.pdf file in the downloaded quarter: stopifnot( nrow( faers_df ) == 37704 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for FAERS users, this code replicates previously-presented examples: library(dplyr) faers_tbl <- as_tibble( faers_df ) Calculate the mean (average) of a linear variable, overall and by groups: faers_tbl %>% summarize( mean = mean( init_fda_year , na.rm = TRUE ) ) faers_tbl %>% group_by( reporter_country_categories ) %>% summarize( mean = mean( init_fda_year , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. 
As a starting point for FAERS users, this code replicates previously-presented examples: library(data.table) faers_dt <- data.table( faers_df ) Calculate the mean (average) of a linear variable, overall and by groups: faers_dt[ , mean( init_fda_year , na.rm = TRUE ) ] faers_dt[ , mean( init_fda_year , na.rm = TRUE ) , by = reporter_country_categories ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for FAERS users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'faers' , faers_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( init_fda_year ) FROM faers' ) dbGetQuery( con , 'SELECT reporter_country_categories , AVG( init_fda_year ) FROM faers GROUP BY reporter_country_categories' ) "],["general-social-survey-gss.html", "General Social Survey (GSS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " General Social Survey (GSS) A historical record of the concerns, experiences, attitudes, and practices of residents of the United States. Both cross-sectional and panel tables with one row per sampled respondent. A complex sample survey generalizing to non-institutionalized adults (18+) in the United States. Updated biennially since 1972. Funded by National Science Foundation, administered by the National Opinion Research Center. Recommended Reading Four Example Strengths & Limitations: ✔️ Fifty years of comparable measures for trend analyses ✔️ Fifteen minutes of questionnaire aligns with International Social Survey Programme ❌ One adult per household interviewed, living in larger households lowers probability of selection ❌ In 2022, 21% of mixed-mode interviews were aged 65+ versus 10% of the web-only oversample Three Example Findings: Between 2000 and 2021, confidence in the scientific community has remained steady. During 2018 to 2021, support for gun control was overwhelmingly positive among survey respondents, differing from sentiment analysis of social media data over the same period. In 2021, 24% of Americans reported they were “not too happy” in life, up from 13% in 2018. Two Methodology Documents: DOCUMENTATION AND PUBLIC USE FILE CODEBOOK (Release 1) Wikipedia Entry One Haiku: # chat about who will # be allowed marriage, children. 
# first date questionnaire Download, Import, Preparation Download and import the 1972-2022 cumulative data file: library(haven) zip_tf <- tempfile() zip_url <- "https://gss.norc.org/Documents/sas/GSS_sas.zip" download.file( zip_url , zip_tf , mode = 'wb' ) unzipped_files <- unzip( zip_tf , exdir = tempdir() ) gss_tbl <- read_sas( grep( '\\\\.sas7bdat$' , unzipped_files , value = TRUE ) ) gss_df <- data.frame( gss_tbl ) names( gss_df ) <- tolower( names( gss_df ) ) gss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # gss_fn <- file.path( path.expand( "~" ) , "GSS" , "this_file.rds" ) # saveRDS( gss_df , file = gss_fn , compress = FALSE ) Load the same object: # gss_df <- readRDS( gss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) gss_design <- svydesign( ~ vpsu , strata = ~ interaction( year , vstrat ) , data = subset( gss_df , year >= 1975 & !is.na( wtssnrps ) ) , weights = ~ wtssnrps , nest = TRUE ) Variable Recoding Add new columns to the data set: gss_design <- update( gss_design , polviews = factor( polviews , levels = 1:7 , labels = c( "Extremely liberal" , "Liberal" , "Slightly liberal" , "Moderate, middle of the road" , "Slightly conservative" , "Conservative" , "Extremely conservative" ) ) , born_in_usa = as.numeric( born == 1 ) , race = factor( race , levels = 1:3 , labels = c( "white" , "black" , "other" ) ) , region = factor( region , levels = 1:9 , labels = c( "New England" , "Middle Atlantic" , "East North Central" , "West North Central" , "South Atlantic" , "East South Central" , "West South Central" , "Mountain" , "Pacific" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( gss_design , "sampling" ) != 0 ) svyby( ~ one , ~ region , gss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , gss_design ) svyby( ~ one , ~ region , gss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ age , gss_design , na.rm = TRUE ) svyby( ~ age , ~ region , gss_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ race , gss_design , na.rm = TRUE ) svyby( ~ race , ~ region , gss_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ age , gss_design , na.rm = TRUE ) svyby( ~ age , ~ region , gss_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ race , gss_design , na.rm = TRUE ) svyby( ~ race , ~ region , gss_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ age , gss_design , 0.5 , na.rm = TRUE ) svyby( ~ age , ~ region , gss_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ adults , denominator = ~ hompop , gss_design , na.rm = TRUE ) Subsetting Restrict the survey design to females: sub_gss_design <- subset( gss_design , sex == 2 ) Calculate the mean (average) of this subset: svymean( ~ age , sub_gss_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by 
groups: this_result <- svymean( ~ age , gss_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ age , ~ region , gss_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( gss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ age , gss_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ age , gss_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ age , gss_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ born_in_usa , gss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( age ~ born_in_usa , gss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ born_in_usa + race , gss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( age ~ born_in_usa + race , gss_design ) summary( glm_result ) Replication Example Match the unweighted record count totals on PDF page 74 of the Public Use File codebook: stopifnot( nrow( subset( gss_design , year == 2021 ) ) == 4032 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for GSS users, this code replicates previously-presented examples: library(srvyr) gss_srvyr_design <- as_survey( gss_design ) Calculate the mean (average) of a linear variable, overall and by groups: gss_srvyr_design %>% summarize( mean = survey_mean( age , na.rm = TRUE ) ) gss_srvyr_design %>% group_by( region ) %>% summarize( mean = survey_mean( age , na.rm = TRUE ) ) "],["health-and-retirement-study-hrs.html", "Health and Retirement Study (HRS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Health and Retirement Study (HRS) This detailed longitudinal study of the elderly in the United States allows for findings such as, “Among community residents aged 55-64 years old in 1998, what share lived in nursing homes by 2020?” Many tables from different timepoints, most with one row per sampled respondent and linkable. A complex sample survey designed to generalize to Americans aged 50+ at each interview point. Released biennially since 1992. Administered by the University of Michigan’s Institute for Social Research with data management by the RAND Corporation and cross-national harmonization by the University of Southern California. Funded by the National Institute on Aging and the Social Security Administration. 
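The opening question above maps directly onto the tools in this chapter: define a design restricted to the earlier wave's community residents, then take a weighted mean of the later wave's nursing home status (a simplification of "by 2020," since it measures residence at the 2020 interview only). A rough sketch, assuming the hrs_df object built below and several RAND HRS variable names that should be verified against the RAND codebook before use (r4agey_b for age at the 1998 interview, r4nhmliv and r15nhmliv for nursing home residence, r4wtresp for the wave 4 respondent weight): # hypothetical wave-4-to-wave-15 design; variable names assumed, not verified hrs_9820_design <- svydesign( id = ~ raehsamp , strata = ~ raestrat , weights = ~ r4wtresp , nest = TRUE , data = subset( hrs_df , r4wtresp > 0 & inw15 == 1 & r4agey_b %in% 55:64 & r4nhmliv == 0 ) ) svymean( ~ r15nhmliv , hrs_9820_design , na.rm = TRUE )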
Recommended Reading Four Example Strengths & Limitations: ✔️ Multiple cohorts allow extended tracking of older individuals across time ✔️ Linkable to Medicare, Medicaid, SSA records, also to genetic and biomarker data ❌ Sample size may prevent analysis of smaller populations or rare events ❌ Attritors may differ in many ways from the general population Three Example Findings: Among individuals aged 50+ between 2003 and 2016, 80% of those who experienced a long-term care hospital stay subsequently died or suffered a severe impairment within 30 months. Wealth loss of 75%+ was negatively associated with subsequent cognitive function over 2012-2016. The total monetary cost of dementia in 2010 was between $157 billion and $215 billion. Two Methodology Documents: Getting Started with the Health and Retirement Study RAND HRS Longitudinal File 2020 (V1) Documentation One Haiku: # sankey diagram # comes alive at fifty five # till death? you respond Download, Import, Preparation Register at the HRS Data Portal at https://hrsdata.isr.umich.edu/user/register. Choose RAND HRS Longitudinal File 2020 Latest release: Mar 2023 (V1). Download the STATA dataset randhrs1992_2020v1_STATA.zip dated 04/05/2023: library(haven) hrs_fn <- file.path( path.expand( "~" ) , "randhrs1992_2020v1.dta" ) hrs_tbl <- read_dta( hrs_fn ) hrs_df <- data.frame( hrs_tbl ) names( hrs_df ) <- tolower( names( hrs_df ) ) Save Locally   Save the object at any point: # hrs_fn <- file.path( path.expand( "~" ) , "HRS" , "this_file.rds" ) # saveRDS( hrs_df , file = hrs_fn , compress = FALSE ) Load the same object: # hrs_df <- readRDS( hrs_fn ) Survey Design Definition Construct a complex sample survey design: This design generalizes to residents of the United States that were living in the community in 1996 (wave 3) and also still alive (and participating in the survey) as of 2020 (wave 15): library(survey) hrs_design <- svydesign( id = ~ raehsamp , strata = ~ raestrat , weights = ~ r3wtresp , nest = TRUE , data = subset( hrs_df , r3wtresp > 0 & inw15 == 1 ) ) Variable Recoding Add new columns to the data set: hrs_design <- update( hrs_design , one = 1 , working_in_1996 = r3work , working_in_2020 = r15work , marital_stat_1996 = factor( r3mstat , levels = 1:8 , labels = c( "Married" , "Married, spouse absent" , "Partnered" , "Separated" , "Divorced" , "Separated/divorced" , "Widowed" , "Never married" ) ) , marital_stat_2020 = factor( r15mstat , levels = 1:8 , labels = c( "Married" , "Married, spouse absent" , "Partnered" , "Separated" , "Divorced" , "Separated/divorced" , "Widowed" , "Never married" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( hrs_design , "sampling" ) != 0 ) svyby( ~ one , ~ marital_stat_1996 , hrs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , hrs_design ) svyby( ~ one , ~ marital_stat_1996 , hrs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ h15ahous , hrs_design , na.rm = TRUE ) svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ marital_stat_2020 , hrs_design , na.rm = TRUE ) svyby( ~ marital_stat_2020 , ~ marital_stat_1996 , hrs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by 
groups: svytotal( ~ h15ahous , hrs_design , na.rm = TRUE ) svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ marital_stat_2020 , hrs_design , na.rm = TRUE ) svyby( ~ marital_stat_2020 , ~ marital_stat_1996 , hrs_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ h15ahous , hrs_design , 0.5 , na.rm = TRUE ) svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ h4ahous , denominator = ~ h15ahous , hrs_design , na.rm = TRUE ) Subsetting Restrict the survey design to respondents working in 1996: sub_hrs_design <- subset( hrs_design , working_in_1996 == 1 ) Calculate the mean (average) of this subset: svymean( ~ h15ahous , sub_hrs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ h15ahous , hrs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( hrs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ h15ahous , hrs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ h15ahous , hrs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ h15ahous , hrs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ working_in_2020 , hrs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( h15ahous ~ working_in_2020 , hrs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ working_in_2020 + marital_stat_2020 , hrs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( h15ahous ~ working_in_2020 + marital_stat_2020 , hrs_design ) summary( glm_result ) Replication Example This example matches statistics and confidence intervals to four digits from the Gateway to Global Aging’s An Introduction to HRS, RAND HRS Longitudinal File, and Harmonized HRS: Navigate to Contributed Projects at https://hrsdata.isr.umich.edu/data-products/contributed-projects.
Choose Gateway Harmonized HRS Latest release: Aug 2023 Version D Download the STATA dataset H_HRS_d_stata.zip dated 09/12/2023 harmonized_hrs_fn <- file.path( path.expand( "~" ) , "H_HRS_d.dta" ) harmonized_hrs_tbl <- read_dta( harmonized_hrs_fn ) harmonized_hrs_df <- data.frame( harmonized_hrs_tbl ) names( harmonized_hrs_df ) <- tolower( names( harmonized_hrs_df ) ) Merge on cluster and strata variables from the RAND HRS Longitudinal file: harmonized_hrs_rand_df <- merge( harmonized_hrs_df , hrs_df[ c( 'hhid' , 'pn' , 'raestrat' , 'raehsamp' ) ] , by = c( 'hhid' , 'pn' ) ) stopifnot( nrow( harmonized_hrs_rand_df ) == nrow( hrs_df ) ) Limit the survey design to respondents answering at least two of the five different life satisfaction questions in the 2014 (wave 12) psychosocial leave-behind survey: h12sc_df <- subset( harmonized_hrs_rand_df , r12scwtresp > 0 & inw12sc == 1 ) r12sc_design <- svydesign( ~ raehsamp , strata = ~ raestrat , data = h12sc_df , weights = ~ r12scwtresp , nest = TRUE ) Reproduce the coefficient, standard error, and confidence intervals presented at 53:20 of the tutorial: result <- svymean( ~ r12lsatsc , r12sc_design , na.rm = TRUE ) stopifnot( round( coef( result ) , 4 ) == 4.9822 ) stopifnot( round( SE( result ) , 4 ) == 0.0226 ) stopifnot( round( confint( result , df = degf( r12sc_design ) ) , 4 ) == c( 4.9369 , 5.0276 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for HRS users, this code replicates previously-presented examples: library(srvyr) hrs_srvyr_design <- as_survey( hrs_design ) Calculate the mean (average) of a linear variable, overall and by groups: hrs_srvyr_design %>% summarize( mean = survey_mean( h15ahous , na.rm = TRUE ) ) hrs_srvyr_design %>% group_by( marital_stat_1996 ) %>% summarize( mean = survey_mean( h15ahous , na.rm = TRUE ) ) "],["medicare-current-beneficiary-survey-mcbs.html", "Medicare Current Beneficiary Survey (MCBS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Medicare Current Beneficiary Survey (MCBS) The monitoring system for Medicare enrollees in the United States on topics not available in the program’s administrative data, such as out of pocket expenditure and beneficiary satisfaction. Survey and supplemental tables with one row per sampled individual, although downloadable datasets not linkable. A complex sample survey designed to generalize to all elderly and disabled individuals with at least one month of program enrollment during the calendar year. Released annually as a public use file since 2015. Conducted by the Office of Enterprise Data and Analytics (OEDA) of the Centers for Medicare & Medicaid Services (CMS) through a contract with NORC at the University of Chicago. 
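The public use file ships with replicate weights rather than cluster and stratum identifiers, and the Survey Design Definition below relies on Fay's method with rho = 0.3. To make that machinery less of a black box, here is a sketch computing one standard error by hand, assuming the mcbs_df and mcbs_design objects constructed later in this chapter and that pamtoop contains no missing values: # illustration only: Fay's BRR variance of the weighted mean of pamtoop rep_cols <- grep( 'cspuf[0-9]+' , names( mcbs_df ) , value = TRUE ) full_est <- weighted.mean( mcbs_df[ , 'pamtoop' ] , mcbs_df[ , 'cspufwgt' ] ) rep_ests <- sapply( rep_cols , function( w ) weighted.mean( mcbs_df[ , 'pamtoop' ] , mcbs_df[ , w ] ) ) fay_se <- sqrt( sum( ( rep_ests - full_est )^2 ) / ( length( rep_cols ) * ( 1 - 0.3 )^2 ) ) # fay_se should closely match SE( svymean( ~ pamtoop , mcbs_design ) ) because mse = TRUE centers on the full-sample estimate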
Recommended Reading Four Example Strengths & Limitations: ✔️ Respondents interviewed up to three times annually across four years ✔️ More than 1.2 million interviews since inception ❌ Some respondents designate a proxy to interview on their behalf ❌ Facility residents included, but not in public use file Three Example Findings: Among all Medicare beneficiaries, 7 percent reported having problems paying a medical bill in 2021. Between 1999 and 2017, Medicare beneficiaries with diabetes faced higher out-of-pocket costs. In 2020, healthcare expenditure for non-fatal falls was $80 billion, the majority paid by Medicare. Two Methodology Documents: MCBS Methodology Report MCBS Advanced Tutorial on Weighting and Variance Estimation One Haiku: # old, or disabled # access to medical care, # utilization Download, Import, Preparation tf <- tempfile() this_url <- "https://www.cms.gov/files/zip/cspuf2021.zip" download.file( this_url , tf , mode = 'wb' ) unzipped_files <- unzip( tf , exdir = tempdir() ) mcbs_csv <- grep( '\\\\.csv$' , unzipped_files , value = TRUE ) mcbs_df <- read.csv( mcbs_csv ) names( mcbs_df ) <- tolower( names( mcbs_df ) ) Save Locally   Save the object at any point: # mcbs_fn <- file.path( path.expand( "~" ) , "MCBS" , "this_file.rds" ) # saveRDS( mcbs_df , file = mcbs_fn , compress = FALSE ) Load the same object: # mcbs_df <- readRDS( mcbs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) mcbs_design <- svrepdesign( weight = ~cspufwgt , repweights = 'cspuf[0-9]+' , mse = TRUE , type = 'Fay' , rho = 0.3 , data = mcbs_df ) Variable Recoding Add new columns to the data set: mcbs_design <- update( mcbs_design , one = 1 , csp_age = factor( csp_age , levels = 1:3 , labels = c( '01: younger than 65' , '02: 65 to 74' , '03: 75 or older' ) ) , two_or_more_chronic_conditions = as.numeric( csp_nchrncnd > 1 ) , csp_sex = factor( csp_sex , labels = c( 'male' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( mcbs_design , "sampling" ) != 0 ) svyby( ~ one , ~ csp_age , mcbs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , mcbs_design ) svyby( ~ one , ~ csp_age , mcbs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ pamtoop , mcbs_design ) svyby( ~ pamtoop , ~ csp_age , mcbs_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ csp_sex , mcbs_design ) svyby( ~ csp_sex , ~ csp_age , mcbs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ pamtoop , mcbs_design ) svyby( ~ pamtoop , ~ csp_age , mcbs_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ csp_sex , mcbs_design ) svyby( ~ csp_sex , ~ csp_age , mcbs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ pamtoop , mcbs_design , 0.5 ) svyby( ~ pamtoop , ~ csp_age , mcbs_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ pamtoop , denominator = ~ pamttot , mcbs_design ) Subsetting Restrict the survey design to household income below $25,000: sub_mcbs_design <- subset( mcbs_design , csp_income == 1 ) Calculate the mean (average) of this subset: svymean( ~ pamtoop , 
sub_mcbs_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ pamtoop , mcbs_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ pamtoop , ~ csp_age , mcbs_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( mcbs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ pamtoop , mcbs_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ pamtoop , mcbs_design , deff = TRUE ) # SRS with replacement svymean( ~ pamtoop , mcbs_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ two_or_more_chronic_conditions , mcbs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( pamtoop ~ two_or_more_chronic_conditions , mcbs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ two_or_more_chronic_conditions + csp_sex , mcbs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( pamtoop ~ two_or_more_chronic_conditions + csp_sex , mcbs_design ) summary( glm_result ) Replication Example This example matches the weighted total from the 2021 Data User’s Guide: Cost Supplement File Public Use File: stopifnot( round( coef( svytotal( ~ one , mcbs_design ) ) , 0 ) == 59040948 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for MCBS users, this code replicates previously-presented examples: library(srvyr) mcbs_srvyr_design <- as_survey( mcbs_design ) Calculate the mean (average) of a linear variable, overall and by groups: mcbs_srvyr_design %>% summarize( mean = survey_mean( pamtoop ) ) mcbs_srvyr_design %>% group_by( csp_age ) %>% summarize( mean = survey_mean( pamtoop ) ) "],["medical-expenditure-panel-survey-meps.html", "Medical Expenditure Panel Survey (MEPS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Medical Expenditure Panel Survey (MEPS) The Household Component captures person-level spending across service categories, coverage types. The consolidated file contains one row per individual within each sampled household, other tables contain one record per event (like prescription fills, hospitalizations), per job, per insurance policy. A complex sample survey designed to generalize to the U.S. civilian non-institutionalized population. Released annually since 1996. Administered by the Agency for Healthcare Research and Quality. 
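Because the event-level tables hold one record per prescription fill or hospital stay while the consolidated file holds one row per person, event files generally need to be aggregated to the person level before merging. A hypothetical sketch of that pattern; the prescribed medicines object meps_rx_df and its rxxp22x expenditure column are assumptions here, not objects built in this chapter, so check the MEPS documentation for the actual file and variable names: # meps_rx_df stands in for an event-level file imported with meps_sas_import() rx_per_person <- aggregate( rxxp22x ~ dupersid , data = meps_rx_df , FUN = sum ) meps_merged_df <- merge( meps_cons_df , rx_per_person , by = 'dupersid' , all.x = TRUE ) # persons with no fills receive NA; recode to zero if treating them as $0 spenders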
Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed information about individual medical events ✔️ Detailed sources of health expenditures ❌ Methodological changes could make a notable impact on trend analyses for conditions ❌ Household-reported medical events may be undercounted Three Example Findings: In 2021, the top 1 percent of the population ranked by their healthcare expenditures accounted for 24.0 percent of total healthcare expenditures among the U.S. civilian noninstitutionalized population. Compared to those losing a job during the three prior years, nonelderly adults who lost a job during 2014 thru 2016 had a 6% net reduction in loss of health insurance coverage. Americans lose ~$1,500 per year (in 2013 USD) over their life-course due to bad health realizations. Two Methodology Documents: MEPS HC-224 2020 Full Year Consolidated Data File Wikipedia Entry One Haiku: # king dumpty's horsemen # ahrq stitches payors, bills, claims # fractured health system Function Definitions Define a function to download, unzip, and import each sas file: library(haven) meps_sas_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) this_tbl <- read_sas( this_tf ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the 2022 consolidated file and the replicate weights file: meps_cons_df <- meps_sas_import( "https://meps.ahrq.gov/mepsweb/data_files/pufs/h243/h243v9.zip" ) meps_brr_df <- meps_sas_import( "https://meps.ahrq.gov/mepsweb/data_files/pufs/h036brr/h36brr22v9.zip" ) Merge the consolidated file with the replicate weights: meps_df <- merge( meps_cons_df , meps_brr_df ) stopifnot( nrow( meps_df ) == nrow( meps_cons_df ) ) meps_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # meps_fn <- file.path( path.expand( "~" ) , "MEPS" , "this_file.rds" ) # saveRDS( meps_df , file = meps_fn , compress = FALSE ) Load the same object: # meps_df <- readRDS( meps_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) meps_design <- svrepdesign( data = meps_df , weights = ~ perwt22f , type = "BRR" , combined.weights = FALSE , repweights = "brr[1-9]+" , mse = TRUE ) Variable Recoding Add new columns to the data set: meps_design <- update( meps_design , one = 1 , insured_december_31st = ifelse( ins22x %in% 1:2 , as.numeric( ins22x == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( meps_design , "sampling" ) != 0 ) svyby( ~ one , ~ region22 , meps_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , meps_design ) svyby( ~ one , ~ region22 , meps_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ totexp22 , meps_design ) svyby( ~ totexp22 , ~ region22 , meps_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ sex , meps_design ) svyby( ~ sex , ~ region22 , meps_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ totexp22 , meps_design ) svyby( ~ totexp22 , ~ region22 , meps_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ sex , meps_design ) svyby( ~ sex , ~ region22 , 
meps_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ totexp22 , meps_design , 0.5 ) svyby( ~ totexp22 , ~ region22 , meps_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ totmcd22 , denominator = ~ totexp22 , meps_design ) Subsetting Restrict the survey design to seniors: sub_meps_design <- subset( meps_design , agelast >= 65 ) Calculate the mean (average) of this subset: svymean( ~ totexp22 , sub_meps_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ totexp22 , meps_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ totexp22 , ~ region22 , meps_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( meps_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ totexp22 , meps_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ totexp22 , meps_design , deff = TRUE ) # SRS with replacement svymean( ~ totexp22 , meps_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ insured_december_31st , meps_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( totexp22 ~ insured_december_31st , meps_design ) Perform a chi-squared test of association for survey data: svychisq( ~ insured_december_31st + sex , meps_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( totexp22 ~ insured_december_31st + sex , meps_design ) summary( glm_result ) Replication Example This example matches the statistic and standard error shown under Analysis of the Total Population: library(foreign) xport_2002_tf <- tempfile() xport_2002_url <- "https://meps.ahrq.gov/data_files/pufs/h70ssp.zip" download.file( xport_2002_url , xport_2002_tf , mode = 'wb' ) unzipped_2002_xport <- unzip( xport_2002_tf , exdir = tempdir() ) meps_2002_df <- read.xport( unzipped_2002_xport ) names( meps_2002_df ) <- tolower( names( meps_2002_df ) ) meps_2002_design <- svydesign( ~ varpsu , strata = ~ varstr , weights = ~ perwt02f , data = meps_2002_df , nest = TRUE ) result <- svymean( ~ totexp02 , meps_2002_design ) stopifnot( round( coef( result ) , 2 ) == 2813.24 ) stopifnot( round( SE( result ) , 2 ) == 58.99 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for MEPS users, this code replicates previously-presented examples: library(srvyr) meps_srvyr_design <- as_survey( meps_design ) Calculate the mean (average) of a linear variable, overall and by groups: meps_srvyr_design %>% summarize( mean = survey_mean( totexp22 ) ) meps_srvyr_design %>% group_by( region22 ) %>% summarize( mean = survey_mean( totexp22 ) ) "],["medical-large-claims-experience-study-mlces.html", "Medical Large Claims Experience Study (MLCES) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Medical Large Claims Experience Study (MLCES) A high quality dataset of medical claims from seven private health insurance companies. One table with one row per individual with nonzero total paid charges. A convenience sample of group (employer-sponsored) health insurers in the United States. 1997 thru 1999 with no expected updates in the future. Provided by the Society of Actuaries (SOA). Recommended Reading Two Methodology Documents: Group Medical Insurance Claims Database Collection and Analysis Report Claim Severities, Claim Relativities, and Age: Evidence from SOA Group Health Data One Haiku: # skewed by black swan tails # means, medians sing adieu # claims distribution Download, Import, Preparation Download and import the 1999 medical claims file: tf <- tempfile() this_url <- "https://www.soa.org/Files/Research/1999.zip" download.file( this_url , tf , mode = 'wb' ) unzipped_file <- unzip( tf , exdir = tempdir() ) mlces_df <- read.csv( unzipped_file ) names( mlces_df ) <- tolower( names( mlces_df ) ) Save Locally   Save the object at any point: # mlces_fn <- file.path( path.expand( "~" ) , "MLCES" , "this_file.rds" ) # saveRDS( mlces_df , file = mlces_fn , compress = FALSE ) Load the same object: # mlces_df <- readRDS( mlces_fn ) Variable Recoding Add new columns to the data set: mlces_df <- transform( mlces_df , one = 1 , claimant_relationship_to_policyholder = ifelse( relation == "E" , "covered employee" , ifelse( relation == "S" , "spouse of covered employee" , ifelse( relation == "D" , "dependent of covered employee" , NA ) ) ) , ppo_plan = as.numeric( ppo == 'Y' ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( mlces_df ) table( mlces_df[ , "claimant_relationship_to_policyholder" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( mlces_df[ , "totpdchg" ] ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , mean ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( mlces_df[ , "patsex" ] ) ) prop.table( table( mlces_df[ , c( "patsex" , "claimant_relationship_to_policyholder" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( mlces_df[ , "totpdchg" ] ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , sum ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( mlces_df[ , "totpdchg" ] , 0.5 ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , quantile , 0.5 ) Subsetting Limit your data.frame to persons under 18: sub_mlces_df <- subset( mlces_df , ( ( claimyr - patbrtyr ) < 18 ) ) Calculate the mean (average) of 
this subset: mean( sub_mlces_df[ , "totpdchg" ] ) Measures of Uncertainty Calculate the variance, overall and by groups: var( mlces_df[ , "totpdchg" ] ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , var ) Regression Models and Tests of Association Perform a t-test: t.test( totpdchg ~ ppo_plan , mlces_df ) Perform a chi-squared test of association: this_table <- table( mlces_df[ , c( "ppo_plan" , "patsex" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( totpdchg ~ ppo_plan + patsex , data = mlces_df ) summary( glm_result ) Replication Example This example matches statistics in Table II-A’s 1999 row numbers 52 and 53 from the Database: Match Claimants Exceeding Deductible: # $0 deductible stopifnot( nrow( mlces_df ) == 1591738 ) # $1,000 deductible mlces_above_1000_df <- subset( mlces_df , totpdchg > 1000 ) stopifnot( nrow( mlces_above_1000_df ) == 402550 ) Match the Excess Charges Above Deductible: # $0 deductible stopifnot( round( sum( mlces_df[ , 'totpdchg' ] ) , 0 ) == 2599356658 ) # $1,000 deductible stopifnot( round( sum( mlces_above_1000_df[ , 'totpdchg' ] - 1000 ) , 0 ) == 1883768786 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for MLCES users, this code replicates previously-presented examples: library(dplyr) mlces_tbl <- as_tibble( mlces_df ) Calculate the mean (average) of a linear variable, overall and by groups: mlces_tbl %>% summarize( mean = mean( totpdchg ) ) mlces_tbl %>% group_by( claimant_relationship_to_policyholder ) %>% summarize( mean = mean( totpdchg ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for MLCES users, this code replicates previously-presented examples: library(data.table) mlces_dt <- data.table( mlces_df ) Calculate the mean (average) of a linear variable, overall and by groups: mlces_dt[ , mean( totpdchg ) ] mlces_dt[ , mean( totpdchg ) , by = claimant_relationship_to_policyholder ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. 
As a starting point for MLCES users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'mlces' , mlces_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( totpdchg ) FROM mlces' ) dbGetQuery( con , 'SELECT claimant_relationship_to_policyholder , AVG( totpdchg ) FROM mlces GROUP BY claimant_relationship_to_policyholder' ) "],["national-agricultural-workers-survey-naws.html", "National Agricultural Workers Survey (NAWS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Agricultural Workers Survey (NAWS) The primary face-to-face interview of currently-employed crop workers in the United States, with detailed questions on demographics, occupational injury, health surveillance, and seasonal and migrant labor. One cumulative table containing all interviews since 1989, with one row per sampled respondent. A complex sample designed to generalize to crop production workers employed by establishments engaged in Crop Production (NAICS 111) and Support Activities for Crop Production (NAICS 1151). Released biennially since 1989. Administered by the Employment and Training Administration, in partnership with JBS International. Recommended Reading Four Example Strengths & Limitations: ✔️ Employer-based sample increases the likelihood that migrant workers will be interviewed ✔️ Seasonal sampling in order to avoid bias ❌ Respondents not followed over time ❌ Except for California, the data are not available at the state level Three Example Findings: Across 2019-2020, 49% of US crop workers said their most recent health care visit for preventive or routine care was to a community health center or migrant health clinic. Pesticide exposure increased between 2002 and 2016 among US crop workers. Hired crop workers who responded negatively to “employer provides clean drinking water and disposable cups every day” were at greater odds of injury between 2002 and 2015. Two Methodology Documents: Findings from the National Agricultural Workers Survey (NAWS) 2021–2022: A Demographic and Employment Profile of United States Crop Workers Statistical Methods of the National Agricultural Workers Survey One Haiku: # were i king, my court: # arcimboldo's vertumnus # jester juggling self Download, Import, Preparation The public access dataset does not currently include the variables needed to get design-adjusted estimates. Previous data releases contained replicate weights; however, those have been discontinued. Although the PUF allows external researchers to match weighted shares, the UCLA Statistical Consulting Group cautions that ignoring the clustering will likely lead to standard errors that are underestimated, possibly leading to results that seem to be statistically significant when, in fact, they are not.
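To see that caution in action, here is a minimal sketch (assuming the restricted-use cluster, fpc_region, cycle, and pwtycrd columns described in the next section) contrasting the clustered design with a naive design that treats respondents as independently sampled:
library(survey)
options( survey.lonely.psu = "adjust" )
# design accounting for the sampling clusters
clustered_design <- svydesign( id = ~ cluster , strata = ~ interaction( fpc_region , cycle ) , data = naws_df , weights = ~ pwtycrd , nest = TRUE )
# naive design ignoring the clustering
independent_design <- svydesign( id = ~ 1 , data = naws_df , weights = ~ pwtycrd )
# the clustered standard error is generally the larger, more defensible one
SE( svymean( ~ waget1 , clustered_design , na.rm = TRUE ) )
SE( svymean( ~ waget1 , independent_design , na.rm = TRUE ) )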
In order for the Employment and Training Administration (ETA) to consider a request for offsite use of the restricted NAWS data file, send these items to the contact listed here for inquiries about the survey: A brief description of the research aims and how NAWS data will support the research; A statement as to why the NAWS public data file is insufficient to meet the research aims; A description of how and when the resulting findings will be disseminated; and A brief description of the analysis plan, so that NAWS staff may assess the suitability of the NAWS given the research aims and analysis plan. Upon receipt of this microdata, begin by loading the SAS file: library(haven) naws_tbl <- read_sas( file.path( path.expand( "~" ) , "nawscrtdvars2db22.sas7bdat" ) ) naws_df <- data.frame( naws_tbl ) names( naws_df ) <- tolower( names( naws_df ) ) Save Locally   Save the object at any point: # naws_fn <- file.path( path.expand( "~" ) , "NAWS" , "this_file.rds" ) # saveRDS( naws_df , file = naws_fn , compress = FALSE ) Load the same object: # naws_df <- readRDS( naws_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) naws_design <- svydesign( id = ~ cluster , strata = ~ interaction( fpc_region , cycle ) , data = naws_df , weights = ~ pwtycrd, nest = TRUE ) Variable Recoding Add new columns to the data set: naws_design <- update( naws_design , one = 1 , country_of_birth = factor( findInterval( a07 , c( 3 , 4 , 5 , 100 ) ) , levels = 0:4 , labels = c( 'us or pr' , 'mexico' , 'central america' , 'south america, carribean, asia, or other' , 'missing' ) ) , gender = factor( gender , levels = 0:1 , labels = c( 'male' , 'female' ) ) , interview_cohort = factor( findInterval( fy , seq( 1989 , 2021 , 2 ) ) , levels = seq_along( seq( 1989 , 2021 , 2 ) ) , labels = paste( seq( 1989 , 2021 , 2 ) , seq( 1990 , 2022 , 2 ) , sep = '-' ) ) , authorized_to_work = ifelse( l01 < 9 , as.numeric( l01 < 5 ) , NA ) , hours_worked_last_week_at_farm_job = d04 ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( naws_design , "sampling" ) != 0 ) svyby( ~ one , ~ interview_cohort , naws_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , naws_design ) svyby( ~ one , ~ interview_cohort , naws_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ waget1 , naws_design , na.rm = TRUE ) svyby( ~ waget1 , ~ interview_cohort , naws_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ country_of_birth , naws_design , na.rm = TRUE ) svyby( ~ country_of_birth , ~ interview_cohort , naws_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ waget1 , naws_design , na.rm = TRUE ) svyby( ~ waget1 , ~ interview_cohort , naws_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ country_of_birth , naws_design , na.rm = TRUE ) svyby( ~ country_of_birth , ~ interview_cohort , naws_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ waget1 , naws_design , 0.5 , na.rm = TRUE ) svyby( ~ waget1 , ~ interview_cohort , naws_design , 
svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ hours_worked_last_week_at_farm_job , denominator = ~ numfempl , naws_design , na.rm = TRUE ) Subsetting Restrict the survey design to California, the only standalone state with adequate sample: sub_naws_design <- subset( naws_design , region12 == 'CA' ) Calculate the mean (average) of this subset: svymean( ~ waget1 , sub_naws_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ waget1 , naws_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ waget1 , ~ interview_cohort , naws_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( naws_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ waget1 , naws_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ waget1 , naws_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ waget1 , naws_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ authorized_to_work , naws_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( waget1 ~ authorized_to_work , naws_design ) Perform a chi-squared test of association for survey data: svychisq( ~ authorized_to_work + country_of_birth , naws_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( waget1 ~ authorized_to_work + country_of_birth , naws_design ) summary( glm_result ) Replication Example This example matches the unweighted counts and weighted percents of the gender rows shown on PDF page 90 of the most current research report; however, the restricted-use dataset does not include information to implement a finite population correction (FPC). Since an FPC always reduces the standard error, omitting it only makes results more conservative. JBS International shared standard errors and coefficients of variation omitting the FPC; this exercise precisely matches those numbers as well: # less conservative options( survey.lonely.psu = "remove" ) published_unweighted_counts <- c( 1823 , 775 ) published_percentages <- c( 0.68 , 0.32 ) unpublished_se <- c( 0.024 , 0.024 ) unpublished_cv <- c( 0.04 , 0.08 ) current_cohort <- subset( naws_design , interview_cohort == '2021-2022' ) ( unwtd_n <- svyby( ~ one , ~ gender , current_cohort , unwtd.count ) ) stopifnot( all( coef( unwtd_n ) == published_unweighted_counts ) ) ( results <- svymean( ~ gender , current_cohort ) ) stopifnot( all( round( coef( results ) , 2 ) == published_percentages ) ) stopifnot( all( round( SE( results ) , 3 ) == unpublished_se ) ) stopifnot( all( round( cv( results ) , 2 ) == unpublished_cv ) ) # more conservative options( survey.lonely.psu = "adjust" ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax.
srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NAWS users, this code replicates previously-presented examples: library(srvyr) naws_srvyr_design <- as_survey( naws_design ) Calculate the mean (average) of a linear variable, overall and by groups: naws_srvyr_design %>% summarize( mean = survey_mean( waget1 , na.rm = TRUE ) ) naws_srvyr_design %>% group_by( interview_cohort ) %>% summarize( mean = survey_mean( waget1 , na.rm = TRUE ) ) "],["national-beneficiary-survey-nbs.html", "National Beneficiary Survey (NBS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Beneficiary Survey (NBS) The principal microdata for U.S. disability researchers interested in Social Security program performance. One table with one row per respondent. A complex sample designed to generalize to Americans between age 18 and full retirement age, covered by either Social Security Disability Insurance (SSDI) or Supplemental Security Income (SSI). Released at irregular intervals, with 2004, 2005, 2006, 2010, 2015, 2017, and 2019 available. Administered by the Social Security Administration. Recommended Reading Four Example Strengths & Limitations: ✔️ Instrument designed to reduce challenges related to communication, stamina, and cognitive barriers ✔️ Longitudinal 2019 sample includes beneficiaries working at the prior round (2017) interview ❌ Not designed to produce regional or state-level estimates ❌ May overstate beneficiary poverty status and understate beneficiary income Three Example Findings: Large gaps in income and expenditure between Social Security Disability Insurance recipient households and working households generally increase with the number of dependents. The share of Social Security Disability Insurance beneficiaries who had work goals or work expectations rose from 34% in 2005 to 43% in 2015. In 2010, 9% of disabled-worker beneficiaries had a 4-year degree and 28% had less than a high school education.
Two Methodology Documents: National Beneficiary Survey: Disability Statistics, 2015 National Beneficiary Survey - General Waves Round 7: User’s Guide One Haiku: # social safety net # poverty acrobatics # trap or trampoline Download, Import, Preparation Download and import the round 7 file: library(haven) zip_tf <- tempfile() zip_url <- "https://www.ssa.gov/disabilityresearch/documents/R7NBSPUF_STATA.zip" download.file( zip_url , zip_tf , mode = 'wb' ) nbs_tbl <- read_stata( zip_tf ) nbs_df <- data.frame( nbs_tbl ) names( nbs_df ) <- tolower( names( nbs_df ) ) nbs_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nbs_fn <- file.path( path.expand( "~" ) , "NBS" , "this_file.rds" ) # saveRDS( nbs_df , file = nbs_fn , compress = FALSE ) Load the same object: # nbs_df <- readRDS( nbs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) # representative beneficiary sample nbs_design <- svydesign( id = ~ r7_a_psu_pub , strata = ~ r7_a_strata , weights = ~ r7_wtr7_ben , data = subset( nbs_df , r7_wtr7_ben > 0 ) ) # cross-sectional successful worker sample cssws_design <- svydesign( id = ~ r7_a_psu_pub , strata = ~ r7_a_strata , weights = ~ r7_wtr7_cssws , data = subset( nbs_df , r7_wtr7_cssws > 0 ) ) # longitudinal successful worker sample lngsws_design <- svydesign( id = ~ r7_a_psu_pub , strata = ~ r7_a_strata , weights = ~ r7_wtr7_lngsws , data = subset( nbs_df , r7_wtr7_lngsws > 0 ) ) Variable Recoding Add new columns to the data set: nbs_design <- update( nbs_design , male = as.numeric( r7_orgsampinfo_sex == 1 ) , age_categories = factor( r7_c_intage_pub , labels = c( "18-25" , "26-40" , "41-55" , "56 and older" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nbs_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_categories , nbs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nbs_design ) svyby( ~ one , ~ age_categories , nbs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ r7_c_hhsize_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_c_hhsize_pub , ~ age_categories , nbs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ r7_c_hhsize_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_c_hhsize_pub , ~ age_categories , nbs_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ r7_n_totssbenlastmnth_pub , nbs_design , 0.5 , na.rm = TRUE ) svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ r7_n_ssilastmnth_pub , denominator = ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) Subsetting Restrict the survey design to beneficiaries
currently covered by Medicare: sub_nbs_design <- subset( nbs_design , r7_c_curmedicare == 1 ) Calculate the mean (average) of this subset: svymean( ~ r7_n_totssbenlastmnth_pub , sub_nbs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nbs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ male , nbs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( r7_n_totssbenlastmnth_pub ~ male , nbs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ male + r7_c_hhsize_pub , nbs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( r7_n_totssbenlastmnth_pub ~ male + r7_c_hhsize_pub , nbs_design ) summary( glm_result ) Replication Example This example matches the percentages and t-tests from the final ten rows of Exhibit 4: ex_4 <- data.frame( variable_label = c( 'coping with stress' , 'concentrating' , 'getting around outside of the home' , 'shopping for personal items' , 'preparing meals' , 'getting into or out of bed' , 'bathing or dressing' , 'getting along with others' , 'getting around inside the house' , 'eating' ) , variable_name = c( "r3_i60_i" , "r3_i59_i" , "r3_i47_i" , "r3_i53_i" , "r3_i55_i" , "r3_i49_i" , "r3_i51_i" , "r3_i61_i" , "r3_i45_i" , "r3_i57_i" ) , overall = c( 61 , 58 , 47 , 39 , 37 , 34 , 30 , 27 , 23 , 14 ) , di_only = c( 60 , 54 , 47 , 36 , 35 , 36 , 30 , 23 , 24 , 13 ) , concurrent = c( 63 , 63 , 47 , 43 , 41 , 34 , 33 , 31 , 23 , 15 ) , concurrent_vs_di = c( F , T , F , F , F , F , F , T , F , F ) , ssi = c( 61 , 62 , 47 , 40 , 39 , 33 , 29 , 31 , 22 , 15 ) , ssi_vs_di = c( F , T , F , F , F , F , F , T , F , F ) ) Download, import, and recode the round 3 file: r3_tf <- tempfile() r3_url <- "https://www.ssa.gov/disabilityresearch/documents/nbsr3pufstata.zip" download.file( r3_url , r3_tf , mode = 'wb' ) r3_tbl <- read_stata( r3_tf ) r3_df <- data.frame( r3_tbl ) names( r3_df ) <- tolower( names( r3_df ) ) r3_design <- svydesign( id = ~ r3_a_psu_pub , strata = ~ r3_a_strata , weights = ~ r3_wtr3_ben , data = subset( r3_df , r3_wtr3_ben > 0 ) ) r3_design <- update( r3_design , benefit_type = factor( r3_orgsampinfo_bstatus , levels = c( 2 , 3 , 1 ) , labels = c( 'di_only' , 'concurrent' , 'ssi' ) ) ) Calculate the final ten rows of exhibit 4 and confirm each statistic and t-test matches: for( i in seq( nrow( ex_4 ) ) ){ this_formula <-
as.formula( paste( "~" , ex_4[ i , 'variable_name' ] ) ) overall_percent <- svymean( this_formula , r3_design ) stopifnot( 100 * round( coef( overall_percent ) , 2 ) == ex_4[ i , 'overall' ] ) benefit_percent <- svyby( this_formula , ~ benefit_type , r3_design , svymean ) stopifnot( all.equal( 100 * as.numeric( round( coef( benefit_percent ) , 2 ) ) , as.numeric( ex_4[ i , c( 'di_only' , 'concurrent' , 'ssi' ) ] ) ) ) ttest_formula <- as.formula( paste( ex_4[ i , 'variable_name' ] , "~ benefit_type" ) ) di_only_con_design <- subset( r3_design , benefit_type %in% c( 'di_only' , 'concurrent' ) ) con_ttest <- svyttest( ttest_formula , di_only_con_design ) stopifnot( all.equal( as.logical( con_ttest$p.value < 0.05 ) , as.logical( ex_4[ i , 'concurrent_vs_di' ] ) ) ) di_only_ssi_design <- subset( r3_design , benefit_type %in% c( 'di_only' , 'ssi' ) ) ssi_ttest <- svyttest( ttest_formula , di_only_ssi_design ) stopifnot( all.equal( as.logical( ssi_ttest$p.value < 0.05 ) , as.logical( ex_4[ i , 'ssi_vs_di' ] ) ) ) } Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NBS users, this code replicates previously-presented examples: library(srvyr) nbs_srvyr_design <- as_survey( nbs_design ) Calculate the mean (average) of a linear variable, overall and by groups: nbs_srvyr_design %>% summarize( mean = survey_mean( r7_n_totssbenlastmnth_pub , na.rm = TRUE ) ) nbs_srvyr_design %>% group_by( age_categories ) %>% summarize( mean = survey_mean( r7_n_totssbenlastmnth_pub , na.rm = TRUE ) ) "],["national-crime-victimization-survey-ncvs.html", "National Crime Victimization Survey (NCVS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Crime Victimization Survey (NCVS) The primary information source on victims of nonfatal personal crimes and household property crimes (especially those not reported to the police), and also victim experience within the justice system. Three tables: the first with one row per household per interview, the second with one row per person-interview, the third with one row per incident reported across each sampled household’s seven-interview, three-year period. A complex survey designed to generalize to civilian, non-institutionalized americans aged 12 and older. Released annually since its 1992 rename and redesign, with related surveys dating to the early 1970s. Sponsored by the Bureau of Justice Statistics and administered by the US Census Bureau. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed taxonomy of crime incidents ✔️ Estimates available for 22 largest states starting in 2017 ❌ May undercount rape and sexual assault ❌ Six-month reference period even though respondent recall is more accurate looking back only three months Three Example Findings: Nonfatal firearm violence for persons age 12 or older declined 72% from 1993 to 2023. In 2013, less than half of violent crime incidents victimizing individuals aged 12+ were reported to police. Offenders armed with firearms accounted for 38% of nonfatal carjackings between 2012 and 2021.
Two Methodology Documents: National Crime Victimization Survey, 2016: Technical Documentation A New Measure of Prevalence for the National Crime Victimization Survey One Haiku: # saint peter's sports bar # evil deed instant replay # sinful thought jukebox Function Definitions Define a function to extract values stored in parentheses: ncvs_numeric_to_factor <- function( this_column ) as.numeric( gsub( "^\\\\(([0-9]+)\\\\) (.*)" , "\\\\1" , this_column ) ) Define a function to merge aggregated information onto main data.frame objects: left_join_zero_missings <- function( left_df , right_df ){ final_df <- merge( left_df , right_df , all.x = TRUE ) stopifnot( nrow( final_df ) == nrow( left_df ) ) for( this_column in setdiff( names( right_df ) , names( left_df ) ) ){ final_df[ is.na( final_df[ , this_column ] ) , this_column ] <- 0 } gc() final_df } Download, Import, Preparation Register for the National Archive of Criminal Justice Data at https://www.icpsr.umich.edu/web/NACJD/series/95 Choose National Crime Victimization Survey, Concatenated File, [United States], 1992-2022 (ICPSR 38604) Download the R version of the September 18, 2023 file. Import the three main files: ncvs_household_df_name <- load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0001/38604-0001-Data.rda" ) ) ncvs_person_df_name <- load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0002/38604-0002-Data.rda" ) ) ncvs_incident_df_name <- load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0003/38604-0003-Data.rda" ) ) ncvs_household_df <- get( ncvs_household_df_name ) ncvs_person_df <- get( ncvs_person_df_name ) ncvs_incident_df <- get( ncvs_incident_df_name ) rm( list = ncvs_household_df_name ) ; gc() rm( list = ncvs_person_df_name ) ; gc() rm( list = ncvs_incident_df_name ) ; gc() names( ncvs_household_df ) <- tolower( names( ncvs_household_df ) ) names( ncvs_person_df ) <- tolower( names( ncvs_person_df ) ) names( ncvs_incident_df ) <- tolower( names( ncvs_incident_df ) ) Determine which variables from each table to retain: household_variables_to_keep <- c( 'year' , 'yearq' , 'idhh' , 'wgthhcy' , 'v2002' , 'sc214a' , 'v2026' , 'v2126a' , 'v2126b' , 'v2015' , 'v2017' , 'v2117' , 'v2118' , 'v2125' , 'v2071' , 'v2072' , 'v2127b' , 'v2129' ) person_variables_to_keep <- c( 'year' , 'yearq' , 'v3018' , 'v3014' , 'sc214a' , 'v3023' , 'v3023a' , 'v3024' , 'v3024a' , 'v2117' , 'v2118' , 'v3002' , 'idhh' , 'idper' , 'wgtpercy' , 'v3015' , 'v3033' , 'v2026' ) incident_variables_to_keep <- c( 'year' , 'yearq' , 'v2117' , 'v2118' , 'v4022' , paste0( 'v401' , 6:9 ) , 'v4399' , 'v4529' , 'v4049' , paste0( 'v405' , 0:8 ) , 'v4060' , 'v4062' , paste0( 'v41' , 11:22 ) , 'v4064' , paste0( 'v41' , 27:37 ) , 'v4467' , 'v4234' , 'v4245' , 'v4243' , 'v4241' , 'v4256' , 'v4258' , 'v4278' , 'v4262' , paste0( 'v42' , 59:61 ) , 'v4269' , 'v4270' , 'v4268' , 'v4267' , 'v4271' , 'v4266' , 'v4265' , 'wgtviccy' , 'idhh' , 'idper' , 'v4002' , 'v4288' , 'v4290' , 'v4400' , 'v4437' , 'v4422' , 'v4024' ) Limit columns in each data.frame to those specified above: ncvs_household_df <- ncvs_household_df[ , household_variables_to_keep ] ncvs_person_df <- ncvs_person_df[ , person_variables_to_keep ] ncvs_incident_df <- ncvs_incident_df[ , incident_variables_to_keep ] gc() In this example, limit the 1993-2022 data.frame to only the first & last years for quicker processing: ncvs_household_df <- ncvs_household_df[ ncvs_household_df[ , 'year' ] %in% c( 1994 , 2022 ) , ] ncvs_person_df <- ncvs_person_df[ ncvs_person_df[ , 'year' ] %in% c( 1994 , 2022 ) , ] 
ncvs_incident_df <- ncvs_incident_df[ ncvs_incident_df[ , 'year' ] %in% c( 1994 , 2022 ) , ] gc() Recode identifiers to character class: ncvs_household_df[ , 'idhh' ] <- as.character( ncvs_household_df[ , 'idhh' ] ) ncvs_person_df[ c( 'idhh' , 'idper' ) ] <- sapply( ncvs_person_df[ c( 'idhh' , 'idper' ) ] , as.character ) ncvs_incident_df[ c( 'idhh' , 'idper' ) ] <- sapply( ncvs_incident_df[ c( 'idhh' , 'idper' ) ] , as.character ) Recode factor variables to numeric values: ncvs_household_df[ sapply( ncvs_household_df , class ) == 'factor' ] <- sapply( ncvs_household_df[ sapply( ncvs_household_df , class ) == 'factor' ] , ncvs_numeric_to_factor , simplify = FALSE ) ncvs_person_df[ sapply( ncvs_person_df , class ) == 'factor' ] <- sapply( ncvs_person_df[ sapply( ncvs_person_df , class ) == 'factor' ] , ncvs_numeric_to_factor , simplify = FALSE ) ncvs_incident_df[ sapply( ncvs_incident_df , class ) == 'factor' ] <- sapply( ncvs_incident_df[ sapply( ncvs_incident_df , class ) == 'factor' ] , ncvs_numeric_to_factor , simplify = FALSE ) Add a column of ones to each data.frame: ncvs_household_df[ , 'one' ] <- 1 ncvs_person_df[ , 'one' ] <- 1 ncvs_incident_df[ , 'one' ] <- 1 Add a year group variable to each data.frame: ncvs_household_df[ , 'yr_grp' ] <- findInterval( ncvs_household_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) ) ncvs_person_df[ , 'yr_grp' ] <- findInterval( ncvs_person_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) ) ncvs_incident_df[ , 'yr_grp' ] <- findInterval( ncvs_incident_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) ) Add a flag indicating whether each incident occurred inside the country: ncvs_incident_df[ , 'exclude_outus' ] <- ncvs_incident_df[ , 'v4022' ] %in% 1 Add a half-year indicator to the incident data.frame: ncvs_incident_df <- transform( ncvs_incident_df , half_year = ifelse( substr( yearq , 6 , 6 ) %in% c( '1' , '2' ) , 1 , ifelse( substr( yearq , 6 , 6 ) %in% c( '3' , '4' ) , 2 , NA ) ) ) stopifnot( all( ncvs_incident_df[ , 'half_year' ] %in% 1:2 ) ) Define violent crimes on the incident data.frame: # rape and sexual assault ncvs_incident_df[ , 'rsa' ] <- ncvs_incident_df[ , 'v4529' ] %in% c( 1:4 , 15 , 16 , 18 , 19 ) # robbery ncvs_incident_df[ , 'rob' ] <- ncvs_incident_df[ , 'v4529' ] %in% 5:10 # assault ncvs_incident_df[ , 'ast' ] <- ncvs_incident_df[ , 'v4529' ] %in% c( 11:14 , 17 , 20 ) # simple assault ncvs_incident_df[ , 'sast' ] <- ncvs_incident_df[ , 'v4529' ] %in% c( 14 , 17 , 20 ) # aggravated assault ncvs_incident_df[ , 'aast' ] <- ncvs_incident_df[ , 'v4529' ] %in% 11:13 # violent crime ncvs_incident_df[ , 'violent' ] <- apply( ncvs_incident_df[ c( 'rsa' , 'rob' , 'ast' ) ] , 1 , any ) # violent crime excluding simple assault ncvs_incident_df[ , 'sviolent' ] <- apply( ncvs_incident_df[ , c( 'rsa' , 'rob' , 'aast' ) ] , 1 , any ) Define personal theft and then person-crime on the incident data.frame: ncvs_incident_df[ , 'ptft' ] <- ncvs_incident_df[ , 'v4529' ] %in% 21:23 ncvs_incident_df[ , 'personcrime' ] <- apply( ncvs_incident_df[ , c( 'violent' , 'ptft' ) ] , 1 , any ) Define property crimes on the incident data.frame: ncvs_incident_df[ , 'hhburg' ] <- ncvs_incident_df[ , 'v4529' ] %in% 31:33 # completed theft with something taken ncvs_incident_df[ , 'burg_ct' ] <- ( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) & ( ncvs_incident_df[ , 'v4288' ] %in% 1 ) # attempted theft ncvs_incident_df[ , 'burg_at' ] <- ( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) & ( ncvs_incident_df[ , 'v4290' ] %in% 1 ) ncvs_incident_df[ , 'burg_ncat' ] <- ( 
ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) & ( ncvs_incident_df[ , 'v4288' ] %in% 2 ) & ( ncvs_incident_df[ , 'v4290' ] %in% 2 ) ncvs_incident_df[ , 'burgcats2' ] <- 0 ncvs_incident_df[ ncvs_incident_df[ , 'burg_ncat' ] , 'burgcats2' ] <- 2 ncvs_incident_df[ ncvs_incident_df[ , 'burg_ct' ] | ncvs_incident_df[ , 'burg_at' ] , 'burgcats2' ] <- 1 ncvs_incident_df[ , 'burg' ] <- ncvs_incident_df[ , 'burgcats2' ] %in% 1 # trespassing ncvs_incident_df[ , 'tres' ] <- ncvs_incident_df[ , 'burgcats2' ] %in% 2 # motor vehicle theft ncvs_incident_df[ , 'mvtft' ] <- ncvs_incident_df[ , 'v4529' ] %in% 40:41 # household theft ncvs_incident_df[ , 'hhtft' ] <- ncvs_incident_df[ , 'v4529' ] %in% 54:59 # property crime ncvs_incident_df[ , 'property' ] <- apply( ncvs_incident_df[ c( 'hhburg' , 'mvtft' , 'hhtft' ) ] , 1 , any ) Define a series weight on the incident data.frame: ncvs_incident_df[ , 'series' ] <- 2 ncvs_incident_df[ ncvs_incident_df[ , 'v4017' ] %in% c( 1 , 8 ) | ncvs_incident_df[ , 'v4018' ] %in% c( 2 , 8 ) | ncvs_incident_df[ , 'v4019' ] %in% c( 1 , 8 ) , 'series' ] <- 1 ncvs_incident_df[ , 'serieswgt' ] <- 1 ncvs_incident_df[ !( ncvs_incident_df[ , 'v4016' ] %in% 997:998 ) , 'n10v4016' ] <- pmin( ncvs_incident_df[ !( ncvs_incident_df[ , 'v4016' ] %in% 997:998 ) , 'v4016' ] , 10 ) ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 , 'serieswgt' ] <- ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 , 'n10v4016' ] ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 & is.na( ncvs_incident_df[ , 'n10v4016' ] ) , 'serieswgt' ] <- 6 Aggregate property-crimes to the household-interview level: summed_hh_crimes <- aggregate( cbind( property * serieswgt , hhburg * serieswgt , mvtft * serieswgt , burg * serieswgt , tres * serieswgt ) ~ yearq + idhh + v4002 + wgtviccy , data = subset( ncvs_incident_df , !exclude_outus & property ) , sum ) names( summed_hh_crimes ) <- c( 'yearq' , 'idhh' , 'v2002' , 'wgtviccy' , 'property' , 'hhburg' , 'mvtft' , 'burg' , 'tres' ) Merge aggregated property-crimes on to the household-interview data.frame: ncvs_household_df <- left_join_zero_missings( ncvs_household_df , summed_hh_crimes ) rm( summed_hh_crimes ) ; gc() Aggregate person-crimes to the person-interview level: summed_person_crimes <- aggregate( cbind( violent * serieswgt , sviolent * serieswgt , rsa * serieswgt , rob * serieswgt , aast * serieswgt , sast * serieswgt , ptft * serieswgt ) ~ yearq + idhh + v4002 + idper + wgtviccy , data = subset( ncvs_incident_df , !exclude_outus & personcrime ) , sum ) names( summed_person_crimes ) <- c( 'yearq' , 'idhh' , 'v3002' , 'idper' , 'wgtviccy' , 'violent' , 'sviolent' , 'rsa' , 'rob' , 'aast' , 'sast' , 'ptft' ) Merge aggregated person-crimes on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_crimes ) rm( summed_person_crimes ) ; gc() Starting here, the weight calculation prepares an adjustment for all violence combined with the variables violent and violent_year. To calculate the prevalence rate of a subset of person-crimes, starting at this point, replace these two values with variables like rob and rob_year.
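For example, a robbery-only run would substitute rob for violent in the next step; here is a minimal sketch of just the first aggregation under that substitution, mirroring the chapter's own code:
# robbery analogue of the person-year aggregation of violent crimes shown below
summed_person_year_robberies <- aggregate( rob * serieswgt ~ idhh + idper + year , data = subset( ncvs_incident_df , !exclude_outus & rob ) , sum )
names( summed_person_year_robberies )[ ncol( summed_person_year_robberies ) ] <- 'rob_year'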
Aggregate violent crimes to the person-year level: summed_person_year_violent_crimes <- aggregate( violent * serieswgt ~ idhh + idper + year , data = subset( ncvs_incident_df , !exclude_outus & violent ) , sum ) names( summed_person_year_violent_crimes )[ ncol( summed_person_year_violent_crimes ) ] <- 'violent_year' Merge aggregated person-year violent crime series weights on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_year_violent_crimes ) rm( summed_person_year_violent_crimes ) ; gc() Aggregate violent crimes to the person-half-year level, then reshape into a wide data.frame: summed_person_half_year_violent_crimes <- aggregate( wgtviccy ~ idhh + idper + year + half_year , data = subset( ncvs_incident_df , !exclude_outus & violent ) , mean ) first_half_violent_crimes <- subset( summed_person_half_year_violent_crimes , half_year == 1 ) second_half_violent_crimes <- subset( summed_person_half_year_violent_crimes , half_year == 2 ) first_half_violent_crimes[ , 'half_year' ] <- second_half_violent_crimes[ , 'half_year' ] <- NULL names( first_half_violent_crimes )[ ncol( first_half_violent_crimes ) ] <- 'vwgt1' names( second_half_violent_crimes )[ ncol( second_half_violent_crimes ) ] <- 'vwgt2' wide_person_half_year_violent_crimes <- merge( first_half_violent_crimes , second_half_violent_crimes , all = TRUE ) Merge both violent crime weights on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , wide_person_half_year_violent_crimes ) rm( wide_person_half_year_violent_crimes ) ; gc() Find the maximum incident victim weight among three half-year periods: max_half_v_crimes <- aggregate( wgtviccy ~ idhh + idper + year + half_year + v4002 , data = subset( ncvs_incident_df , !exclude_outus & violent ) , max ) max_half_v_crimes <- max_half_v_crimes[ do.call( order , max_half_v_crimes[ c( 'idhh' , 'idper' , 'year' , 'half_year' ) ] ) , ] max_half_v_crimes[ , 'byvar' ] <- apply( max_half_v_crimes[ c( 'idhh' , 'idper' , 'year' , 'half_year' ) ] , 1 , paste , collapse = ' ' ) max_half_v_crimes[ 1 , 'id' ] <- 1 for( i in seq( 2 , nrow( max_half_v_crimes ) ) ){ if( max_half_v_crimes[ i , 'byvar' ] == max_half_v_crimes[ i - 1 , 'byvar' ] ){ max_half_v_crimes[ i , 'id' ] <- max_half_v_crimes[ i - 1 , 'id' ] + 1 } else { max_half_v_crimes[ i , 'id' ] <- 1 } } max_half_v_crimes[ , 'label' ] <- paste0( '_' , max_half_v_crimes[ , 'half_year' ] , '_' , max_half_v_crimes[ , 'id' ] ) max_half_v_crimes[ , 'byvar' ] <- NULL stopifnot( all( max_half_v_crimes[ , 'label' ] %in% c( '_1_1' , '_2_1' , '_1_2' ) ) ) h_1_1_df <- max_half_v_crimes[ max_half_v_crimes[ , 'label' ] == '_1_1' , c( 'idhh' , 'idper' , 'year' , 'wgtviccy' ) ] names( h_1_1_df )[ ncol( h_1_1_df ) ] <- 'wgtviccy_1_1' h_2_1_df <- max_half_v_crimes[ max_half_v_crimes[ , 'label' ] == '_2_1' , c( 'idhh' , 'idper' , 'year' , 'wgtviccy' ) ] names( h_2_1_df )[ ncol( h_2_1_df ) ] <- 'wgtviccy_2_1' h_1_2_df <- max_half_v_crimes[ max_half_v_crimes[ , 'label' ] == '_1_2' , c( 'idhh' , 'idper' , 'year' , 'wgtviccy' ) ] names( h_1_2_df )[ ncol( h_1_2_df ) ] <- 'wgtviccy_1_2' three_half_df <- Reduce( function( ... ) merge( ... 
, all = TRUE ) , list( h_1_1_df , h_2_1_df , h_1_2_df ) ) rm( h_1_1_df , h_2_1_df , h_1_2_df ) ; gc() Merge these three half-year period weights on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , three_half_df ) rm( three_half_df ) ; gc() Aggregate interview counts to the person-year level: summed_person_year_interviews <- aggregate( one ~ idhh + idper + year , data = subset( ncvs_person_df , wgtpercy > 0 ) , sum ) names( summed_person_year_interviews )[ ncol( summed_person_year_interviews ) ] <- 'interview_count' Merge interview_count on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_year_interviews ) rm( summed_person_year_interviews ) ; gc() Apply Interview/Incident Groups: ncvs_person_df <- transform( ncvs_person_df , interview_incident_groups = ifelse( violent_year == 0 , 1 , ifelse( interview_count == 1 & ( ( as.numeric( vwgt1 > 0 ) + as.numeric( vwgt2 > 0 ) ) == 1 ) & wgtviccy > 0 , 2 , ifelse( interview_count == 2 & ( ( as.numeric( vwgt1 > 0 ) + as.numeric( vwgt2 > 0 ) ) == 1 ) , 3 , ifelse( interview_count == 2 & ( vwgt1 > 0 ) & ( vwgt2 > 0 ) & ( wgtviccy > 0 ) , 4 , ifelse( interview_count == 3 & ( ( as.numeric( wgtviccy_1_1 > 0 ) + as.numeric( wgtviccy_2_1 > 0 ) + as.numeric( wgtviccy_1_2 > 0 ) ) == 1 ) , 5 , ifelse( interview_count == 3 & ( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & ( wgtviccy_1_2 > 0 ) , 6 , ifelse( interview_count == 3 & ( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & substr( yearq , 6 , 6 ) %in% 1:2 , 7 , ifelse( interview_count == 3 & ( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & substr( yearq , 6 , 6 ) %in% 3:4 , 8 , 9 ) ) ) ) ) ) ) ) ) # confirm all records in group 9 have both a wgtviccy == 0 & wgtpercy == 0 stopifnot( nrow( subset( ncvs_person_df , interview_incident_groups == 9 & wgtviccy > 0 ) ) == 0 ) stopifnot( nrow( subset( ncvs_person_df , interview_incident_groups == 9 & wgtpercy > 0 ) ) == 0 ) ncvs_person_df <- transform( ncvs_person_df , prev_wgt0 = ifelse( interview_incident_groups == 1 , wgtpercy , ifelse( interview_incident_groups == 2 , wgtviccy / 2 , ifelse( interview_incident_groups == 3 , pmax( vwgt1 , vwgt2 , na.rm = TRUE ) / 2 , ifelse( interview_incident_groups == 4 , wgtviccy / 2 , ifelse( interview_incident_groups == 5 , pmax( wgtviccy_1_1 , wgtviccy_1_2 , wgtviccy_2_1 , na.rm = TRUE ) / 2 , ifelse( interview_incident_groups == 6 , wgtviccy / 2 , ifelse( interview_incident_groups == 7 , wgtviccy_1_1 / 2 , ifelse( interview_incident_groups == 8 , wgtviccy_2_1 / 2 , ifelse( interview_incident_groups == 9 , 0 , NA ) ) ) ) ) ) ) ) ) ) # matches table 8 # https://www.ojp.gov/pdffiles1/bjs/grants/308745.pdf#page=44 Aggregate wgtviccy and prev_wgt0 sums to the year level, then merge: summed_year_weights <- aggregate( cbind( wgtviccy , prev_wgt0 ) ~ year , data = subset( ncvs_person_df , violent_year == 1 ) , sum ) names( summed_year_weights ) <- c( 'year' , 'vwgt_1v' , 'prev_1v' ) ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_year_weights ) rm( summed_year_weights ) ; gc() Calibrate so that the weight sums to wgtviccy for persons with exactly one victimization: ncvs_person_df <- transform( ncvs_person_df , prev_wgt1 = ifelse( violent_year == 0 , prev_wgt0 , ifelse( violent_year > 0 & wgtpercy > 0 , prev_wgt0 * ( vwgt_1v / prev_1v ) , 0 ) ) ) Aggregate wgtpercy and prev_wgt1 sums to the year level, then merge: summed_year_crimes <- aggregate( cbind( wgtpercy , ifelse( violent_year > 0 , prev_wgt1 , 0 ) , ifelse(
violent_year == 0 , prev_wgt1 , 0 ) ) ~ year , data = ncvs_person_df , sum ) names( summed_year_crimes ) <- c( 'year' , 'total_persons' , 'prev_with_crime' , 'prev_no_crime' ) ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_year_crimes ) rm( summed_year_crimes ) ; gc() Calibrate so that the weight sums to wgtpercy for all persons: ncvs_person_df <- transform( ncvs_person_df , prev_wgt = ifelse( violent_year == 0 , prev_wgt1 * ( ( total_persons - prev_with_crime ) / prev_no_crime ) , prev_wgt1 ) ) Save Locally   Save the object at any point: # ncvs_fn <- file.path( path.expand( "~" ) , "NCVS" , "this_file.rds" ) # saveRDS( ncvs_df , file = ncvs_fn , compress = FALSE ) Load the same object: # ncvs_df <- readRDS( ncvs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options('survey.lonely.psu' = 'adjust') # replace missing clusters ncvs_person_df[ is.na( ncvs_person_df[ , 'v2118' ] ) , 'v2118' ] <- -1 ncvs_person_df[ is.na( ncvs_person_df[ , 'v2117' ] ) , 'v2117' ] <- -1 # subset this dataset to only 2022 ncvs_df <- subset( ncvs_person_df , year == max( year ) ) ncvs_design <- svydesign( ~ v2118 , strata = ~ interaction( yr_grp , v2117 ) , data = ncvs_df , weights = ~ prev_wgt , nest = TRUE ) Variable Recoding Add new columns to the data set: ncvs_design <- update( ncvs_design , one = 1 , victim = as.numeric( violent_year > 0 ) , sex = factor( v3018 , levels = 1:2 , labels = c( 'male' , 'female' ) ) , linear_age = ifelse( v3014 == 99 , NA , v3014 ) , times_moved_in_prior_five_years = ifelse( v3033 == 99 , NA , v3033 ) , current_marital_status = factor( v3015 , levels = c( 1:5 , 8 ) , labels = c( 'married' , 'widowed' , 'divorced' , 'separated' , 'single' , 'residue' ) ) , household_income_starting_2015q1 = factor( findInterval( sc214a , c( 1 , 9 , 13 , 16 , 18 ) ) , levels = 1:5 , labels = c( 'less than $25,000' , '$25,000 - $49,999' , '$50,000 - $99,999' , '$100,000 - $199,999' , '$200,000 or more' ) ) , household_income_75k = ifelse( v2026 == 98 , NA , as.numeric( v2026 %in% 14:18 ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( ncvs_design , "sampling" ) != 0 ) svyby( ~ one , ~ sex , ncvs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , ncvs_design ) svyby( ~ one , ~ sex , ncvs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ victim , ncvs_design ) svyby( ~ victim , ~ sex , ncvs_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ current_marital_status , ncvs_design ) svyby( ~ current_marital_status , ~ sex , ncvs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ victim , ncvs_design ) svyby( ~ victim , ~ sex , ncvs_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ current_marital_status , ncvs_design ) svyby( ~ current_marital_status , ~ sex , ncvs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ victim , ncvs_design , 0.5 ) svyby( ~ victim , ~ sex , ncvs_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ times_moved_in_prior_five_years , denominator = ~ linear_age , ncvs_design , na.rm = TRUE ) 
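As an optional sanity check on the prevalence weight constructed earlier in this chapter: by construction, prev_wgt and wgtpercy should sum to the same total within each year. A minimal sketch verifying this on the full person-interview data.frame:
# the calibrated prevalence weight preserves the yearly person weight totals
stopifnot( all.equal( with( ncvs_person_df , tapply( prev_wgt , year , sum ) ) , with( ncvs_person_df , tapply( wgtpercy , year , sum ) ) ) )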
Subsetting Restrict the survey design to elderly americans: sub_ncvs_design <- subset( ncvs_design , linear_age >= 65 ) Calculate the mean (average) of this subset: svymean( ~ victim , sub_ncvs_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ victim , ncvs_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ victim , ~ sex , ncvs_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ncvs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ victim , ncvs_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ victim , ncvs_design , deff = TRUE ) # SRS with replacement svymean( ~ victim , ncvs_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ household_income_75k , ncvs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( victim ~ household_income_75k , ncvs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ household_income_75k + current_marital_status , ncvs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( victim ~ household_income_75k + current_marital_status , ncvs_design ) summary( glm_result ) Replication Example This example matches the 1994 and 2022 victimization rates and SEs in Appendix Table 1: new_prevalence_design <- svydesign( ~ v2118 , strata = ~ interaction( yr_grp , v2117 ) , data = ncvs_person_df , weights = ~ prev_wgt , nest = TRUE ) new_prevalence_results <- svyby( ~ as.numeric( violent_year > 0 ) , ~ year , new_prevalence_design , svymean ) # match new method (wgt_ovam) 1994 and 2022 estimates stopifnot( round( coef( new_prevalence_results )[ c( 1 , nrow( new_prevalence_results ) ) ] , 4 ) == c( 0.0442 , 0.0151 ) ) # match new method (wgt_ovam) 1994 and 2022 standard errors stopifnot( round( SE( new_prevalence_results )[ c( 1 , nrow( new_prevalence_results ) ) ] , 5 ) == c( 0.0010 , 0.00054 ) ) old_prevalence_design <- svydesign( ~ v2118 , strata = ~ interaction( yr_grp , v2117 ) , data = ncvs_person_df , weights = ~ wgtpercy , nest = TRUE ) old_prevalence_results <- svyby( ~ as.numeric( violent_year > 0 ) , ~ year , old_prevalence_design , svymean ) # match old method (wgtpercy) 1994 and 2022 estimates stopifnot( round( coef( old_prevalence_results )[ c( 1 , nrow( old_prevalence_results ) ) ] , 4 ) == c( 0.0328 , 0.0124 ) ) # match old method (wgtpercy) 1994 and 2022 standard errors stopifnot( round( SE( old_prevalence_results )[ c( 1 , nrow( old_prevalence_results ) ) ] , 5 ) == c( 0.00075 , 0.00042 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for NCVS users, this code replicates previously-presented examples: library(srvyr) ncvs_srvyr_design <- as_survey( ncvs_design ) Calculate the mean (average) of a linear variable, overall and by groups: ncvs_srvyr_design %>% summarize( mean = survey_mean( victim ) ) ncvs_srvyr_design %>% group_by( sex ) %>% summarize( mean = survey_mean( victim ) ) "],["national-financial-capability-study-nfcs.html", "National Financial Capability Study (NFCS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Financial Capability Study (NFCS) A study of financial knowledge and behavior, such as making ends meet, planning ahead, and managing assets. One state-by-state survey table with one row per sampled respondent, plus a separate investor survey. An online non-probability sample of U.S. adults (18+) calibrated to the American Community Survey. Released triennially since 2009. Funded by the FINRA Investor Education Foundation and conducted by FGS Global. Recommended Reading Four Example Strengths & Limitations: ✔️ Comprehensive assessment of financial literacy ✔️ Questionnaire replicated by other studies ❌ Non-probability quota sampling from online panels ❌ Limited income and asset detail compared to CPS or SCF Three Example Findings: In 2018, 33% of Americans aged 51-61 were satisfied with their personal financial situations. The gender gap in financial literacy widened with age in 2021. Average scores on a test of five financial literacy questions declined between 2009 and 2021. Two Methodology Documents: 2021 National Financial Capability Study: State-by-State Survey Methodology Financial Capability Insights: What the NFCS Reveals One Haiku: # lady madonna # laid bank balance goose egg, loves # gold unrequited Download, Import, Preparation Download and import the latest state-by-state microdata: library(haven) zip_tf <- tempfile() zip_url <- 'https://finrafoundation.org/sites/finrafoundation/files/2021-SxS-Data-and-Data-Info.zip' download.file( zip_url , zip_tf , mode = 'wb' ) unzipped_files <- unzip( zip_tf , exdir = tempdir() ) stata_fn <- grep( "\\\\.dta$" , unzipped_files , value = TRUE ) nfcs_tbl <- read_dta( stata_fn ) nfcs_df <- data.frame( nfcs_tbl ) names( nfcs_df ) <- tolower( names( nfcs_df ) ) Add a column of all ones, add labels to state names, add labels to the rainy day fund question: nfcs_df[ , 'one' ] <- 1 nfcs_df[ , 'state_name' ] <- factor( nfcs_df[ , 'stateq' ] , levels = 1:51 , labels = sort( c( 'District of Columbia' , state.name ) ) ) nfcs_df[ , 'rainy_day_fund' ] <- factor( nfcs_df[ , 'j5' ] , levels = c( 1 , 2 , 98 , 99 ) , labels = c( 'Yes' , 'No' , "Don't Know" , "Prefer not to say" ) ) Save Locally   Save the object at any point: # nfcs_fn <- file.path( path.expand( "~" ) , "NFCS" , "this_file.rds" ) # saveRDS( nfcs_df , file = nfcs_fn , compress = FALSE ) Load the same object: # nfcs_df <- readRDS( nfcs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nfcs_design <- svydesign( ~ 1 , data = nfcs_df , weights = ~ wgt_n2 ) division_design <- svydesign( ~ 1 , data = nfcs_df , weights = ~ wgt_d2 ) state_design <- svydesign( ~ 1 , data = nfcs_df , weights = ~ wgt_s3 ) Variable Recoding Add new columns to the data set: nfcs_design <- update( nfcs_design , satisfaction_w_finances = ifelse( j1 > 10 , NA , j1 ) , risk_taking = ifelse( j2 > 10 , NA , j2 ) , difficult_to_pay_bills = factor( j4 , levels = c( 1 , 2 , 3 , 98 , 99 ) ,
labels = c( 'Very difficult' , 'Somewhat difficult' , 'Not at all difficult' , "Don't know" , 'Prefer not to say' ) ) , spending_vs_income = factor( j3 , levels = c( 1 , 2 , 3 , 98 , 99 ) , labels = c( 'Spending less than income' , 'Spending more than income' , 'Spending about equal to income' , "Don't know" , 'Prefer not to say' ) ) , unpaid_medical_bills = ifelse( g20 > 2 , NA , as.numeric( g20 == 1 ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nfcs_design , "sampling" ) != 0 ) svyby( ~ one , ~ spending_vs_income , nfcs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nfcs_design ) svyby( ~ one , ~ spending_vs_income , nfcs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ difficult_to_pay_bills , nfcs_design ) svyby( ~ difficult_to_pay_bills , ~ spending_vs_income , nfcs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ difficult_to_pay_bills , nfcs_design ) svyby( ~ difficult_to_pay_bills , ~ spending_vs_income , nfcs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ satisfaction_w_finances , nfcs_design , 0.5 , na.rm = TRUE ) svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ satisfaction_w_finances , denominator = ~ risk_taking , nfcs_design , na.rm = TRUE ) Subsetting Restrict the survey design to persons receiving pandemic-related stimulus payment: sub_nfcs_design <- subset( nfcs_design , j50 == 1 ) Calculate the mean (average) of this subset: svymean( ~ satisfaction_w_finances , sub_nfcs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nfcs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using 
methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ unpaid_medical_bills , nfcs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( satisfaction_w_finances ~ unpaid_medical_bills , nfcs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ unpaid_medical_bills + difficult_to_pay_bills , nfcs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( satisfaction_w_finances ~ unpaid_medical_bills + difficult_to_pay_bills , nfcs_design ) summary( glm_result ) Replication Example This example matches the unweighted count shown on PDF page 4: stopifnot( nrow( nfcs_df ) == 27118 ) This example matches the PDF page 7 estimate that 53% have three months of rainy day funds: national_rainy_day <- svymean( ~ rainy_day_fund , nfcs_design ) stopifnot( round( coef( national_rainy_day )[ 'rainy_day_fundYes' ] , 2 ) == 0.53 ) This example matches counts and rainy day estimates from The Geography of Financial Capability: state_counts <- svyby( ~ one , ~ state_name , state_design , unwtd.count ) stopifnot( state_counts[ 'California' , 'counts' ] == 1252 ) stopifnot( state_counts[ 'Missouri' , 'counts' ] == 501 ) stopifnot( state_counts[ 'Oregon' , 'counts' ] == 1261 ) state_rainy_day <- svyby( ~ rainy_day_fund , ~ state_name , state_design , svymean ) stopifnot( round( state_rainy_day[ 'California' , 'rainy_day_fundYes' ] , 2 ) == 0.57 ) stopifnot( round( state_rainy_day[ 'Missouri' , 'rainy_day_fundYes' ] , 2 ) == 0.51 ) stopifnot( round( state_rainy_day[ 'Oregon' , 'rainy_day_fundYes' ] , 2 ) == 0.52 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NFCS users, this code replicates previously-presented examples: library(srvyr) nfcs_srvyr_design <- as_survey( nfcs_design ) Calculate the mean (average) of a linear variable, overall and by groups: nfcs_srvyr_design %>% summarize( mean = survey_mean( satisfaction_w_finances , na.rm = TRUE ) ) nfcs_srvyr_design %>% group_by( spending_vs_income ) %>% summarize( mean = survey_mean( satisfaction_w_finances , na.rm = TRUE ) ) "],["national-health-and-nutrition-examination-survey-nhanes.html", "National Health and Nutrition Examination Survey (NHANES) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Direct Method of Age-Adjustment Replication Example Analysis Examples with srvyr  ", " National Health and Nutrition Examination Survey (NHANES) Doctors and dentists accompany survey interviewers in a mobile medical center that travels the country. While survey researchers read the questionnaires, medical professionals administer laboratory tests and conduct a full medical examination. The blood work and in-person check-up allow epidemiologists to answer questions like, “how many people have diabetes but don’t know they have diabetes?” Many tables containing information from the various examinations, generally one row per respondent. A complex sample survey designed to generalize to the civilian non-institutionalized U.S. population. 
Released biennially since 1999-2000. Administered by the Centers for Disease Control and Prevention. Recommended Reading Four Example Strengths & Limitations: ✔️ Biospecimen, dietary, and laboratory data ✔️ Basis for growth charts found on the walls of pediatricians’ offices and clinics worldwide ❌ Mobile Examination Centers require 150 minutes per interview ❌ Narrow set of demographic and family relationship questions Three Example Findings: Among US adults with diabetes across 2007-2010 and 2015-2018, the share achieving glycemic control (glycated hemoglobin level <7%) declined from 57.4% to 50.5%. Ten million Americans alive in 2015 had childhood blood lead levels 5x above the level of concern. Among US children aged 2-5 years interviewed between 2017 and March of 2020 with at least one primary tooth, 11% had at least one untreated decayed primary tooth. Two Methodology Documents: About the National Health and Nutrition Examination Survey NHANES Tutorials One Haiku: # doctor, dentist, labs # mobile examination #vanlife interviews Download, Import, Preparation Download and import the demographics (demo) and total cholesterol laboratory (tchol) data: library(haven) nhanes_2015_2016_demo_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT" nhanes_2017_2018_demo_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT" nhanes_2015_2016_tchol_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/TCHOL_I.XPT" nhanes_2017_2018_tchol_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TCHOL_J.XPT" nhanes_2015_2016_demo_tbl <- read_xpt( nhanes_2015_2016_demo_url ) nhanes_2017_2018_demo_tbl <- read_xpt( nhanes_2017_2018_demo_url ) nhanes_2015_2016_tchol_tbl <- read_xpt( nhanes_2015_2016_tchol_url ) nhanes_2017_2018_tchol_tbl <- read_xpt( nhanes_2017_2018_tchol_url ) nhanes_2015_2016_demo_df <- data.frame( nhanes_2015_2016_demo_tbl ) nhanes_2017_2018_demo_df <- data.frame( nhanes_2017_2018_demo_tbl ) nhanes_2015_2016_tchol_df <- data.frame( nhanes_2015_2016_tchol_tbl ) nhanes_2017_2018_tchol_df <- data.frame( nhanes_2017_2018_tchol_tbl ) Specify which variables to keep from both the demo and tchol data files, then stack the four years: demo_vars <- c( # unique person identifier (merge variable) "SEQN" , # the two-year interviewed + MEC examined weight "WTMEC2YR" , # note that this is a special weight for only # individuals who took the mobile examination center (MEC) exam # there is one other weight available - WTINT2YR - # that should be used when MEC variables are not part of the analysis # interviewed only or interviewed + MEC "RIDSTATR" , # primary sampling unit variable, used in complex design "SDMVPSU" , # strata variable, used in complex design "SDMVSTRA" , # race / ethnicity "RIDRETH3" , # age "RIDAGEYR" , # gender "RIAGENDR" , # pregnant at interview "RIDEXPRG" ) nhanes_2015_2018_demo_df <- rbind( nhanes_2015_2016_demo_df[ , demo_vars ] , nhanes_2017_2018_demo_df[ , demo_vars ] ) tchol_vars <- c( # unique person identifier (merge variable) "SEQN" , # laboratory total cholesterol variable # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TCHOL_J.htm "LBXTC" ) nhanes_2015_2018_tchol_df <- rbind( nhanes_2015_2016_tchol_df[ , tchol_vars ] , nhanes_2017_2018_tchol_df[ , tchol_vars ] ) Merge the two pooled datasets, limit the data.frame to mobile examination component respondents: nhanes_full_df <- merge( nhanes_2015_2018_demo_df , nhanes_2015_2018_tchol_df , all = TRUE ) names( nhanes_full_df ) <- tolower( names( nhanes_full_df ) ) nhanes_df <- subset( nhanes_full_df , ridstatr %in% 2 ) 
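Optionally, verify the merge behaved as expected. This is a minimal sanity-check sketch, assuming seqn uniquely identifies each respondent across the two pooled cycles: # each MEC respondent should appear exactly once stopifnot( !anyDuplicated( nhanes_df[ , 'seqn' ] ) ) # every MEC-examined respondent from the stacked demo files should be retained stopifnot( nrow( nhanes_df ) == sum( nhanes_full_df[ , 'ridstatr' ] %in% 2 ) )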
Scale the mobile examination component two-year weight to generalize to the pooled, four year period: nhanes_df[ , 'wtmec4yr' ] <- nhanes_df[ , 'wtmec2yr' ] / 2 Save Locally   Save the object at any point: # nhanes_fn <- file.path( path.expand( "~" ) , "NHANES" , "this_file.rds" ) # saveRDS( nhanes_df , file = nhanes_fn , compress = FALSE ) Load the same object: # nhanes_df <- readRDS( nhanes_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nhanes_design <- svydesign( id = ~ sdmvpsu , strata = ~ sdmvstra , nest = TRUE , weights = ~ wtmec4yr , data = nhanes_df ) Variable Recoding Add new columns to the data set: nhanes_design <- update( nhanes_design , one = 1 , # define high total cholesterol as 1 if mg/dL is at or above 240 and zero otherwise. hi_tchol = ifelse( lbxtc >= 240 , 1 , 0 ) , gender = factor( riagendr , levels = 1:2 , labels = c( 'male' , 'female' ) ) , age_categories = factor( 1 + findInterval( ridageyr , c( 20 , 40 , 60 ) ) , levels = 1:4 , labels = c( "0-19" , "20-39" , "40-59" , "60+" ) ) , # recode the ridreth3 variable as: # mexican american and other hispanic -> 4 # non-hispanic white -> 1 # non-hispanic black -> 2 # non-hispanic asian -> 3 # other race including multi-racial -> 5 race_ethnicity = factor( c( 4 , 4 , 1 , 2 , NA , 3 , 5 )[ ridreth3 ] , levels = 1:5 , labels = c( 'nh white' , 'nh black' , 'nh asian' , 'hispanic' , 'other' ) ) , pregnant_at_interview = ifelse( ridexprg %in% 1:2 , as.numeric( ridexprg == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nhanes_design , "sampling" ) != 0 ) svyby( ~ one , ~ race_ethnicity , nhanes_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nhanes_design ) svyby( ~ one , ~ race_ethnicity , nhanes_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ lbxtc , nhanes_design , na.rm = TRUE ) svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ riagendr , nhanes_design ) svyby( ~ riagendr , ~ race_ethnicity , nhanes_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ lbxtc , nhanes_design , na.rm = TRUE ) svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ riagendr , nhanes_design ) svyby( ~ riagendr , ~ race_ethnicity , nhanes_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ lbxtc , nhanes_design , 0.5 , na.rm = TRUE ) svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ lbxtc , denominator = ~ ridageyr , nhanes_design , na.rm = TRUE ) Subsetting Restrict the survey design to respondents aged 60 or older: sub_nhanes_design <- subset( nhanes_design , age_categories == "60+" ) Calculate the mean (average) of this subset: svymean( ~ lbxtc , sub_nhanes_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ lbxtc , 
nhanes_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nhanes_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ lbxtc , nhanes_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ lbxtc , nhanes_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ lbxtc , nhanes_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ pregnant_at_interview , nhanes_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( lbxtc ~ pregnant_at_interview , nhanes_design ) Perform a chi-squared test of association for survey data: svychisq( ~ pregnant_at_interview + riagendr , nhanes_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( lbxtc ~ pregnant_at_interview + riagendr , nhanes_design ) summary( glm_result ) Direct Method of Age-Adjustment Replication Example This example matches the total cholesterol statistics and standard errors in Table 1 from Data Brief 363: Match the crude estimates in the footnote and also in the unadjusted age categories: crude_overall <- svymean( ~ hi_tchol , subset( nhanes_design , ridageyr >= 20 ) , na.rm = TRUE ) stopifnot( round( coef( crude_overall ) , 3 ) == 0.115 ) crude_by_gender <- svyby( ~ hi_tchol , ~ gender , subset( nhanes_design , ridageyr >= 20 ) , svymean , na.rm = TRUE ) stopifnot( round( coef( crude_by_gender )[ 1 ] , 3 ) == 0.103 ) stopifnot( round( coef( crude_by_gender )[ 2 ] , 3 ) == 0.126 ) crude_by_age <- svyby( ~ hi_tchol , ~ age_categories , subset( nhanes_design , ridageyr >= 20 ) , svymean , na.rm = TRUE ) stopifnot( round( coef( crude_by_age )[ 1 ] , 3 ) == 0.075 ) stopifnot( round( coef( crude_by_age )[ 2 ] , 3 ) == 0.157 ) stopifnot( round( coef( crude_by_age )[ 3 ] , 3 ) == 0.114 ) stopifnot( round( SE( crude_by_age )[ 1 ] , 3 ) == 0.005 ) stopifnot( round( SE( crude_by_age )[ 2 ] , 3 ) == 0.011 ) stopifnot( round( SE( crude_by_age )[ 3 ] , 3 ) == 0.008 ) Sum up 2000 Census totals based on the age groupings specified in footnote: pop_by_age <- data.frame( age_categories = c( "0-19" , "20-39" , "40-59" , "60+" ) , Freq = c( 78782657 , 77670618 , 72816615 , 45363752 ) ) Create a design with the nationwide population stratified to the above census counts: nhanes_age_adjusted <- postStratify( subset( nhanes_design , !is.na( hi_tchol ) ) , ~ age_categories , pop_by_age ) Match the overall adjusted estimates: results_overall <- svymean( ~ hi_tchol , subset( nhanes_age_adjusted , ridageyr >= 20 ) , na.rm = TRUE ) stopifnot( round( coef( results_overall ) , 3 ) == 0.114 ) stopifnot( round( SE( results_overall ) , 3 ) == 0.006 ) Create a design stratified to census counts broken out by gender, then match those estimates: nhanes_by_gender <- svystandardize( nhanes_design , by = ~ age_categories , # stratification variable over = ~ gender , # break out variable population = pop_by_age , # data.frame containing census populations excluding.missing = ~ hi_tchol # analysis 
variable of interest ) results_by_gender <- svyby( ~ hi_tchol , ~ gender , subset( nhanes_by_gender , ridageyr >= 20 ) , svymean , na.rm=TRUE ) stopifnot( round( coef( results_by_gender )[ 1 ] , 3 ) == 0.105 ) stopifnot( round( coef( results_by_gender )[ 2 ] , 3 ) == 0.121 ) stopifnot( round( SE( results_by_gender )[ 1 ] , 3 ) == 0.007 ) stopifnot( round( SE( results_by_gender )[ 2 ] , 3 ) == 0.008 ) Create a design stratified to census counts broken out by race/ethnicity, then match those estimates: nhanes_by_race <- svystandardize( nhanes_design , by = ~ age_categories , # stratification variable over = ~ race_ethnicity , # break out variable population = pop_by_age , # data.frame containing census populations excluding.missing = ~ hi_tchol # analysis variable of interest ) results_by_race_ethnicity <- svyby( ~ hi_tchol , ~ race_ethnicity , design = subset( nhanes_by_race , ridageyr >= 20 ) , svymean , na.rm=TRUE ) stopifnot( round( coef( results_by_race_ethnicity )[ 1 ] , 3 ) == 0.117 ) stopifnot( round( coef( results_by_race_ethnicity )[ 2 ] , 3 ) == 0.100 ) stopifnot( round( coef( results_by_race_ethnicity )[ 3 ] , 3 ) == 0.116 ) stopifnot( round( coef( results_by_race_ethnicity )[ 4 ] , 3 ) == 0.109 ) stopifnot( round( SE( results_by_race_ethnicity )[ 1 ] , 3 ) == 0.007 ) stopifnot( round( SE( results_by_race_ethnicity )[ 2 ] , 3 ) == 0.009 ) stopifnot( round( SE( results_by_race_ethnicity )[ 3 ] , 3 ) == 0.011 ) stopifnot( round( SE( results_by_race_ethnicity )[ 4 ] , 3 ) == 0.009 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NHANES users, this code replicates previously-presented examples: library(srvyr) nhanes_srvyr_design <- as_survey( nhanes_design ) Calculate the mean (average) of a linear variable, overall and by groups: nhanes_srvyr_design %>% summarize( mean = survey_mean( lbxtc , na.rm = TRUE ) ) nhanes_srvyr_design %>% group_by( race_ethnicity ) %>% summarize( mean = survey_mean( lbxtc , na.rm = TRUE ) ) "],["national-health-interview-survey-nhis.html", "National Health Interview Survey (NHIS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " National Health Interview Survey (NHIS) America’s most detailed household survey of health status and medical experience. One table with one row per sampled adult (18+) within each sampled household, one table with one row per sample child (when available, same family not required), multiply-imputed income tables. A complex sample survey designed to generalize to the U.S. civilian non-institutionalized population. Released annually since 1963, the most recent major re-design in 2019. Conducted by the National Center for Health Statistics at the Centers for Disease Control. 
Recommended Reading Four Example Strengths & Limitations: ✔️ PRICSSA ✔️ Annual and rotating core questionnaires allow trend analysis ❌ High rate of missingness for family income questions ❌ 2019 redesign interviews only one adult and one child (if present) per household Three Example Findings: In 2022, 14% of US seniors met both aerobic and muscle-strengthening physical activity guidelines. Adults living alone in 2021 reported feelings of depression more often than those living with others. Among US adults aged 18+ in 2022, 3% were categorized as having severe anxiety symptoms. Two Methodology Documents: 2022 Survey Description Wikipedia Entry One Haiku: # excellent health poor # wealth. "sup, doc?" bugs, daft bills, free # laughs best medicine Function Definitions Define a function to download, unzip, and import each comma-separated value file: nhis_csv_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) this_csv <- grep( '\\\\.csv$' , unzipped_files , value = TRUE ) this_df <- read.csv( this_csv ) file.remove( c( this_tf , unzipped_files ) ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the sample adult interview and imputed income files: nhis_df <- nhis_csv_import( "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adult21csv.zip" ) imputed_income_df <- nhis_csv_import( "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adultinc21csv.zip" ) Save Locally   Save the object at any point: # nhis_fn <- file.path( path.expand( "~" ) , "NHIS" , "this_file.rds" ) # saveRDS( nhis_df , file = nhis_fn , compress = FALSE ) Load the same object: # nhis_df <- readRDS( nhis_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Reshape the imputed income data.frame into a list based on the implicate number: imputed_income_list <- split( imputed_income_df , imputed_income_df[ , 'impnum_a' ] ) Remove overlapping columns except the merge variable: variables_to_remove <- setdiff( intersect( names( nhis_df ) , names( imputed_income_df ) ) , 'hhx' ) nhis_df <- nhis_df[ , !( names( nhis_df ) %in% variables_to_remove ) ] Merge each implicate onto the sample adult table: nhis_list <- lapply( imputed_income_list , function( w ){ this_df <- merge( nhis_df , w ) stopifnot( nrow( this_df ) == nrow( nhis_df ) ) this_df } ) Define the design: library(survey) library(mitools) nhis_design <- svydesign( id = ~ ppsu , strata = ~ pstrat , nest = TRUE , weights = ~ wtfa_a , data = imputationList( nhis_list ) ) Variable Recoding Add new columns to the data set: nhis_design <- update( nhis_design , one = 1 , poverty_category = factor( findInterval( povrattc_a , c( 1 , 2 , 4 ) ) , labels = c( "below poverty" , "100-199%" , "200-399%" , "400%+" ) ) , fair_or_poor_reported_health = ifelse( phstat_a %in% 1:5 , as.numeric( phstat_a >= 4 ) , NA ) , sex_a = factor( sex_a , levels = 1:2 , labels = c( "male" , "female" ) ) , annual_premium_first_plan = ifelse( hicostr1_a > 40000 , NA , hicostr1_a ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( nhis_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( nhis_design , svyby( ~ one , ~ poverty_category , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: 
MIcombine( with( nhis_design , svytotal( ~ one ) ) ) MIcombine( with( nhis_design , svyby( ~ one , ~ poverty_category , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( nhis_design , svymean( ~ agep_a ) ) ) MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( nhis_design , svymean( ~ sex_a ) ) ) MIcombine( with( nhis_design , svyby( ~ sex_a , ~ poverty_category , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( nhis_design , svytotal( ~ agep_a ) ) ) MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( nhis_design , svytotal( ~ sex_a ) ) ) MIcombine( with( nhis_design , svyby( ~ sex_a , ~ poverty_category , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: MIcombine( with( nhis_design , svyquantile( ~ agep_a , 0.5 , se = TRUE ) ) ) MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: MIcombine( with( nhis_design , svyratio( numerator = ~ annual_premium_first_plan , denominator = ~ agep_a , na.rm = TRUE ) ) ) Subsetting Restrict the survey design to uninsured: sub_nhis_design <- subset( nhis_design , notcov_a == 1 ) Calculate the mean (average) of this subset: MIcombine( with( sub_nhis_design , svymean( ~ agep_a ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( nhis_design , svymean( ~ agep_a ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nhis_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: MIcombine( with( nhis_design , svyvar( ~ agep_a ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( nhis_design , svymean( ~ agep_a , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( nhis_design , svymean( ~ agep_a , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: # MIsvyciprop( ~ fair_or_poor_reported_health , nhis_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( agep_a ~ fair_or_poor_reported_health , nhis_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ fair_or_poor_reported_health + sex_a , nhis_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( nhis_design , svyglm( agep_a ~ fair_or_poor_reported_health + sex_a ) ) ) summary( glm_result ) Replication Example This example matches statistics and standard errors within 0.01% from Figure 3 of this Characteristics of Adults Aged 18–64 Who Did Not Take Medication as Prescribed to Reduce Costs Data Brief: results <- MIcombine( with( subset( nhis_design , agep_a < 65 ) , svyby( ~ as.numeric( rxsk12m_a == 1 | rxls12m_a == 1 | rxdl12m_a == 1 ) , ~ poverty_category , svymean , na.rm = TRUE ) ) ) stopifnot( all( as.numeric( round( coef( results ) , 3 ) ) == c( 0.145 , 0.138 , 0.099 , 0.039 ) ) ) stopifnot( all( as.numeric( round( SE( results ) , 5 ) ) - c( 0.0126 , 0.0098 , 0.0062 , 0.0031 ) < 0.0001 ) ) "],["national-household-travel-survey-nhts.html", "National Household Travel Survey (NHTS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Household Travel Survey (NHTS) The authoritative source on travel behavior, recording characteristics of people and vehicles of all modes. Four core linkable tables, with one record per household, person, trip, and vehicle, respectively. A complex sample survey designed to generalize to the civilian non-institutional U.S. population. Released every five to eight years since 1969. Funded by the Federal Highway Administration, with data collected by Ipsos Public Affairs. Recommended Reading Four Example Strengths & Limitations: ✔️ Origin-Destination passively collected data complement traditional household survey ✔️ Sample supports analysis of metro areas within census divisions ❌ 2022 redesign uses retrospective recorded travel day (1 day prior) rather than travel log ❌ Long-distance trip questions do not estimate respondent’s annual behavior or volume Three Example Findings: Online-purchased home deliveries grew over 2017-2022, from 2.5 to 5.4 per person per month. In 2022, 53% of K-12 students were dropped off at school in a private vehicle or drove themselves. Nearly 9 in 10 US households had a vehicle available to drive in 2022. Two Methodology Documents: 2022 NHTS Data User Guide 2022 NHTS Weighting Memo One Haiku: # commuter patterns, # truckin'. 
what a long strange trip # who went when where why Download, Import, Preparation Download and unzip the 2022 files: library(haven) tf <- tempfile() download.file( "https://nhts.ornl.gov/assets/2022/download/sas.zip" , tf , mode = 'wb' ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import the tables containing one record per household, person, trip, and vehicle: nhts_import <- function( this_prefix , this_unzip ){ this_sas7bdat <- grep( paste0( this_prefix , "\\\\.sas7bdat$" ) , this_unzip , value = TRUE ) this_tbl <- read_sas( this_sas7bdat ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) this_df } hhpub_df <- nhts_import( "hhv2pub" , unzipped_files ) perpub_df <- nhts_import( "perv2pub" , unzipped_files ) trippub_df <- nhts_import( "tripv2pub" , unzipped_files ) vehpub_df <- nhts_import( "vehv2pub" , unzipped_files ) Add a column of ones to three of those tables, then a column of non-missing mileage to the trips table: hhpub_df[ , 'one' ] <- 1 perpub_df[ , 'one' ] <- 1 trippub_df[ , 'one' ] <- 1 trippub_df[ !( trippub_df[ , 'trpmiles' ] %in% -9 ) , 'wtd_tripmiles_no_nines' ] <- trippub_df[ !( trippub_df[ , 'trpmiles' ] %in% -9 ) , 'trpmiles' ] * trippub_df[ !( trippub_df[ , 'trpmiles' ] %in% -9 ) , 'wttrdfin' ] Sum the total trip count and mileage to the person-level, both overall and restricted to walking only: trips_per_person <- with( trippub_df , aggregate( cbind( wttrdfin , wtd_tripmiles_no_nines ) , list( houseid , personid ) , sum , na.rm = TRUE ) ) names( trips_per_person ) <- c( 'houseid' , 'personid' , 'wtd_trips' , 'wtd_miles' ) walks_per_person <- with( subset( trippub_df , trptrans == '20' ) , aggregate( cbind( wttrdfin , wtd_tripmiles_no_nines ) , list( houseid , personid ) , sum , na.rm = TRUE ) ) names( walks_per_person ) <- c( 'houseid' , 'personid' , 'wtd_walks' , 'wtd_walk_miles' ) Merge these trip count and mileage values onto the person-level file, replacing non-matches with zero: nhts_df <- merge( perpub_df , trips_per_person , all.x = TRUE ) nhts_df <- merge( nhts_df , walks_per_person , all.x = TRUE ) for( this_variable in c( 'wtd_trips' , 'wtd_miles' , 'wtd_walks' , 'wtd_walk_miles' ) ){ nhts_df[ is.na( nhts_df[ , this_variable ] ) , this_variable ] <- 0 } stopifnot( nrow( nhts_df ) == nrow( perpub_df ) ) Save Locally   Save the object at any point: # nhts_fn <- file.path( path.expand( "~" ) , "NHTS" , "this_file.rds" ) # saveRDS( nhts_df , file = nhts_fn , compress = FALSE ) Load the same object: # nhts_df <- readRDS( nhts_fn ) Survey Design Definition Construct a complex sample survey design: Define household-level, person-level, and trip-level designs: library(survey) hh_design <- svydesign( id = ~ houseid , strata = ~ stratumid , data = hhpub_df , weights = ~ wthhfin ) nhts_design <- svydesign( id = ~ houseid , strata = ~ stratumid , data = nhts_df , weights = ~ wtperfin ) trip_design <- svydesign( id = ~ houseid , strata = ~ stratumid , data = trippub_df , weights = ~ wttrdfin ) Variable Recoding Add new columns to the data set: hh_design <- update( hh_design , hhsize_categories = factor( findInterval( hhsize , 1:4 ) , levels = 1:4 , labels = c( 1:3 , '4 or more' ) ) ) nhts_design <- update( nhts_design , urban_area = as.numeric( urbrur == '01' ) , daily_person_trips = ( wtd_trips / ( 365 * wtperfin ) ) , daily_person_miles_of_travel = ( wtd_miles / ( 365 * wtperfin ) ) , daily_person_walks = ( wtd_walks / ( 365 * wtperfin ) ) , daily_person_walk_miles_of_travel = ( wtd_walk_miles / ( 365 * wtperfin ) ) , 
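# note: each daily_person_* variable above divides an annualized weighted sum by ( 365 * wtperfin ), converting the weighted annual trip and mileage totals into average daily rates per person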
work_status = factor( as.numeric( worker ) , levels = 2:1 , labels = c( 'non-worker' , 'worker' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nhts_design , "sampling" ) != 0 ) svyby( ~ one , ~ r_sex_imp , nhts_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nhts_design ) svyby( ~ one , ~ r_sex_imp , nhts_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ daily_person_walks , nhts_design ) svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ work_status , nhts_design , na.rm = TRUE ) svyby( ~ work_status , ~ r_sex_imp , nhts_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ daily_person_walks , nhts_design ) svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ work_status , nhts_design , na.rm = TRUE ) svyby( ~ work_status , ~ r_sex_imp , nhts_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ daily_person_walks , nhts_design , 0.5 ) svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ daily_person_walk_miles_of_travel , denominator = ~ daily_person_miles_of_travel , nhts_design ) Subsetting Restrict the survey design to individuals who have used a bicycle in last 30 days: sub_nhts_design <- subset( nhts_design , last30_bike == '01' ) Calculate the mean (average) of this subset: svymean( ~ daily_person_walks , sub_nhts_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ daily_person_walks , nhts_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nhts_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ daily_person_walks , nhts_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ daily_person_walks , nhts_design , deff = TRUE ) # SRS with replacement svymean( ~ daily_person_walks , nhts_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ urban_area , nhts_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( daily_person_walks ~ urban_area , nhts_design ) Perform a chi-squared test of association for survey data: svychisq( ~ urban_area + work_status , nhts_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( daily_person_walks ~ urban_area + work_status , nhts_design ) summary( glm_result ) Replication Example This example matches the 2022 Household Size counts from Table 2-1: hhsize_counts <- svytotal( ~ hhsize_categories , hh_design ) stopifnot( all( round( coef( hhsize_counts ) / 1000 , 0 ) == c( 36409 , 44751 , 19001 , 27384 ) ) ) hhsize_ci <- confint( hhsize_counts ) hhsize_moe <- hhsize_ci[ , 2 ] - coef( hhsize_counts ) stopifnot( all( round( hhsize_moe / 1000 , 0 ) == c( 1807 , 1760 , 1448 , 1742 ) ) ) This example matches the 2022 Average Daily Person Trips per Person from Table 2-9: this_mean <- svymean( ~ daily_person_trips , nhts_design ) stopifnot( round( coef( this_mean ) , 2 ) == 2.28 ) this_ci <- confint( this_mean ) this_moe <- this_ci[ , 2 ] - coef( this_mean ) stopifnot( round( this_moe , 2 ) == 0.06 ) This example matches the 2022 Average Daily PMT per Person from Table 2-9: this_mean <- svymean( ~ daily_person_miles_of_travel , nhts_design ) stopifnot( round( coef( this_mean ) , 2 ) == 28.55 ) this_ci <- confint( this_mean ) this_moe <- this_ci[ , 2 ] - coef( this_mean ) stopifnot( round( this_moe , 2 ) == 2.39 ) This example matches the 2022 Average Person Trip Length (Miles) from Table 2-9: this_mean <- svymean( ~ trpmiles , subset( trip_design , trpmiles >= 0 ) ) stopifnot( round( coef( this_mean ) , 2 ) == 12.56 ) this_ci <- confint( this_mean ) this_moe <- this_ci[ , 2 ] - coef( this_mean ) stopifnot( round( this_moe , 2 ) == 1.04 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NHTS users, this code replicates previously-presented examples: library(srvyr) nhts_srvyr_design <- as_survey( nhts_design ) Calculate the mean (average) of a linear variable, overall and by groups: nhts_srvyr_design %>% summarize( mean = survey_mean( daily_person_walks ) ) nhts_srvyr_design %>% group_by( r_sex_imp ) %>% summarize( mean = survey_mean( daily_person_walks ) ) "],["national-immunization-survey-nis.html", "National Immunization Survey (NIS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Immunization Survey (NIS) The vaccination coverage rate tracker for national, state, and selected local areas. One table with one row per sampled toddler. A complex sample survey designed to generalize to children aged 19-35 months in the United States. Released annually since 1995, plus an adolescent (13-17 years) sample since 2008. Administered by the Centers for Disease Control and Prevention. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Both parents and medical providers interviewed ✔️ Detailed health insurance questions ❌ Low household response rates and only half-completed provider data during 2019–2023 ❌ Although national estimates are precise, estimates for state and local areas should be interpreted with caution because their sample sizes are smaller and their confidence intervals wider than national estimates Three Example Findings: In 2014 in the general population in Ohio, vaccination coverage with at least one dose or at least two doses of MMR among young children and adolescents was 96% and 88%, respectively. Completion of a 7-vaccine series by 19 months of age increased from 52% in 2011 to 59% in 2021. HPV vaccination initiation by age 13 rose from 27% to 70% among those born in 1999 versus 2009. Two Methodology Documents: About NIS National Immunization Survey-Child: A User’s Guide for the 2023 Public-Use Data File One Haiku: # i hear babies cry # protesting lungs of iron # a wonderful world Download, Import, Preparation Download the fixed-width file: dat_tf <- tempfile() dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.DAT" download.file( dat_url , dat_tf , mode = 'wb' ) Edit then execute the import script provided by the CDC: library(Hmisc) r_tf <- tempfile() r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.R" r_input_lines <- readLines( r_script_url ) # do not let the script do the save() r_input_lines <- gsub( "^save\\\\(" , "# save(" , r_input_lines ) # redirect the path to the flat file to the local save location of `dat_tf` r_input_lines <- gsub( '\\\\"path\\\\-to\\\\-file\\\\/(.*)\\\\.DAT\\\\"' , "dat_tf" , r_input_lines ) # save the edited script locally writeLines( r_input_lines , r_tf ) # run the edited script source( r_tf , echo = TRUE ) # rename the resultant data.frame object nis_df <- NISPUF21 names( nis_df ) <- tolower( names( nis_df ) ) nis_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nis_fn <- file.path( path.expand( "~" ) , "NIS" , "this_file.rds" ) # saveRDS( nis_df , file = nis_fn , compress = FALSE ) Load the same object: # nis_df <- readRDS( nis_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) nis_design <- svydesign( id = ~ seqnumhh , strata = ~ stratum , weights = ~ provwt_c , data = subset( nis_df , provwt_c > 0 ) ) Variable Recoding Add new columns to the data set: nis_design <- update( nis_design , first_fed_formula = ifelse( bf_formr20 %in% 888 , NA , bf_formr20 ) , dtap_3p = as.numeric( ( p_numdah >= 3 ) | ( p_numdhi >= 3 ) | ( p_numdih >= 3 ) | ( p_numdta >= 3 ) | ( p_numdtp >= 3 ) ) , dtap_4p = as.numeric( ( p_numdah >= 4 ) | ( p_numdhi >= 4 ) | ( p_numdih >= 4 ) | ( p_numdta >= 4 ) | ( p_numdtp >= 4 ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nis_design , "sampling" ) != 0 ) svyby( ~ one , ~ state , nis_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nis_design ) svyby( ~ one , ~ state , nis_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ first_fed_formula , nis_design , na.rm = TRUE ) svyby( ~ first_fed_formula , ~ state , nis_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, 
overall and by groups: svymean( ~ sex , nis_design , na.rm = TRUE ) svyby( ~ sex , ~ state , nis_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ first_fed_formula , nis_design , na.rm = TRUE ) svyby( ~ first_fed_formula , ~ state , nis_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ sex , nis_design , na.rm = TRUE ) svyby( ~ sex , ~ state , nis_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ first_fed_formula , nis_design , 0.5 , na.rm = TRUE ) svyby( ~ first_fed_formula , ~ state , nis_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ bf_exclr06 , denominator = ~ bf_endr06 , nis_design , na.rm = TRUE ) Subsetting Restrict the survey design to toddlers up to date on polio shots: sub_nis_design <- subset( nis_design , p_utdpol == 1 ) Calculate the mean (average) of this subset: svymean( ~ first_fed_formula , sub_nis_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ first_fed_formula , nis_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ first_fed_formula , ~ state , nis_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nis_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ first_fed_formula , nis_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ first_fed_formula , nis_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ first_fed_formula , nis_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ dtap_3p , nis_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( first_fed_formula ~ dtap_3p , nis_design ) Perform a chi-squared test of association for survey data: svychisq( ~ dtap_3p + sex , nis_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( first_fed_formula ~ dtap_3p + sex , nis_design ) summary( glm_result ) Replication Example This example matches the statistics and standard errors from Data User’s Guide Table 4: results <- svyby( ~ p_utd431h314_rout_s , ~ raceethk , nis_design , svymean ) coefficients <- results[ , "p_utd431h314_rout_sUTD" , drop = FALSE ] standard_errors <- results[ , "se.p_utd431h314_rout_sUTD" , drop = FALSE ] stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .711 ) stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .742 ) stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .647 ) stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .015 ) stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .009 ) stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .022 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NIS users, this code replicates previously-presented examples: library(srvyr) nis_srvyr_design <- as_survey( nis_design ) Calculate the mean (average) of a linear variable, overall and by groups: nis_srvyr_design %>% summarize( mean = survey_mean( first_fed_formula , na.rm = TRUE ) ) nis_srvyr_design %>% group_by( state ) %>% summarize( mean = survey_mean( first_fed_formula , na.rm = TRUE ) ) "],["national-plan-and-provider-enumeration-system-nppes.html", "National Plan and Provider Enumeration System (NPPES) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " National Plan and Provider Enumeration System (NPPES) The registry of every medical practitioner actively operating in the United States healthcare industry. A single large table with one row per enumerated health care provider. A census of individuals and organizations that bill for medical services in the United States. Updated weekly with new providers. Maintained by the United States Centers for Medicare & Medicaid Services (CMS) Recommended Reading Two Methodology Documents: NPI: What You Need To Know Wikipedia Entry One Haiku: # how many doctors # ranked sergeant, last name pepper # practice in the states? 
Download, Import, Preparation Download and import the national file: library(readr) tf <- tempfile() npi_datapage <- readLines( "http://download.cms.gov/nppes/NPI_Files.html" ) latest_files <- grep( 'NPPES_Data_Dissemination_' , npi_datapage , value = TRUE ) latest_files <- latest_files[ !grepl( 'Weekly Update' , latest_files ) ] this_url <- paste0( "http://download.cms.gov/nppes/", gsub( "(.*)(NPPES_Data_Dissemination_.*\\\\.zip)(.*)$", "\\\\2", latest_files ) ) download.file( this_url , tf , mode = 'wb' ) npi_files <- unzip( tf , exdir = tempdir() ) npi_filepath <- grep( "npidata_pfile_20050523-([0-9]+)\\\\.csv" , npi_files , value = TRUE ) column_names <- names( read.csv( npi_filepath , nrow = 1 )[ FALSE , , ] ) column_names <- gsub( "\\\\." , "_" , tolower( column_names ) ) column_types <- ifelse( grepl( "code" , column_names ) & !grepl( "country|state|gender|taxonomy|postal" , column_names ) , 'n' , 'c' ) columns_to_import <- c( "entity_type_code" , "provider_gender_code" , "provider_enumeration_date" , "is_sole_proprietor" , "provider_business_practice_location_address_state_name" ) stopifnot( all( columns_to_import %in% column_names ) ) # readr::read_csv() columns must match their order in the csv file columns_to_import <- columns_to_import[ order( match( columns_to_import , column_names ) ) ] nppes_tbl <- readr::read_csv( npi_filepath , col_names = columns_to_import , col_types = paste0( ifelse( column_names %in% columns_to_import , column_types , '_' ) , collapse = "" ) , skip = 1 ) nppes_df <- data.frame( nppes_tbl ) Save Locally   Save the object at any point: # nppes_fn <- file.path( path.expand( "~" ) , "NPPES" , "this_file.rds" ) # saveRDS( nppes_df , file = nppes_fn , compress = FALSE ) Load the same object: # nppes_df <- readRDS( nppes_fn ) Variable Recoding Add new columns to the data set: nppes_df <- transform( nppes_df , individual = as.numeric( entity_type_code ) , provider_enumeration_year = as.numeric( substr( provider_enumeration_date , 7 , 10 ) ) , state_name = provider_business_practice_location_address_state_name ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( nppes_df ) table( nppes_df[ , "provider_gender_code" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( nppes_df[ , "is_sole_proprietor" ] ) ) prop.table( table( nppes_df[ , c( "is_sole_proprietor" , "provider_gender_code" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( nppes_df[ , "provider_enumeration_year" ] , 0.5 , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to California: sub_nppes_df <- subset( nppes_df , state_name == 'CA' ) Calculate the mean (average) of this subset: mean( sub_nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) Measures of 
Uncertainty Calculate the variance, overall and by groups: var( nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( provider_enumeration_year ~ individual , nppes_df ) Perform a chi-squared test of association: this_table <- table( nppes_df[ , c( "individual" , "is_sole_proprietor" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( provider_enumeration_year ~ individual + is_sole_proprietor , data = nppes_df ) summary( glm_result ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for NPPES users, this code replicates previously-presented examples: library(dplyr) nppes_tbl <- as_tibble( nppes_df ) Calculate the mean (average) of a linear variable, overall and by groups: nppes_tbl %>% summarize( mean = mean( provider_enumeration_year , na.rm = TRUE ) ) nppes_tbl %>% group_by( provider_gender_code ) %>% summarize( mean = mean( provider_enumeration_year , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for NPPES users, this code replicates previously-presented examples: library(data.table) nppes_dt <- data.table( nppes_df ) Calculate the mean (average) of a linear variable, overall and by groups: nppes_dt[ , mean( provider_enumeration_year , na.rm = TRUE ) ] nppes_dt[ , mean( provider_enumeration_year , na.rm = TRUE ) , by = provider_gender_code ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for NPPES users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'nppes' , nppes_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( provider_enumeration_year ) FROM nppes' ) dbGetQuery( con , 'SELECT provider_gender_code , AVG( provider_enumeration_year ) FROM nppes GROUP BY provider_gender_code' ) "],["national-survey-of-childrens-health-nsch.html", "National Survey of Children’s Health (NSCH) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " National Survey of Children’s Health (NSCH) Estimates of children’s health care and family environments to inform reports like Healthy People 2030. One screener table with one row per eligible child (1+ rows per household), one topical table with the sampled child (only one row per household) from three stacked age-specific questionnaires. 
A complex sample survey designed to generalize to non-institutionalized U.S. children under 18. Released every four or five years since 2003, annually since 2016. Sponsored by the Maternal and Child Health Bureau, Health Resources and Services Administration. Please skim before you begin: 2021 National Survey of Children’s Health Methodology Report 2021 National Survey of Children’s Health Data Users Frequently Asked Questions (FAQs) A haiku regarding this microdata: # "age but a number" # lied babe from crib. "your nose grows" # cried gramps changing bib Function Definitions Define a function to download, unzip, and import each Stata file: library(haven) nsch_stata_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) this_stata <- grep( '\\\\.dta$' , unzipped_files , value = TRUE ) this_tbl <- read_stata( this_stata ) this_df <- data.frame( this_tbl ) file.remove( c( this_tf , unzipped_files ) ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the screener and topical files: nsch_screener_url <- "https://www2.census.gov/programs-surveys/nsch/datasets/2021/nsch_2021_screener_Stata.zip" nsch_topical_url <- "https://www2.census.gov/programs-surveys/nsch/datasets/2021/nsch_2021_topical_Stata.zip" nsch_screener_df <- nsch_stata_import( nsch_screener_url ) nsch_df <- nsch_stata_import( nsch_topical_url ) Save Locally   Save the object at any point: # nsch_fn <- file.path( path.expand( "~" ) , "NSCH" , "this_file.rds" ) # saveRDS( nsch_df , file = nsch_fn , compress = FALSE ) Load the same object: # nsch_df <- readRDS( nsch_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Remove the fpl columns from the main data.frame: fpl_columns <- grep( '^fpl_i[0-9]' , names( nsch_df ) , value = TRUE ) fpl_wide_df <- nsch_df[ c( 'hhid' , fpl_columns ) ] nsch_df[ fpl_columns ] <- NULL Reshape the fpl columns from wide to long: fpl_long_df <- reshape( fpl_wide_df , varying = list( fpl_columns ) , direction = 'long' , timevar = 'implicate' , idvar = 'hhid' ) names( fpl_long_df )[ ncol( fpl_long_df ) ] <- 'fpl' Merge the fpl table with multiple records per child onto the main table: nsch_long_df <- merge( nsch_df , fpl_long_df ) stopifnot( nrow( nsch_long_df ) == nrow( fpl_long_df ) ) stopifnot( nrow( nsch_long_df ) / length( fpl_columns ) == nrow( nsch_df ) ) Reshape the merged data.frame into a list based on the implicate number: nsch_list <- split( nsch_long_df , nsch_long_df[ , 'implicate' ] ) Define the design: library(survey) library(mitools) nsch_design <- svydesign( id = ~ 1 , strata = ~ fipsst + stratum , weights = ~ fwc , data = imputationList( nsch_list ) , nest = TRUE ) Variable Recoding Add new columns to the data set: nsch_design <- update( nsch_design , one = 1 , state_name = factor( fipsst , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", 
"Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming") ) , overall_health = factor( c( 1 , 1 , 2 , 3 , 3 )[ k2q01 ] , levels = 1:3 , labels = c( 'excellent or very good' , 'good' , 'fair or poor' ) ) , poverty_categories = factor( 1 + findInterval( fpl , c( 100 , 200 , 400 ) ) , labels = c( "below poverty" , "100-199% fpl" , "200-399% fpl" , "400%+ fpl" ) ) , under_six_ever_breastfed = as.numeric( k6q40 == 1 ) , sc_sex = factor( ifelse( sc_sex %in% 1:2 , sc_sex , NA ) , labels = c( "male" , "female" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( nsch_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( nsch_design , svyby( ~ one , ~ state_name , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: MIcombine( with( nsch_design , svytotal( ~ one ) ) ) MIcombine( with( nsch_design , svyby( ~ one , ~ state_name , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( nsch_design , svymean( ~ sc_age_years ) ) ) MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( nsch_design , svymean( ~ poverty_categories ) ) ) MIcombine( with( nsch_design , svyby( ~ poverty_categories , ~ state_name , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( nsch_design , svytotal( ~ sc_age_years ) ) ) MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( nsch_design , svytotal( ~ poverty_categories ) ) ) MIcombine( with( nsch_design , svyby( ~ poverty_categories , ~ state_name , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: MIcombine( with( nsch_design , svyquantile( ~ sc_age_years , 0.5 , se = TRUE ) ) ) MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: MIcombine( with( nsch_design , svyratio( numerator = ~ liveusa_yr , denominator = ~ sc_age_years , na.rm = TRUE ) ) ) Subsetting Restrict the survey design to only children: sub_nsch_design <- subset( nsch_design , agepos4 == 1 ) Calculate the mean (average) of this subset: MIcombine( with( sub_nsch_design , svymean( ~ sc_age_years ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( nsch_design , svymean( ~ sc_age_years ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nsch_design$designs[[1]] ) Calculate the complex 
sample survey-adjusted variance of any statistic: MIcombine( with( nsch_design , svyvar( ~ sc_age_years ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( nsch_design , svymean( ~ sc_age_years , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( nsch_design , svymean( ~ sc_age_years , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ under_six_ever_breastfed , nsch_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( sc_age_years ~ under_six_ever_breastfed , nsch_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ under_six_ever_breastfed + poverty_categories , nsch_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( nsch_design , svyglm( sc_age_years ~ under_six_ever_breastfed + poverty_categories ) ) ) summary( glm_result ) Replication Example As noted in the bold red footnotes of their published table, this technique is not correct and should not be used. The technical documents recommend a method matching the MIcombine syntax shown above. Nonetheless, this code matches statistics and confidence intervals within 0.5% from the Excellent or very good column of Indicator 1.1: In general, how would you describe this child’s health?: results <- svyby( ~ as.numeric( overall_health == 'excellent or very good' ) , ~ poverty_categories , nsch_design$designs[[1]] , svymean , na.rm = TRUE ) published_proportions <- c( 0.833 , 0.859 , 0.907 , 0.955 ) published_lb <- c( 0.810 , 0.838 , 0.894 , 0.949 ) published_ub <- c( 0.854 , 0.878 , 0.919 , 0.961 ) stopifnot( all( abs( round( coef( results ) , 3 ) - published_proportions ) < 0.005 ) ) ( ci_results <- confint( results ) ) stopifnot( all( abs( ci_results[ , 1 ] - published_lb ) < 0.005 ) ) stopifnot( all( abs( ci_results[ , 2 ] - published_ub ) < 0.005 ) ) "],["national-survey-on-drug-use-and-health-nsduh.html", "National Survey on Drug Use and Health (NSDUH) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Survey on Drug Use and Health (NSDUH) The primary survey to measure the prevalence of substance use and its correlates in the United States. One table with one row per sampled respondent. A complex survey designed to generalize to civilian, non-institutional americans aged 12 and older. Released periodically since 1979 and annually since 1990. Administered by the Substance Abuse and Mental Health Services Administration. Please skim before you begin: 2021 National Survey on Drug Use and Health (NSDUH): Public Use File Codebook 2021 National Survey on Drug Use and Health (NSDUH): Methodological Summary and Definitions A haiku regarding this microdata: # drinking and thinking # about your first time, were you # smoking and joking? 
Download, Import, Preparation Download and import the national file: zip_tf <- tempfile() zip_url <- paste0( "https://www.datafiles.samhsa.gov/sites/default/files/field-uploads-protected/" , "studies/NSDUH-2021/NSDUH-2021-datasets/NSDUH-2021-DS0001/" , "NSDUH-2021-DS0001-bundles-with-study-info/NSDUH-2021-DS0001-bndl-data-r_v3.zip" ) download.file( zip_url , zip_tf , mode = 'wb' ) nsduh_rdata <- unzip( zip_tf , exdir = tempdir() ) nsduh_rdata_contents <- load( nsduh_rdata ) nsduh_df_name <- grep( 'PUF' , nsduh_rdata_contents , value = TRUE ) nsduh_df <- get( nsduh_df_name ) names( nsduh_df ) <- tolower( names( nsduh_df ) ) nsduh_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nsduh_fn <- file.path( path.expand( "~" ) , "NSDUH" , "this_file.rds" ) # saveRDS( nsduh_df , file = nsduh_fn , compress = FALSE ) Load the same object: # nsduh_df <- readRDS( nsduh_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nsduh_design <- svydesign( id = ~ verep , strata = ~ vestr_c , data = nsduh_df , weights = ~ analwt_c , nest = TRUE ) Variable Recoding Add new columns to the data set: nsduh_design <- update( nsduh_design , one = 1 , health = factor( health , levels = 1:5 , labels = c( "excellent" , "very good" , "good" , "fair" , "poor" ) ) , age_first_cigarette = ifelse( cigtry > 99 , NA , cigtry ) , age_tried_cocaine = ifelse( cocage > 99 , NA , cocage ) , ever_used_marijuana = as.numeric( ifelse( mjever < 4 , mjever == 1 , NA ) ) , county_type = factor( coutyp4 , levels = 1:3 , labels = c( "large metro" , "small metro" , "nonmetro" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nsduh_design , "sampling" ) != 0 ) svyby( ~ one , ~ county_type , nsduh_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nsduh_design ) svyby( ~ one , ~ county_type , nsduh_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ health , nsduh_design , na.rm = TRUE ) svyby( ~ health , ~ county_type , nsduh_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ health , nsduh_design , na.rm = TRUE ) svyby( ~ health , ~ county_type , nsduh_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ age_first_cigarette , nsduh_design , 0.5 , na.rm = TRUE ) svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ age_first_cigarette , denominator = ~ age_tried_cocaine , nsduh_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals who are pregnant: sub_nsduh_design <- subset( nsduh_design , preg == 1 ) Calculate the mean (average) of this subset: svymean( ~ age_first_cigarette , sub_nsduh_design , na.rm = TRUE ) 
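Subsets can also combine several conditions. As a minimal sketch (not part of the original workflow; the object name sub_nsduh_design_2 is illustrative), restrict the design to pregnant respondents in nonmetro counties using the preg flag and the county_type recode above, then confirm the unweighted interview count behind the estimate before interpreting it: 
# illustrative compound subset; small subsets deserve a sample size check sub_nsduh_design_2 <- subset( nsduh_design , preg == 1 & county_type == 'nonmetro' ) svyby( ~ one , ~ one , sub_nsduh_design_2 , unwtd.count ) svymean( ~ age_first_cigarette , sub_nsduh_design_2 , na.rm = TRUE ) 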
Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nsduh_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ ever_used_marijuana , nsduh_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( age_first_cigarette ~ ever_used_marijuana , nsduh_design ) Perform a chi-squared test of association for survey data: svychisq( ~ ever_used_marijuana + health , nsduh_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( age_first_cigarette ~ ever_used_marijuana + health , nsduh_design ) summary( glm_result ) Replication Example This matches the prevalence and SE of alcohol use in the past month from Codebook Table G.2: result <- svymean( ~ alcmon , nsduh_design ) stopifnot( round( coef( result ) , 3 ) == 0.474 ) stopifnot( round( SE( result ) , 4 ) == 0.0043 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NSDUH users, this code replicates previously-presented examples: library(srvyr) nsduh_srvyr_design <- as_survey( nsduh_design ) Calculate the mean (average) of a linear variable, overall and by groups: nsduh_srvyr_design %>% summarize( mean = survey_mean( age_first_cigarette , na.rm = TRUE ) ) nsduh_srvyr_design %>% group_by( county_type ) %>% summarize( mean = survey_mean( age_first_cigarette , na.rm = TRUE ) ) "],["national-survey-of-family-growth-nsfg.html", "National Survey of Family Growth (NSFG) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Survey of Family Growth (NSFG) The principal survey to measure reproductive behavior in the United States population. Multiple tables with one row per respondent for the female and male tables, then a separate table with one row per pregnancy. A complex sample survey designed to generalize to the 15-49 year old population of the United States, by gender. Released every couple of years since 1973. Administered by the Centers for Disease Control and Prevention. 
Please skim before you begin: Sample Design Documentation Wikipedia Entry A haiku regarding this microdata: # family structure # questions cuz radar fails at # storks with bassinets Download, Import, Preparation library(SAScii) library(readr) dat_url <- "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NSFG/2017_2019_FemRespData.dat" sas_url <- file.path( dirname( dat_url ) , "sas/2017_2019_FemRespSetup.sas" ) sas_positions <- parse.SAScii( sas_url ) sas_positions[ , 'varname' ] <- tolower( sas_positions[ , 'varname' ] ) sas_positions[ , 'column_types' ] <- ifelse( sas_positions[ , 'char' ] , "c" , "d" ) nsfg_tbl <- read_fwf( dat_url , fwf_widths( abs( sas_positions[ , 'width' ] ) , col_names = sas_positions[ , 'varname' ] ) , col_types = paste0( sas_positions[ , 'column_types' ] , collapse = "" ) , na = c( "" , "." ) ) nsfg_df <- data.frame( nsfg_tbl ) Save Locally   Save the object at any point: # nsfg_fn <- file.path( path.expand( "~" ) , "NSFG" , "this_file.rds" ) # saveRDS( nsfg_df , file = nsfg_fn , compress = FALSE ) Load the same object: # nsfg_df <- readRDS( nsfg_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) nsfg_design <- svydesign( id = ~ secu , strata = ~ sest , data = nsfg_df , weights = ~ wgt2017_2019 , nest = TRUE ) Variable Recoding Add new columns to the data set: nsfg_design <- update( nsfg_design , one = 1 , birth_control_pill = as.numeric( constat1 == 6 ) , age_categories = factor( findInterval( ager , c( 15 , 20 , 25 , 30 , 35 , 40 ) ) , labels = c( '15-19' , '20-24' , '25-29' , '30-34' , '35-39' , '40-49' ) ) , marstat = factor( marstat , levels = c( 1:6 , 8:9 ) , labels = c( "Married to a person of the opposite sex" , "Not married but living together with a partner of the opposite sex" , "Widowed" , "Divorced or annulled" , "Separated, because you and your spouse are not getting along" , "Never been married" , "Refused" , "Don't know" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nsfg_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_categories , nsfg_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nsfg_design ) svyby( ~ one , ~ age_categories , nsfg_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ pregnum , nsfg_design , na.rm = TRUE ) svyby( ~ pregnum , ~ age_categories , nsfg_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ marstat , nsfg_design ) svyby( ~ marstat , ~ age_categories , nsfg_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ pregnum , nsfg_design , na.rm = TRUE ) svyby( ~ pregnum , ~ age_categories , nsfg_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ marstat , nsfg_design ) svyby( ~ marstat , ~ age_categories , nsfg_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ pregnum , nsfg_design , 0.5 , na.rm = TRUE ) svyby( ~ pregnum , ~ age_categories , nsfg_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ pregnum , denominator = ~ lbpregs , nsfg_design , na.rm = TRUE ) Subsetting 
Restrict the survey design to ever cohabited: sub_nsfg_design <- subset( nsfg_design , timescoh > 0 ) Calculate the mean (average) of this subset: svymean( ~ pregnum , sub_nsfg_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ pregnum , nsfg_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ pregnum , ~ age_categories , nsfg_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nsfg_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ pregnum , nsfg_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ pregnum , nsfg_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ pregnum , nsfg_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ birth_control_pill , nsfg_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( pregnum ~ birth_control_pill , nsfg_design ) Perform a chi-squared test of association for survey data: svychisq( ~ birth_control_pill + marstat , nsfg_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( pregnum ~ birth_control_pill + marstat , nsfg_design ) summary( glm_result ) Replication Example This example matches the Variance Estimates for Percentages using SAS (9.4) and STATA (14): Match the sum of the weights: result <- svytotal( ~ one , nsfg_design ) stopifnot( round( coef( result ) , 0 ) == 72671926 ) stopifnot( round( SE( result ) , 0 ) == 3521465 ) Match row percentages of women currently using the pill by age: row_percents <- c( 19.5112 , 23.7833 , 19.6916 , 15.2800 , 6.4965 , 6.5215 ) std_err_row_percents <- c( 1.8670 , 2.1713 , 2.2773 , 1.7551 , 0.9895 , 1.0029 ) results <- svyby( ~ birth_control_pill , ~ age_categories , nsfg_design , svymean ) stopifnot( all( round( coef( results ) * 100 , 4 ) == row_percents ) ) stopifnot( all( round( SE( results ) * 100 , 4 ) == std_err_row_percents ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for NSFG users, this code replicates previously-presented examples: library(srvyr) nsfg_srvyr_design <- as_survey( nsfg_design ) Calculate the mean (average) of a linear variable, overall and by groups: nsfg_srvyr_design %>% summarize( mean = survey_mean( pregnum , na.rm = TRUE ) ) nsfg_srvyr_design %>% group_by( age_categories ) %>% summarize( mean = survey_mean( pregnum , na.rm = TRUE ) ) "],["national-sample-survey-of-registered-nurses-nssrn.html", "National Sample Survey of Registered Nurses (NSSRN) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Sample Survey of Registered Nurses (NSSRN) The employment, education, and demographics of the RN, NP, and APRN workforce in the United States. One table with one row per surveyed registered nurse (RN) or nurse practitioner (NP). A complex sample designed to generalize to RNs and NPs at both state and national levels. Released every four to ten years since 1977. Administered by the Health Resources and Services Administration, in partnership with the Census Bureau. Please skim before you begin: 2022 NSSRN Methodology Report Frequently Asked Questions A haiku regarding this microdata: # florence nightingale # founder of modern nursing # a statistician Download, Import, Preparation Download and import the Stata file: library(haven) nssrn_tf <- tempfile() nssrn_url <- "https://data.hrsa.gov/DataDownload/NSSRN/GeneralPUF22/2022_NSSRN_PUF_Stata_Package.zip" download.file( nssrn_url , nssrn_tf , mode = 'wb' ) nssrn_files <- unzip( nssrn_tf , exdir = tempdir() ) nssrn_dta <- grep( "\\\\.dta$" , nssrn_files , ignore.case = TRUE , value = TRUE ) nssrn_tbl <- read_dta( nssrn_dta ) nssrn_df <- data.frame( nssrn_tbl ) names( nssrn_df ) <- tolower( names( nssrn_df ) ) nssrn_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nssrn_fn <- file.path( path.expand( "~" ) , "NSSRN" , "this_file.rds" ) # saveRDS( nssrn_df , file = nssrn_fn , compress = FALSE ) Load the same object: # nssrn_df <- readRDS( nssrn_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nssrn_design <- svrepdesign( weight = ~rkrnwgta , repweights = 'rkrnwgta[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = nssrn_df ) Variable Recoding Add new columns to the data set: nssrn_design <- update( nssrn_design , # all advanced practice registered nurses # (including nurse practitioners) all_aprn = as.numeric( ed_lcrn == 2 ) , age_group = factor( findInterval( age_gp_puf , c( 0 , 3 , 5 , 7 , 9 ) ) , levels = 1:5 , labels = c( '34 or younger' , '35 to 44' , '45 to 54' , '55 to 64' , '65 or older' ) ) , primary_position_state = factor( as.numeric( pn_loc_code_puf ) , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L, 72L, # note collapsed geographies from codebook 500L, 800L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", 
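# fifty-four labels in total: the fifty states, the district of columbia, puerto rico, and two collapsed geography categories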
"North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico", # note collapsed geographies from codebook "District of Columbia & Delaware", "Montana & Wyoming") ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nssrn_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_group , nssrn_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nssrn_design ) svyby( ~ one , ~ age_group , nssrn_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ primary_position_state , nssrn_design , na.rm = TRUE ) svyby( ~ primary_position_state , ~ age_group , nssrn_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ primary_position_state , nssrn_design , na.rm = TRUE ) svyby( ~ primary_position_state , ~ age_group , nssrn_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ pn_earn_puf , nssrn_design , 0.5 , na.rm = TRUE ) svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ pn_earn_puf , denominator = ~ hrs_yr_puf , nssrn_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals working as RNs or APRNs (excluding RNs working as LPNs): sub_nssrn_design <- subset( nssrn_design , pn_lcreq_none == 2 ) Calculate the mean (average) of this subset: svymean( ~ pn_earn_puf , sub_nssrn_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nssrn_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ all_aprn , nssrn_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( pn_earn_puf ~ all_aprn , nssrn_design ) Perform a chi-squared test of association for survey data: svychisq( ~ all_aprn + primary_position_state , nssrn_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( pn_earn_puf ~ all_aprn + primary_position_state , nssrn_design ) summary( glm_result ) Replication Example This example matches statistics and relative standard errors from the “Demographics” tab of Nursing Workforce 2022 NSSRN Dashboard Data: unwtd_count_result <- svyby( ~ one , ~ age_group , nssrn_design , unwtd.count ) # cells L398 thru L402 stopifnot( coef( unwtd_count_result ) == c( 6693 , 12268 , 10804 , 10538 , 8811 ) ) wtd_n_result <- svytotal( ~ age_group , nssrn_design ) # cells J398 thru J402 stopifnot( round( coef( wtd_n_result ) , 0 ) == c( 861060 , 1078187 , 935778 , 834939 , 639412 ) ) share_result <- svymean( ~ age_group , nssrn_design ) # cells K398 thru K402 stopifnot( round( coef( share_result ) , 3 ) == c( 0.198 , 0.248 , 0.215 , 0.192 , 0.147 ) ) # cells M398 thru M402 stopifnot( round( SE( share_result ) / coef( share_result ) , 4 ) == c( 0.0206 , 0.0155 , 0.0192 , 0.0187 , 0.0125 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NSSRN users, this code replicates previously-presented examples: library(srvyr) nssrn_srvyr_design <- as_survey( nssrn_design ) Calculate the mean (average) of a linear variable, overall and by groups: nssrn_srvyr_design %>% summarize( mean = survey_mean( pn_earn_puf , na.rm = TRUE ) ) nssrn_srvyr_design %>% group_by( age_group ) %>% summarize( mean = survey_mean( pn_earn_puf , na.rm = TRUE ) ) "],["new-york-city-housing-and-vacancy-survey-nychvs.html", "New York City Housing and Vacancy Survey (NYCHVS) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " New York City Housing and Vacancy Survey (NYCHVS) A city-wide assessment of the rental vacancy rate and other characteristics related to housing stock. One table with one record per occupied housing unit, a second table with one record per person inside each occupied housing unit, and a third table with one record per unoccupied housing unit. A complex sample designed to generalize to occupied & unoccupied housing in the five boroughs. Released more or less triennially since 1991. Funded by the NYC Dept. of Housing Preservation & Development, run by the Census Bureau. 
Please skim before you begin: Public Use File User Guide and Codebook Sample Design, Weighting, and Error Estimation A haiku regarding this microdata: # all i want is a # room somewhere / with clawfoot tub # and a frigidaire Function Definitions Define a function to download and import each comma-separated value file: nychvs_csv_import <- function( this_url ){ tf <- tempfile() download.file( this_url , tf , mode = 'wb' ) this_df <- read.csv( tf ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the all units, occupied units, person, and vacant units tables: datasets_url <- "https://www2.census.gov/programs-surveys/nychvs/datasets/2021/microdata/" all_units_df <- nychvs_csv_import( paste0( datasets_url , "allunits_puf_21.csv" ) ) occupied_units_df <- nychvs_csv_import( paste0( datasets_url , "occupied_puf_21.csv" ) ) person_df <- nychvs_csv_import( paste0( datasets_url , "person_puf_21.csv" ) ) vacant_units_df <- nychvs_csv_import( paste0( datasets_url , "vacant_puf_21.csv" ) ) stopifnot( nrow( all_units_df ) == nrow( occupied_units_df ) + nrow( vacant_units_df ) ) Merge the information stored in the all units table onto both the occupied and vacant unit tables, then merge the information (not related to weighting) from the occupied unit table onto the person table: before_nrow <- nrow( occupied_units_df ) occupied_units_df <- merge( all_units_df , occupied_units_df ) stopifnot( nrow( occupied_units_df ) == before_nrow ) before_nrow <- nrow( vacant_units_df ) vacant_units_df <- merge( all_units_df , vacant_units_df ) stopifnot( nrow( vacant_units_df ) == before_nrow ) before_nrow <- nrow( person_df ) weighting_variables <- grep( "^fw([0-9]+)?$" , names( occupied_units_df ) , value = TRUE ) person_df <- merge( occupied_units_df[ setdiff( names( occupied_units_df ) , weighting_variables ) ] , person_df ) stopifnot( nrow( person_df ) == before_nrow ) all_units_df[ , 'one' ] <- occupied_units_df[ , 'one' ] <- vacant_units_df[ , 'one' ] <- person_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nychvs_fn <- file.path( path.expand( "~" ) , "NYCHVS" , "this_file.rds" ) # saveRDS( nychvs_df , file = nychvs_fn , compress = FALSE ) Load the same object: # nychvs_df <- readRDS( nychvs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) all_units_design <- svrepdesign( weight = ~fw , repweights = 'fw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = all_units_df ) occupied_units_design <- svrepdesign( weight = ~fw , repweights = 'fw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = occupied_units_df ) vacant_units_design <- svrepdesign( weight = ~fw , repweights = 'fw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = vacant_units_df ) person_design <- svrepdesign( weight = ~pw , repweights = 'pw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = person_df ) nychvs_design <- occupied_units_design Variable Recoding Add new columns to the data set: nychvs_design <- update( nychvs_design , one = 1 , home_owners = as.numeric( tenure == 2 ) , yearly_household_income = hhinc_rec1 , rent_amount = ifelse( rent_amount == -2 , NA , rent_amount ) , borough = factor( boro , levels = 1:5 , labels = c( 'Bronx' , 'Brooklyn' , 'Manhattan' , 'Queens' , 'Staten Island' ) ) , food_insecurity = factor( foodinsecure , levels = 1:3 , labels = c( 'not insecure' , 
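# foodinsecure codes 1 through 3 map one-to-one onto these labels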
'insecure' , 'very insecure' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nychvs_design , "sampling" ) != 0 ) svyby( ~ one , ~ borough , nychvs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nychvs_design ) svyby( ~ one , ~ borough , nychvs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ food_insecurity , nychvs_design , na.rm = TRUE ) svyby( ~ food_insecurity , ~ borough , nychvs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ food_insecurity , nychvs_design , na.rm = TRUE ) svyby( ~ food_insecurity , ~ borough , nychvs_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ hhinc_rec1 , nychvs_design , 0.5 , na.rm = TRUE ) svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ rent_amount , denominator = ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) Subsetting Restrict the survey design to rent burdened units (more than 30% of income paid toward rent alone): sub_nychvs_design <- subset( nychvs_design , rentburden_cat %in% 1:2 ) Calculate the mean (average) of this subset: svymean( ~ hhinc_rec1 , sub_nychvs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nychvs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ home_owners , nychvs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( hhinc_rec1 ~ home_owners , nychvs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ home_owners + food_insecurity , nychvs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( hhinc_rec1 ~ home_owners + food_insecurity , nychvs_design ) summary( glm_result ) Replication Example This example matches the estimate and standard error of the number of occupied housing units across the five boroughs shown at minute 6:05: result <- svytotal( ~ one , nychvs_design ) stopifnot( round( coef( result ) , 0 ) == 3157105 ) stopifnot( round( SE( result ) , 0 ) == 13439 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NYCHVS users, this code replicates previously-presented examples: library(srvyr) nychvs_srvyr_design <- as_survey( nychvs_design ) Calculate the mean (average) of a linear variable, overall and by groups: nychvs_srvyr_design %>% summarize( mean = survey_mean( hhinc_rec1 , na.rm = TRUE ) ) nychvs_srvyr_design %>% group_by( borough ) %>% summarize( mean = survey_mean( hhinc_rec1 , na.rm = TRUE ) ) "],["pew-research-center-pew.html", "Pew Research Center (PEW) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Pew Research Center (PEW) Public opinion polling on U.S. Politics & Policy, Journalism & Media, Internet, Science & Tech, Religion & Public Life, Hispanic Trends, Global Attitudes & Trends, and Social & Demographic Trends. Generally one table per survey, with one row per sampled respondent. Complex samples generalizing to the noninstitutionalized adults in the nation(s) surveyed. Varying publication dates for both American Trends Panel surveys of the United States and also for International Surveys. National Public Opinion Reference Survey released annually since 2020. Administered by the Pew Research Center. Please skim before you begin: U.S. Surveys Country Specific Methodology, for example the 2022 Global Attitudes Survey A haiku regarding this microdata: # sock puppet pundit # throws 'ssue, cites pew-laced news, sighs # "unbutton your eyes!" Download, Import, Preparation Register for a Pew Research Center account at https://www.pewresearch.org/profile/registration/. DOWNLOAD THIS DATASET at https://www.pewresearch.org/global/dataset/spring-2022-survey-data/. 
Download the SPSS dataset Pew-Research-Center-Global-Attitudes-Spring-2022-Survey-Data.zip: library(haven) pew_fn <- file.path( path.expand( "~" ) , "Pew Research Center Global Attitudes Spring 2022 Dataset.sav" ) pew_tbl <- read_sav( pew_fn ) pew_label <- lapply( pew_tbl , function( w ) attributes( w )[['label']] ) pew_labels <- lapply( pew_tbl , function( w ) attributes( w )[['labels']] ) pew_tbl <- zap_labels( pew_tbl ) pew_df <- data.frame( pew_tbl ) names( pew_df ) <- tolower( names( pew_df ) ) Collapse country-specific cluster and strata variables into two all-country cluster and strata variables: # create the constructed psu and strata variables from among the # non-missing country-specific columns pew_df[ , 'psu_constructed' ] <- apply( pew_df[ , grep( "^psu_" , names( pew_df ) ) ] , 1 , function( w ) w[ which.min( is.na( w ) ) ] ) pew_df[ , 'stratum_constructed' ] <- apply( pew_df[ , grep( "^stratum_" , names( pew_df ) ) ] , 1 , function( w ) w[ which.min( is.na( w ) ) ] ) # for countries without clustering variables, give every record a unique identifier for the psu.. pew_df[ is.na( pew_df[ , 'psu_constructed' ] ) , 'psu_constructed' ] <- rownames( pew_df[ is.na( pew_df[ , 'psu_constructed' ] ) , ] ) # ..and zeroes for the stratum pew_df[ is.na( pew_df[ , 'stratum_constructed' ] ) , 'stratum_constructed' ] <- 0 Save Locally   Save the object at any point: # pew_fn <- file.path( path.expand( "~" ) , "PEW" , "this_file.rds" ) # saveRDS( pew_df , file = pew_fn , compress = FALSE ) Load the same object: # pew_df <- readRDS( pew_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) pew_design <- svydesign( id = ~ psu_constructed , strata = ~ interaction( country , stratum_constructed ) , data = pew_df , weights = ~ weight , nest = TRUE ) Variable Recoding Add new columns to the data set: pew_design <- update( pew_design , one = 1 , topcoded_respondent_age = ifelse( age >= 99 , NA , ifelse( age >= 97 , 97 , age ) ) , human_rights_priority_with_china = ifelse( china_humanrights_priority > 2 , NA , as.numeric( china_humanrights_priority == 1 ) ) , favorable_unfavorable_one_to_four_us = ifelse( fav_us > 4 , NA , fav_us ) , favorable_unfavorable_one_to_four_un = ifelse( fav_un > 4 , NA , fav_un ) , country_name = factor( country , levels = as.integer( pew_labels[[ 'country' ]] ) , labels = names( pew_labels[['country']] ) ) , econ_sit = factor( econ_sit , levels = 1:4 , labels = c( 'Very good' , 'Somewhat good' , 'Somewhat bad' , 'Very bad' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pew_design , "sampling" ) != 0 ) svyby( ~ one , ~ country_name , pew_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pew_design ) svyby( ~ one , ~ country_name , pew_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE ) svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ econ_sit , pew_design , na.rm = TRUE ) svyby( ~ econ_sit , ~ country_name , pew_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ topcoded_respondent_age , 
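# na.rm = TRUE drops the ages recoded to missing by the topcode above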
pew_design , na.rm = TRUE ) svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ econ_sit , pew_design , na.rm = TRUE ) svyby( ~ econ_sit , ~ country_name , pew_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ topcoded_respondent_age , pew_design , 0.5 , na.rm = TRUE ) svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE , na.rm.all = TRUE ) Estimate a ratio: svyratio( numerator = ~ favorable_unfavorable_one_to_four_un , denominator = ~ favorable_unfavorable_one_to_four_us , pew_design , na.rm = TRUE ) Subsetting Restrict the survey design to respondents in South Korea: sub_pew_design <- subset( pew_design , country_name == 'South Korea' ) Calculate the mean (average) of this subset: svymean( ~ topcoded_respondent_age , sub_pew_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pew_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ topcoded_respondent_age , pew_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ human_rights_priority_with_china , pew_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( topcoded_respondent_age ~ human_rights_priority_with_china , pew_design ) Perform a chi-squared test of association for survey data: svychisq( ~ human_rights_priority_with_china + econ_sit , pew_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( topcoded_respondent_age ~ human_rights_priority_with_china + econ_sit , pew_design ) summary( glm_result ) Replication Example This matches statistics and standard errors from How to analyze Pew Research Center survey data in R: DOWNLOAD THIS DATASET at https://www.pewresearch.org/politics/dataset/april-2017-political-survey/. 
Download the SPSS dataset Apr17-public-4.3-update.zip dated 12/29/2017: political_survey_2017_fn <- file.path( path.expand( "~" ) , "Apr17 public.sav" ) political_survey_2017_tbl <- read_sav( political_survey_2017_fn ) political_survey_2017_df <- data.frame( political_survey_2017_tbl ) names( political_survey_2017_df ) <- tolower( names( political_survey_2017_df ) ) Construct a complex sample survey design: political_survey_2017_design <- svydesign( ~ 0 , data = political_survey_2017_df , weights = ~ weight ) Add new columns to the data set: political_survey_2017_design <- update( political_survey_2017_design , q1 = factor( q1 , levels = c( 1 , 2 , 9 ) , labels = c( 'Approve' , 'Disapprove' , 'DK/RF' ) ) ) Reproduce statistics and standard errors shown under Estimating frequencies with survey weights: result <- svymean( ~ q1 , political_survey_2017_design , na.rm = TRUE ) stopifnot( round( coef( result ) , 4 ) == c( 0.3940 , 0.5424 , 0.0636 ) ) stopifnot( round( SE( result ) , 4 ) == c( 0.0144 , 0.0147 , 0.0078 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for PEW users, this code replicates previously-presented examples: library(srvyr) pew_srvyr_design <- as_survey( pew_design ) Calculate the mean (average) of a linear variable, overall and by groups: pew_srvyr_design %>% summarize( mean = survey_mean( topcoded_respondent_age , na.rm = TRUE ) ) pew_srvyr_design %>% group_by( country_name ) %>% summarize( mean = survey_mean( topcoded_respondent_age , na.rm = TRUE ) ) "],["programme-for-the-international-assessment-of-adult-competencies-piaac.html", "Programme for the International Assessment of Adult Competencies (PIAAC) Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Programme for the International Assessment of Adult Competencies (PIAAC) A cross-national study designed to understand the skills of workers in advanced-nation labor markets. One row per sampled adult. A multiply-imputed, complex sample survey designed to generalize to the population aged 16 to 65 across thirty three OECD nations. No expected release timeline. Administered by the Organisation for Economic Co-operation and Development. 
Please skim before you begin: Technical Report of the Survey of Adult Skills Wikipedia Entry A haiku regarding this microdata: # what color collar # workforce poets, potters, or # pythagoreans Download, Import, Preparation library(haven) library(httr) tf <- tempfile() this_url <- "https://webfs.oecd.org/piaac/puf-data/SAS/SAS7BDAT/prgusap1_2012.sas7bdat" GET( this_url , write_disk( tf ) , progress() ) piaac_tbl <- read_sas( tf ) piaac_df <- data.frame( piaac_tbl ) names( piaac_df ) <- tolower( names( piaac_df ) ) Save Locally   Save the object at any point: # piaac_fn <- file.path( path.expand( "~" ) , "PIAAC" , "this_file.rds" ) # saveRDS( piaac_df , file = piaac_fn , compress = FALSE ) Load the same object: # piaac_df <- readRDS( piaac_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: library(survey) library(mitools) pvals <- c( "pvlit" , "pvnum" , "pvpsl" ) pvars <- outer( pvals , 1:10 , paste0 ) non.pvals <- names(piaac_df)[ !( names(piaac_df) %in% pvars ) ] for(k in 1:10){ piaac_imp <- piaac_df[ , c( non.pvals , paste0( pvals , k ) ) ] for( j in pvals ){ piaac_imp[ , j ] <- piaac_imp[ , paste0( j , k ) ] piaac_imp[ , paste0( j , k ) ] <- NULL } if( k == 1 ){ piaac_mi <- list( piaac_imp ) } else { piaac_mi <- c( piaac_mi , list( piaac_imp ) ) } } jk.method <- unique( piaac_df[ , 'vemethod' ] ) stopifnot(length(jk.method) == 1) stopifnot(jk.method %in% c("JK1", "JK2")) if (jk.method == "JK2") jk.method <- "JKn" piaac_design <- svrepdesign( weights = ~spfwt0 , repweights = "spfwt[1-9]" , rscales = rep( 1 , 80 ) , scale = ifelse( jk.method == "JKn" , 1 , 79/80 ) , type = jk.method , data = imputationList( piaac_mi ) , mse = TRUE ) Variable Recoding Add new columns to the data set: piaac_design <- update( piaac_design , one = 1 , sex = factor( gender_r , labels = c( "male" , "female" ) ) , age_categories = factor( ageg10lfs , levels = 1:5 , labels = c( "24 or less" , "25-34" , "35-44" , "45-54" , "55 plus" ) ) , working_at_paid_job_last_week = as.numeric( c_q01a == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( piaac_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( piaac_design , svyby( ~ one , ~ age_categories , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: MIcombine( with( piaac_design , svytotal( ~ one ) ) ) MIcombine( with( piaac_design , svyby( ~ one , ~ age_categories , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svymean , na.rm = TRUE ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( piaac_design , svymean( ~ sex ) ) ) MIcombine( with( piaac_design , svyby( ~ sex , ~ age_categories , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( piaac_design , svytotal( ~ pvnum , na.rm = TRUE ) ) ) MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svytotal , na.rm = TRUE ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( piaac_design , svytotal( ~ sex ) ) ) MIcombine( with( piaac_design , svyby( ~ sex , ~ age_categories , svytotal ) ) ) Calculate the median (50th percentile) of a linear 
variable, overall and by groups: MIcombine( with( piaac_design , svyquantile( ~ pvnum , 0.5 , se = TRUE , na.rm = TRUE ) ) ) MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svyquantile , 0.5 , se = TRUE , ci = TRUE , na.rm = TRUE ) ) ) Estimate a ratio: MIcombine( with( piaac_design , svyratio( numerator = ~ pvnum , denominator = ~ pvlit , na.rm = TRUE ) ) ) Subsetting Restrict the survey design to self-reported fair or poor health: sub_piaac_design <- subset( piaac_design , i_q08 %in% 4:5 ) Calculate the mean (average) of this subset: MIcombine( with( sub_piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svymean , na.rm = TRUE ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( piaac_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: MIcombine( with( piaac_design , svyvar( ~ pvnum , na.rm = TRUE ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ working_at_paid_job_last_week , piaac_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( pvnum ~ working_at_paid_job_last_week , piaac_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ working_at_paid_job_last_week + sex , piaac_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( piaac_design , svyglm( pvnum ~ working_at_paid_job_last_week + sex ) ) ) summary( glm_result ) Replication Example This example matches the statistics and standard errors from OECD’s Technical Report Table 18.9: usa_pvlit <- MIcombine( with( piaac_design , svymean( ~ pvlit , na.rm = TRUE ) ) ) usa_pvnum <- MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) usa_pvpsl <- MIcombine( with( piaac_design , svymean( ~ pvpsl , na.rm = TRUE ) ) ) stopifnot( round( coef( usa_pvlit ) ) == 270 ) stopifnot( round( SE( usa_pvlit ) , 1 ) == 1.0 ) stopifnot( round( coef( usa_pvnum ) ) == 253 ) stopifnot( round( SE( usa_pvnum ) , 1 ) == 1.2 ) stopifnot( round( coef( usa_pvpsl ) ) == 277 ) stopifnot( round( SE( usa_pvpsl ) , 1 ) == 1.1 ) "],["progress-in-international-reading-literacy-study-pirls.html", "Progress in International Reading Literacy Study (PIRLS) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Progress in International Reading Literacy Study (PIRLS) A comparative study of student achievement in reading and literacy across more than 50 nations. 
Grade-specific tables with one record per school, student, teacher, plus files containing student achievement, home background, student-teacher linkage, and within-country scoring reliability. A complex survey generalizing to fourth-grade populations of participating countries. Released quinquennially since 2001. Funded by the International Association for the Evaluation of Educational Achievement, run at BC. Please skim before you begin: PIRLS 2021 User Guide for the International Database Methods and Procedures: PIRLS 2021 Technical Report A haiku regarding this microdata: # lascaux canary # glyph jump reveal caged bard notes # cryogenesis Function Definitions This survey uses a multiply-imputed variance estimation technique described in Methods Chapter 13. Most users do not need to study this function carefully. Define a function specific to only this dataset: pirls_MIcombine <- function (results, variances, call = sys.call(), df.complete = Inf, ...) { m <- length(results) oldcall <- attr(results, "call") if (missing(variances)) { variances <- suppressWarnings(lapply(results, vcov)) results <- lapply(results, coef) } vbar <- variances[[1]] cbar <- results[[1]] for (i in 2:m) { cbar <- cbar + results[[i]] vbar <- vbar + variances[[i]] } cbar <- cbar/m vbar <- vbar/m # MODIFICATION # evar <- var(do.call("rbind", results)) evar <- sum( ( unlist( results ) - cbar )^2 / 4 ) r <- (1 + 1/m) * evar/vbar df <- (m - 1) * (1 + 1/r)^2 if (is.matrix(df)) df <- diag(df) if (is.finite(df.complete)) { dfobs <- ((df.complete + 1)/(df.complete + 3)) * df.complete * vbar/(vbar + evar) if (is.matrix(dfobs)) dfobs <- diag(dfobs) df <- 1/(1/dfobs + 1/df) } if (is.matrix(r)) r <- diag(r) rval <- list(coefficients = cbar, variance = vbar + evar * (m + 1)/m, call = c(oldcall, call), nimp = m, df = df, missinfo = (r + 2/(df + 3))/(r + 1)) class(rval) <- "MIresult" rval } Download, Import, Preparation Download and unzip the 2021 fourth grade international database: library(httr) tf <- tempfile() this_url <- "https://pirls2021.org/data/downloads/P21_Data_R.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import and stack each of the student context data files for Abu Dhabi through Bulgaria: library(haven) # limit unzipped files to those starting with `asg` followed by three letters followed by `r5` asg_fns <- unzipped_files[ grepl( '^asg[a-z][a-z][a-z]r5' , basename( unzipped_files ) , ignore.case = TRUE ) ] # further limit asg files to the first ten countries countries_thru_bulgaria <- c("aad", "adu", "alb", "are", "aus", "aut", "aze", "bfl", "bfr", "bgr") fns_thru_bulgaria <- paste0( paste0( '^asg' , countries_thru_bulgaria , 'r5' ) , collapse = "|" ) asg_aad_bgr_fns <- asg_fns[ grepl( fns_thru_bulgaria , basename( asg_fns ) , ignore.case = TRUE ) ] pirls_df <- NULL for( rdata_fn in asg_aad_bgr_fns ){ this_tbl_name <- load( rdata_fn ) this_tbl <- get( this_tbl_name ) ; rm( this_tbl_name ) this_tbl <- zap_labels( this_tbl ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) pirls_df <- rbind( pirls_df , this_df ) } # order the data.frame by unique student id pirls_df <- pirls_df[ with( pirls_df , order( idcntry , idstud ) ) , ] Save Locally   Save the object at any point: # pirls_fn <- file.path( path.expand( "~" ) , "PIRLS" , "this_file.rds" ) # saveRDS( pirls_df , file = pirls_fn , compress = FALSE ) Load the same object: # pirls_df <- readRDS( pirls_fn ) Survey Design Definition Construct a multiply-imputed, complex sample 
survey design: From among possibly plausible values, determine all columns that are multiply-imputed plausible values: # identify all columns ending with `01` thru `05` ppv <- grep( "(.*)0[1-5]$" , names( pirls_df ) , value = TRUE ) # remove those ending digits ppv_prefix <- gsub( "0[1-5]$" , "" , ppv ) # identify each of the possibilities with exactly five matches (five implicates) pv <- names( table( ppv_prefix )[ table( ppv_prefix ) == 5 ] ) # identify each of the `01` thru `05` plausible value columns pv_columns <- grep( paste0( "^" , pv , "0[1-5]$" , collapse = "|" ) , names( pirls_df ) , value = TRUE ) Extract those multiply-imputed columns into a separate data.frame, then remove them from the source: pv_wide_df <- pirls_df[ c( 'idcntry' , 'idstud' , pv_columns ) ] pirls_df[ pv_columns ] <- NULL Reshape these columns from one record per student to one record per student per implicate: pv_long_df <- reshape( pv_wide_df , varying = lapply( paste0( pv , '0' ) , paste0 , 1:5 ) , direction = 'long' , timevar = 'implicate' , idvar = c( 'idcntry' , 'idstud' ) ) names( pv_long_df ) <- gsub( "01$" , "" , names( pv_long_df ) ) Merge the columns from the source data.frame onto the one record per student per implicate data.frame: pirls_long_df <- merge( pirls_df , pv_long_df ) pirls_long_df <- pirls_long_df[ with( pirls_long_df , order( idcntry , idstud ) ) , ] stopifnot( nrow( pirls_long_df ) == nrow( pv_long_df ) ) stopifnot( nrow( pirls_long_df ) / 5 == nrow( pirls_df ) ) Divide the five plausible value implicates into a list with five data.frames based on the implicate number: pirls_list <- split( pirls_long_df , pirls_long_df[ , 'implicate' ] ) Construct a replicate weights table following the estimation technique described in Methods Chapter 13: weights_df <- pirls_df[ c( 'jkrep' , 'jkzone' ) ] for( j in 1:75 ){ for( i in 0:1 ){ weights_df[ weights_df[ , 'jkzone' ] != j , paste0( 'rw' , i , j ) ] <- 1 weights_df[ weights_df[ , 'jkzone' ] == j , paste0( 'rw' , i , j ) ] <- 2 * ( weights_df[ weights_df[ , 'jkzone' ] == j , 'jkrep' ] == i ) } } weights_df[ c( 'jkrep' , 'jkzone' ) ] <- NULL Define the design: library(survey) library(mitools) pirls_design <- svrepdesign( weights = ~totwgt , repweights = weights_df , data = imputationList( pirls_list ) , type = "other" , scale = 0.5 , rscales = rep( 1 , 150 ) , combined.weights = FALSE , mse = TRUE ) Variable Recoding Add new columns to the data set: pirls_design <- update( pirls_design , one = 1 , countries_thru_bulgaria = factor( as.numeric( idcntry ) , levels = c(7842L, 7841L, 8L, 784L, 36L, 40L, 31L, 956L, 957L, 100L) , labels = c("Abu Dhabi, UAE", "Dubai, UAE", "Albania", "UAE", "Australia", "Austria", "Azerbaijan", "Belgium (Flemish)", "Belgium (French)","Bulgaria") ) , sex = factor( itsex , levels = 1:2 , labels = c( "female" , "male" ) ) , always_speak_language_of_test_at_home = ifelse( asbg03 %in% 1:4 , as.numeric( asbg03 == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ one , unwtd.count ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ sex , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: pirls_MIcombine( with( pirls_design , svytotal( ~ one ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ sex , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, 
overall and by groups: pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svymean , na.rm = TRUE ) ) ) Calculate the distribution of a categorical variable, overall and by groups: pirls_MIcombine( with( pirls_design , svymean( ~ countries_thru_bulgaria ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ countries_thru_bulgaria , ~ sex , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: pirls_MIcombine( with( pirls_design , svytotal( ~ asrrea , na.rm = TRUE ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svytotal , na.rm = TRUE ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: pirls_MIcombine( with( pirls_design , svytotal( ~ countries_thru_bulgaria ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ countries_thru_bulgaria , ~ sex , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: pirls_MIcombine( with( pirls_design , svyquantile( ~ asrrea , 0.5 , se = TRUE , na.rm = TRUE ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svyquantile , 0.5 , se = TRUE , ci = TRUE , na.rm = TRUE ) ) ) Estimate a ratio: pirls_MIcombine( with( pirls_design , svyratio( numerator = ~ asrlit , denominator = ~ asrrea ) ) ) Subsetting Restrict the survey design to Australia, Austria, Azerbaijan, Belgium (French): sub_pirls_design <- subset( pirls_design , idcntry %in% c( 36 , 40 , 31 , 956 ) ) Calculate the mean (average) of this subset: pirls_MIcombine( with( sub_pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svymean , na.rm = TRUE ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pirls_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: pirls_MIcombine( with( pirls_design , svyvar( ~ asrrea , na.rm = TRUE ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE , deff = TRUE ) ) ) # SRS with replacement pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: # MIsvyciprop( ~ always_speak_language_of_test_at_home , pirls_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( asrrea ~ always_speak_language_of_test_at_home , pirls_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ always_speak_language_of_test_at_home + countries_thru_bulgaria , pirls_design ) Perform a survey-weighted generalized linear model: glm_result <- pirls_MIcombine( with( pirls_design , svyglm( asrrea ~ always_speak_language_of_test_at_home + countries_thru_bulgaria ) ) ) summary( glm_result ) Replication Example This example matches the mean proficiency and standard error of the Australia row of the Summary Statistics and Standard Errors for Proficiency in Overall Reading table from the Appendix 13A: Summary Statistics and Standard Errors for Proficiency in Reading: australia_design <- subset( pirls_design , countries_thru_bulgaria %in% "Australia" ) stopifnot( nrow( australia_design ) == 5487 ) result <- pirls_MIcombine( with( australia_design , svymean( ~ asrrea ) ) ) stopifnot( round( coef( result ) , 3 ) == 540.134 ) stopifnot( round( SE( result ) , 3 ) == 1.728 ) This example matches the jackknife sampling, imputation, and total variances of the same row: australia_fn <- unzipped_files[ grepl( 'ASGAUS' , basename( unzipped_files ) ) ] australia_tbl_name <- load( australia_fn ) australia_tbl <- get( australia_tbl_name ) ; rm( australia_tbl_name ) australia_tbl <- zap_labels( australia_tbl ) australia_df <- data.frame( australia_tbl ) names( australia_df ) <- tolower( names( australia_df ) ) estimate <- mean( c( with( australia_df , weighted.mean( asrrea01 , totwgt ) ) , with( australia_df , weighted.mean( asrrea02 , totwgt ) ) , with( australia_df , weighted.mean( asrrea03 , totwgt ) ) , with( australia_df , weighted.mean( asrrea04 , totwgt ) ) , with( australia_df , weighted.mean( asrrea05 , totwgt ) ) ) ) stopifnot( round( estimate , 3 ) == 540.134 ) for( k in 1:5 ){ this_variance <- 0 for( j in 1:75 ){ for( i in 0:1 ){ this_variance <- this_variance + ( weighted.mean( australia_df[ , paste0( 'asrrea0' , k ) ] , ifelse( j == australia_df[ , 'jkzone' ] , australia_df[ , 'totwgt' ] * 2 * ( australia_df[ , 'jkrep' ] == i ) , australia_df[ , 'totwgt' ] ) ) - weighted.mean( australia_df[ , paste0( 'asrrea0' , k ) ] , australia_df[ , 'totwgt' ] ) )^2 } } assign( paste0( 'v' , k ) , this_variance * 0.5 ) } sampling_variance <- mean( c( v1 , v2 , v3 , v4 , v5 ) ) stopifnot( round( sampling_variance , 3 ) == 2.653 ) imputation_variance <- ( 6 / 5 ) * ( ( ( with( australia_df , weighted.mean( asrrea01 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea02 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea03 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea04 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea05 , totwgt ) ) - estimate )^2 / 4 ) ) stopifnot( round( imputation_variance , 3 ) == 0.333 ) stopifnot( round( sampling_variance + imputation_variance , 3 ) == 2.987 ) "],["public-libraries-survey-pls.html", "Public Libraries Survey (PLS) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Public Libraries Survey (PLS) A comprehensive compilation of 
administrative information on all public libraries in the United States. Two tables, with one record per library system and one record per library building or bookmobile. Released annually since 1992. Conducted by the Institute of Museum and Library Services (IMLS), collected by the Census Bureau. Recommended Reading Two Methodology Documents: Data File Documentation and User’s Guide README FY #### PLS PUD.txt included in each zipped file One Haiku: # census, not survey. # dewey decimal index # finger to lips shush Download, Import, Preparation Download and import the most recent administrative entity csv file: this_tf <- tempfile() csv_url <- "https://www.imls.gov/sites/default/files/2023-06/pls_fy2021_csv.zip" download.file( csv_url , this_tf, mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) administrative_entity_csv_fn <- unzipped_files[ grepl( 'AE(.*)csv$' , basename( unzipped_files ) ) ] pls_df <- read.csv( administrative_entity_csv_fn ) names( pls_df ) <- tolower( names( pls_df ) ) pls_df[ , 'one' ] <- 1 Recode missing values as described in the readme included with each zipped file: for( this_col in names( pls_df ) ){ if( class( pls_df[ , this_col ] ) == 'character' ){ pls_df[ pls_df[ , this_col ] %in% 'M' , this_col ] <- NA } if( ( class( pls_df[ , this_col ] ) == 'numeric' ) | ( this_col %in% c( 'phone' , 'startdat' , 'enddate' ) ) ){ pls_df[ pls_df[ , this_col ] %in% c( -1 , -3 , -4 , -9 ) , this_col ] <- NA } } Save Locally   Save the object at any point: # pls_fn <- file.path( path.expand( "~" ) , "PLS" , "this_file.rds" ) # saveRDS( pls_df , file = pls_fn , compress = FALSE ) Load the same object: # pls_df <- readRDS( pls_fn ) Variable Recoding Add new columns to the data set: pls_df <- transform( pls_df , c_relatn = factor( c_relatn , levels = c( "HQ" , "ME" , "NO" ) , c( "HQ-Headquarters of a federation or cooperative" , "ME-Member of a federation or cooperative" , "NO-Not a member of a federation or cooperative" ) ) , more_than_one_librarian = as.numeric( libraria > 1 ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( pls_df ) table( pls_df[ , "stabr" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( pls_df[ , "popu_lsa" ] , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( pls_df[ , "c_relatn" ] ) ) prop.table( table( pls_df[ , c( "c_relatn" , "stabr" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( pls_df[ , "popu_lsa" ] , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( pls_df[ , "popu_lsa" ] , 0.5 , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to more than one million annual visits: sub_pls_df <- subset( pls_df , visits > 1000000 ) Calculate the mean (average) of this subset: mean( sub_pls_df[ , "popu_lsa" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( pls_df[ , "popu_lsa" ] , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( popu_lsa ~ 
more_than_one_librarian , pls_df ) Perform a chi-squared test of association: this_table <- table( pls_df[ , c( "more_than_one_librarian" , "c_relatn" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( popu_lsa ~ more_than_one_librarian + c_relatn , data = pls_df ) summary( glm_result ) Replication Example This example matches Interlibrary Relationship Frequencies on PDF page 169 of the User’s Guide: # remove closed and temporarily closed libraries results <- table( pls_df[ !( pls_df[ , 'statstru' ] %in% c( 3 , 23 ) ) , 'c_relatn' ] ) stopifnot( results[ "HQ-Headquarters of a federation or cooperative" ] == 112 ) stopifnot( results[ "ME-Member of a federation or cooperative" ] == 6859 ) stopifnot( results[ "NO-Not a member of a federation or cooperative" ] == 2236 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for PLS users, this code replicates previously-presented examples: library(dplyr) pls_tbl <- as_tibble( pls_df ) Calculate the mean (average) of a linear variable, overall and by groups: pls_tbl %>% summarize( mean = mean( popu_lsa , na.rm = TRUE ) ) pls_tbl %>% group_by( stabr ) %>% summarize( mean = mean( popu_lsa , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience, and programming speed. data.table offers concise syntax: fast to type, fast to read, fast execution, memory efficiency, careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for PLS users, this code replicates previously-presented examples: library(data.table) pls_dt <- data.table( pls_df ) Calculate the mean (average) of a linear variable, overall and by groups: pls_dt[ , mean( popu_lsa , na.rm = TRUE ) ] pls_dt[ , mean( popu_lsa , na.rm = TRUE ) , by = stabr ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for PLS users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'pls' , pls_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( popu_lsa ) FROM pls' ) dbGetQuery( con , 'SELECT stabr , AVG( popu_lsa ) FROM pls GROUP BY stabr' ) "],["pesquisa-nacional-por-amostra-de-domicilios-pnad.html", "Pesquisa Nacional por Amostra de Domicilios (PNAD) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Pesquisa Nacional por Amostra de Domicilios (PNAD) Brazil’s principal labor force survey, measuring education, employment, income, and housing characteristics. One consolidated table with one row per individual within each sampled household.
A complex sample survey designed to generalize to the civilian non-institutional population of Brazil. Released quarterly since 2012, with microdata available both quarterly and annually. Administered by the Instituto Brasileiro de Geografia e Estatistica. Please skim before you begin: Conceitos e métodos Wikipedia Entry A haiku regarding this microdata: # mineiro data # love verdade gave to me # twelve karaoke.. Download, Import, Preparation Download and import the dictionary file: dictionary_tf <- tempfile() dictionary_url <- paste0( "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/" , "Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/" , "Trimestral/Microdados/Documentacao/Dicionario_e_input_20221031.zip" ) download.file( dictionary_url , dictionary_tf , mode = 'wb' ) dictionary_files <- unzip( dictionary_tf , exdir = tempdir() ) sas_fn <- grep( '\\\\.sas$' , dictionary_files , value = TRUE ) sas_lines <- readLines( sas_fn , encoding = 'latin1' ) Determine fixed-width file positions from the SAS import script: sas_start <- grep( '@0001' , sas_lines ) sas_end <- grep( ';' , sas_lines ) sas_end <- sas_end[ sas_end > sas_start ][ 1 ] sas_lines <- sas_lines[ seq( sas_start , sas_end - 1 ) ] # remove SAS comments sas_lines <- gsub( "\\\\/\\\\*(.*)" , "" , sas_lines ) # remove multiple spaces and spaces at the end of each string sas_lines <- gsub( "( +)" , " " , sas_lines ) sas_lines <- gsub( " $" , "" , sas_lines ) sas_df <- read.table( textConnection( sas_lines ) , sep = ' ' , col.names = c( 'position' , 'column_name' , 'length' ) , header = FALSE ) sas_df[ , 'character' ] <- grepl( '\\\\$' , sas_df[ , 'length' ] ) sas_df[ , 'position' ] <- as.integer( gsub( "\\\\@" , "" , sas_df[ , 'position' ] ) ) sas_df[ , 'length' ] <- as.integer( gsub( "\\\\$" , "" , sas_df[ , 'length' ] ) ) stopifnot( sum( sas_df[ , 'length' ] ) == ( sas_df[ nrow( sas_df ) , 'position' ] + sas_df[ nrow( sas_df ) , 'length' ] - 1 ) ) Download the latest quarterly file: this_tf <- tempfile() this_url <- paste0( "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/" , "Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/" , "Trimestral/Microdados/2023/PNADC_012023.zip" ) download.file( this_url , this_tf , mode = 'wb' ) Import the latest quarterly file: library(readr) pnad_tbl <- read_fwf( this_tf , fwf_widths( widths = sas_df[ , 'length' ] , col_names = sas_df[ , 'column_name' ] ) , col_types = paste0( ifelse( sas_df[ , 'character' ] , "c" , "d" ) , collapse = '' ) ) pnad_df <- data.frame( pnad_tbl ) names( pnad_df ) <- tolower( names( pnad_df ) ) pnad_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # pnad_fn <- file.path( path.expand( "~" ) , "PNAD" , "this_file.rds" ) # saveRDS( pnad_df , file = pnad_fn , compress = FALSE ) Load the same object: # pnad_df <- readRDS( pnad_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) pnad_design <- svrepdesign( data = pnad_df , weight = ~ v1028 , type = 'bootstrap' , repweights = 'v1028[0-9]+' , mse = TRUE , ) Variable Recoding Add new columns to the data set: pnad_design <- update( pnad_design , pia = as.numeric( v2009 >= 14 ) ) pnad_design <- update( pnad_design , ocup_c = ifelse( pia == 1 , as.numeric( vd4002 %in% 1 ) , NA ) , desocup30 = ifelse( pia == 1 , as.numeric( vd4002 %in% 2 ) , NA ) ) pnad_design <- update( pnad_design , uf_name = factor( as.numeric( uf ) , levels = c(11L, 12L, 13L, 14L, 15L, 16L, 17L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 35L, 41L, 42L, 43L, 50L, 51L, 52L, 53L) , labels = 
c("Rondonia", "Acre", "Amazonas", "Roraima", "Para", "Amapa", "Tocantins", "Maranhao", "Piaui", "Ceara", "Rio Grande do Norte", "Paraiba", "Pernambuco", "Alagoas", "Sergipe", "Bahia", "Minas Gerais", "Espirito Santo", "Rio de Janeiro", "Sao Paulo", "Parana", "Santa Catarina", "Rio Grande do Sul", "Mato Grosso do Sul", "Mato Grosso", "Goias", "Distrito Federal") ) , age_categories = factor( 1 + findInterval( v2009 , seq( 5 , 60 , 5 ) ) ) , male = as.numeric( v2007 == 1 ) , region = substr( uf , 1 , 1 ) , # calculate usual income from main job # (rendimento habitual do trabalho principal) vd4016n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4016 , NA ) , # calculate effective income from main job # (rendimento efetivo do trabalho principal) vd4017n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4017 , NA ) , # calculate usual income from all jobs # (variavel rendimento habitual de todos os trabalhos) vd4019n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4019 , NA ) , # calculate effective income from all jobs # (rendimento efetivo do todos os trabalhos) vd4020n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4020 , NA ) , # determine the potential labor force pea_c = as.numeric( ocup_c == 1 | desocup30 == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pnad_design , "sampling" ) != 0 ) svyby( ~ one , ~ uf_name , pnad_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pnad_design ) svyby( ~ one , ~ uf_name , pnad_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ vd4020n , pnad_design , na.rm = TRUE ) svyby( ~ vd4020n , ~ uf_name , pnad_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ age_categories , pnad_design ) svyby( ~ age_categories , ~ uf_name , pnad_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ vd4020n , pnad_design , na.rm = TRUE ) svyby( ~ vd4020n , ~ uf_name , pnad_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ age_categories , pnad_design ) svyby( ~ age_categories , ~ uf_name , pnad_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ vd4020n , pnad_design , 0.5 , na.rm = TRUE ) svyby( ~ vd4020n , ~ uf_name , pnad_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ ocup_c , denominator = ~ pea_c , pnad_design , na.rm = TRUE ) Subsetting Restrict the survey design to employed persons: sub_pnad_design <- subset( pnad_design , ocup_c == 1 ) Calculate the mean (average) of this subset: svymean( ~ vd4020n , sub_pnad_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ vd4020n , pnad_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ vd4020n , ~ uf_name , pnad_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pnad_design ) 
Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ vd4020n , pnad_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ vd4020n , pnad_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ vd4020n , pnad_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ male , pnad_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( vd4020n ~ male , pnad_design ) Perform a chi-squared test of association for survey data: svychisq( ~ male + age_categories , pnad_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( vd4020n ~ male + age_categories , pnad_design ) summary( glm_result ) Replication Example This example matches statistics and coefficients of variation from Tabela 4092 - Pessoas de 14 anos ou mais de idade, por condição em relação à força de trabalho e condição de ocupação: nationwide_adult_population <- svytotal( ~ pia , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_adult_population ) / 1000000 , 3 ) == 174.228 ) stopifnot( round( cv( nationwide_adult_population ) * 100 , 1 ) == 0 ) nationwide_labor_force <- svytotal( ~ pea_c , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_labor_force ) / 1000000 , 3 ) == 107.257 ) stopifnot( round( cv( nationwide_labor_force ) * 100 , 1 ) == 0.2 ) nationwide_employed <- svytotal( ~ ocup_c , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_employed ) / 1000000 , 3 ) == 97.825 ) stopifnot( round( cv( nationwide_employed ) * 100 , 1 ) == 0.2 ) nationwide_unemployed <- svytotal( ~ desocup30 , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_unemployed ) / 1000000 , 3 ) == 9.432 ) stopifnot( round( cv( nationwide_unemployed ) * 100 , 1 ) == 1.2 ) nationwide_not_in_labor_force <- svytotal( ~ as.numeric( pia & !pea_c ) , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_not_in_labor_force ) / 1000000 , 3 ) == 66.972 ) stopifnot( round( cv( nationwide_not_in_labor_force ) * 100 , 1 ) == 0.3 ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for PNAD users, this code calculates the gini coefficient on complex sample survey data: library(convey) pnad_design <- convey_prep( pnad_design ) svygini( ~ vd4020n , pnad_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features.
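One srvyr convenience deserves a hedged aside before the worked examples below: the vartype argument of survey_mean can return the standard error, confidence interval, and coefficient of variation from a single call. A small sketch, assuming the pnad_design object defined above:
library(srvyr)
# request several uncertainty measures at once alongside the estimate
as_survey( pnad_design ) %>%
    summarize( mean = survey_mean( vd4020n , na.rm = TRUE , vartype = c( "se" , "ci" , "cv" ) ) )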
As a starting point for PNAD users, this code replicates previously-presented examples: library(srvyr) pnad_srvyr_design <- as_survey( pnad_design ) Calculate the mean (average) of a linear variable, overall and by groups: pnad_srvyr_design %>% summarize( mean = survey_mean( vd4020n , na.rm = TRUE ) ) pnad_srvyr_design %>% group_by( uf_name ) %>% summarize( mean = survey_mean( vd4020n , na.rm = TRUE ) ) "],["pesquisa-nacional-de-saude-pns.html", "Pesquisa Nacional de Saude (PNS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Pesquisa Nacional de Saude (PNS) Brazil’s health survey, measuring medical conditions, risk behaviors, access to and use of care. One consolidated table with one row per individual within each sampled household. A complex sample survey designed to generalize to Brazil’s civilian population. Released at approximately five-year intervals starting in 2013. Administered by the Instituto Brasileiro de Geografia e Estatistica in partnership with the Ministério da Saúde. Please skim before you begin: Conceitos e métodos Wikipedia Entry A haiku regarding this microdata: # cheer the ministry! # with each caipirinha, or # fail sex life module Download, Import, Preparation Download and import the dictionary file: dictionary_tf <- tempfile() dictionary_url <- "https://ftp.ibge.gov.br/PNS/2019/Microdados/Documentacao/Dicionario_e_input_20220530.zip" download.file( dictionary_url , dictionary_tf , mode = 'wb' ) dictionary_files <- unzip( dictionary_tf , exdir = tempdir() ) sas_fn <- grep( '\\\\.sas$' , dictionary_files , value = TRUE ) sas_lines <- readLines( sas_fn , encoding = 'latin1' ) Determine fixed-width file positions from the SAS import script: sas_start <- grep( '@00001' , sas_lines ) sas_end <- grep( ';' , sas_lines ) sas_end <- sas_end[ sas_end > sas_start ][ 1 ] sas_lines <- sas_lines[ seq( sas_start , sas_end - 1 ) ] # remove SAS comments sas_lines <- gsub( "\\\\/\\\\*(.*)" , "" , sas_lines ) # remove tabs, multiple spaces and spaces at the end of each string sas_lines <- gsub( "\\t" , " " , sas_lines ) sas_lines <- gsub( "( +)" , " " , sas_lines ) sas_lines <- gsub( " $" , "" , sas_lines ) sas_df <- read.table( textConnection( sas_lines ) , sep = ' ' , col.names = c( 'position' , 'column_name' , 'length' ) , header = FALSE ) sas_df[ , 'character' ] <- grepl( '\\\\$' , sas_df[ , 'length' ] ) sas_df[ , 'position' ] <- as.integer( gsub( "\\\\@" , "" , sas_df[ , 'position' ] ) ) sas_df[ , 'length' ] <- as.integer( gsub( "\\\\$" , "" , sas_df[ , 'length' ] ) ) stopifnot( sum( sas_df[ , 'length' ] ) == ( sas_df[ nrow( sas_df ) , 'position' ] + sas_df[ nrow( sas_df ) , 'length' ] - 1 ) ) Download the latest data file: this_tf <- tempfile() this_url <- "https://ftp.ibge.gov.br/PNS/2019/Microdados/Dados/PNS_2019_20220525.zip" download.file( this_url , this_tf , mode = 'wb' ) Import the latest data file: library(readr) pns_tbl <- read_fwf( this_tf , fwf_widths( widths = sas_df[ , 'length' ] , col_names = sas_df[ , 'column_name' ] ) , col_types = paste0( ifelse( sas_df[ , 'character' ] , "c" , "d" ) , collapse = '' ) ) pns_df <- data.frame( pns_tbl ) names( pns_df ) <- tolower( names( pns_df ) ) pns_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # pns_fn <- file.path( path.expand( "~" ) , "PNS" , "this_file.rds" ) # saveRDS( pns_df , file = pns_fn , compress = FALSE ) Load the same object: # pns_df <- readRDS( pns_fn ) Survey Design Definition Construct a complex sample survey design:
library(survey) options( survey.lonely.psu = "adjust" ) pns_prestratified_design <- svydesign( id = ~ upa_pns , strata = ~v0024 , data = subset( pns_df , !is.na( v0028 ) ) , weights = ~v0028 , nest = TRUE ) popc.types <- data.frame( v00283 = as.character( unique( pns_df[ , 'v00283' ] ) ) , Freq = as.numeric( unique( pns_df[ , 'v00282' ] ) ) ) popc.types <- popc.types[ order( popc.types[ , 'v00283' ] ) , ] pns_design <- postStratify( pns_prestratified_design , strata = ~v00283 , population = popc.types ) Variable Recoding Add new columns to the data set: pns_design <- update( pns_design , medical_insurance = ifelse( i00102 %in% 1:2 , as.numeric( i00102 == 1 ) , NA ) , uf_name = factor( as.numeric( v0001 ) , levels = c(11L, 12L, 13L, 14L, 15L, 16L, 17L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 35L, 41L, 42L, 43L, 50L, 51L, 52L, 53L) , labels = c("Rondonia", "Acre", "Amazonas", "Roraima", "Para", "Amapa", "Tocantins", "Maranhao", "Piaui", "Ceara", "Rio Grande do Norte", "Paraiba", "Pernambuco", "Alagoas", "Sergipe", "Bahia", "Minas Gerais", "Espirito Santo", "Rio de Janeiro", "Sao Paulo", "Parana", "Santa Catarina", "Rio Grande do Sul", "Mato Grosso do Sul", "Mato Grosso", "Goias", "Distrito Federal") ) , age_categories = factor( 1 + findInterval( c008 , seq( 5 , 90 , 5 ) ) ) , male = as.numeric( v006 == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pns_design , "sampling" ) != 0 ) svyby( ~ one , ~ uf_name , pns_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pns_design ) svyby( ~ one , ~ uf_name , pns_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ e01602 , pns_design , na.rm = TRUE ) svyby( ~ e01602 , ~ uf_name , pns_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ c006 , pns_design ) svyby( ~ c006 , ~ uf_name , pns_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ e01602 , pns_design , na.rm = TRUE ) svyby( ~ e01602 , ~ uf_name , pns_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ c006 , pns_design ) svyby( ~ c006 , ~ uf_name , pns_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ e01602 , pns_design , 0.5 , na.rm = TRUE ) svyby( ~ e01602 , ~ uf_name , pns_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ p00104 , denominator = ~ p00404 , pns_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals that exercise three or more days per week: sub_pns_design <- subset( pns_design , p035 %in% 3:7 ) Calculate the mean (average) of this subset: svymean( ~ e01602 , sub_pns_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ e01602 , pns_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ e01602 , ~ uf_name , pns_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result 
) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pns_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ e01602 , pns_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ e01602 , pns_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ e01602 , pns_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ medical_insurance , pns_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( e01602 ~ medical_insurance , pns_design ) Perform a chi-squared test of association for survey data: svychisq( ~ medical_insurance + c006 , pns_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( e01602 ~ medical_insurance + c006 , pns_design ) summary( glm_result ) Replication Example This example matches the Estimando totais (estimating totals) example of gross monthly income from the official PNSIBGE R package: total_renda <- svytotal( ~ e01602 , pns_design , na.rm = TRUE ) stopifnot( round( coef( total_renda ) , 0 ) == 213227874692 ) stopifnot( round( SE( total_renda ) , 0 ) == 3604489769 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for PNS users, this code replicates previously-presented examples: library(srvyr) pns_srvyr_design <- as_survey( pns_design ) Calculate the mean (average) of a linear variable, overall and by groups: pns_srvyr_design %>% summarize( mean = survey_mean( e01602 , na.rm = TRUE ) ) pns_srvyr_design %>% group_by( uf_name ) %>% summarize( mean = survey_mean( e01602 , na.rm = TRUE ) ) "],["pesquisa-de-orcamentos-familiares-pof.html", "Pesquisa de Orcamentos Familiares (POF) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Pesquisa de Orcamentos Familiares (POF) Brazil’s household budget survey designed to guide major economic indicators like the Contas nacionais. Various tables with one record per sampled household, resident, job, expenditure. A complex sample survey designed to generalize to the civilian population of Brazil. Released at irregular intervals, with 2002-2003, 2008-2009, and 2017-2018 microdata available. Administered by the Instituto Brasileiro de Geografia e Estatistica.
Please skim before you begin: Pesquisa de Orçamentos Familiares 2017-2018 Perfil das despesas no Brasil Conceitos e métodos A haiku regarding this microdata: # shopping na praia # roupa, comida, pede # tres havaianas Download, Import, Preparation Download the dictionary files: library(archive) dictionary_tf <- tempfile() dictionary_url <- paste0( "https://ftp.ibge.gov.br/Orcamentos_Familiares/" , "Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/Documentacao_20230713.zip" ) download.file( dictionary_url , dictionary_tf , mode = 'wb' ) dictionary_files <- archive_extract( dictionary_tf , dir = tempdir() ) Import the household variable dictionary: library(readxl) dictionary_fn <- file.path( tempdir() , "Dicionários de váriaveis.xls" ) domicilio_dictionary_tbl <- read_excel( dictionary_fn , sheet = "Domicílio" , skip = 3 ) domicilio_dictionary_df <- data.frame( domicilio_dictionary_tbl ) names( domicilio_dictionary_df ) <- c( 'position' , 'length' , 'decimals' , 'column_name' , 'description' , 'variable_labels' ) domicilio_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] <- sapply( domicilio_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] , as.integer ) domicilio_dictionary_df <- subset( domicilio_dictionary_df , !is.na( position ) ) Import the resident variable dictionary: morador_dictionary_tbl <- read_excel( dictionary_fn , sheet = "Morador" , skip = 3 ) morador_dictionary_df <- data.frame( morador_dictionary_tbl ) names( morador_dictionary_df ) <- c( 'position' , 'length' , 'decimals' , 'column_name' , 'description' , 'variable_labels' ) morador_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] <- sapply( morador_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] , as.integer ) morador_dictionary_df <- subset( morador_dictionary_df , !is.na( position ) ) Import the post-stratification totals: post_stratification_fn <- file.path( tempdir() , "Pos_estratos_totais.xlsx" ) post_stratification_tbl <- read_excel( post_stratification_fn , skip = 5 ) post_stratification_df <- data.frame( post_stratification_tbl ) names( post_stratification_df ) <- c( 'estrato_pof' , 'pos_estrato' , 'total_pessoas' , 'uf' , 'cod_upa' ) Download the full dataset: this_tf <- tempfile() this_url <- paste0( "https://ftp.ibge.gov.br/Orcamentos_Familiares/" , "Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/Dados_20230713.zip" ) download.file( this_url , this_tf , mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) Import the household table: library(readr) domicilio_fn <- grep( 'DOMICILIO\\\\.txt$' , unzipped_files , value = TRUE ) domicilio_tbl <- read_fwf( domicilio_fn , fwf_widths( widths = domicilio_dictionary_df[ , 'length' ] , col_names = domicilio_dictionary_df[ , 'column_name' ] ) ) domicilio_df <- data.frame( domicilio_tbl ) names( domicilio_df ) <- tolower( names( domicilio_df ) ) Import the resident table: morador_fn <- grep( 'MORADOR\\\\.txt$' , unzipped_files , value = TRUE ) morador_tbl <- read_fwf( morador_fn , fwf_widths( widths = morador_dictionary_df[ , 'length' ] , col_names = morador_dictionary_df[ , 'column_name' ] ) ) morador_df <- data.frame( morador_tbl ) names( morador_df ) <- tolower( names( morador_df ) ) Merge one household-level variable and also the post-stratification info onto the person-level table: dom_mor_df <- merge( domicilio_df[ c( 'cod_upa' , 'num_dom' , 'v6199' ) ] , morador_df ) pof_df <- merge( dom_mor_df , post_stratification_df ) stopifnot( nrow( pof_df ) == nrow( morador_df ) ) Save Locally   Save the object at any 
point: # pof_fn <- file.path( path.expand( "~" ) , "POF" , "this_file.rds" ) # saveRDS( pof_df , file = pof_fn , compress = FALSE ) Load the same object: # pof_df <- readRDS( pof_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) pre_stratified_design <- svydesign( id = ~ cod_upa , strata = ~ estrato_pof , weights = ~ peso , data = pof_df , nest = TRUE ) population_totals <- aggregate( peso_final ~ pos_estrato , data = pof_df , sum ) names( population_totals ) <- c( 'pos_estrato' , 'Freq' ) pof_design <- postStratify( pre_stratified_design , ~ pos_estrato , population_totals ) Variable Recoding Add new columns to the data set: pof_design <- update( pof_design , one = 1 , food_security = factor( v6199 , levels = 1:4 , labels = c( 'food secure' , 'mild' , 'moderate' , 'severe' ) ) , age_categories = factor( 1 + findInterval( v0403 , c( 20 , 25 , 30 , 35 , 45 , 55 , 65 , 75 ) ) , levels = 1:9 , labels = c( "under 20" , "20-24" , "25-29" , "30-34" , "35-44" , "45-54" , "55-64" , "65-74" , "75+" ) ) , sex = factor( v0404 , levels = 1:2 , labels = c( 'male' , 'female' ) ) , urban = as.numeric( tipo_situacao_reg == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pof_design , "sampling" ) != 0 ) svyby( ~ one , ~ sex , pof_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pof_design ) svyby( ~ one , ~ sex , pof_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ renda_total , pof_design ) svyby( ~ renda_total , ~ sex , pof_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ age_categories , pof_design ) svyby( ~ age_categories , ~ sex , pof_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ renda_total , pof_design ) svyby( ~ renda_total , ~ sex , pof_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ age_categories , pof_design ) svyby( ~ age_categories , ~ sex , pof_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ renda_total , pof_design , 0.5 ) svyby( ~ renda_total , ~ sex , pof_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ renda_total , denominator = ~ anos_estudo , pof_design , na.rm = TRUE ) Subsetting Restrict the survey design to credit card holders: sub_pof_design <- subset( pof_design , v0409 > 0 ) Calculate the mean (average) of this subset: svymean( ~ renda_total , sub_pof_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ renda_total , pof_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ renda_total , ~ sex , pof_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pof_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ renda_total , pof_design ) Include the complex sample design 
effect in the result for a specific statistic: # SRS without replacement svymean( ~ renda_total , pof_design , deff = TRUE ) # SRS with replacement svymean( ~ renda_total , pof_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ urban , pof_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( renda_total ~ urban , pof_design ) Perform a chi-squared test of association for survey data: svychisq( ~ urban + age_categories , pof_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( renda_total ~ urban + age_categories , pof_design ) summary( glm_result ) Replication Example This example matches the 2017-2018 person-level food security estimates from Tabela 3: person_level_food_security <- svymean( ~ food_security , pof_design , na.rm = TRUE ) stopifnot( all.equal( round( coef( person_level_food_security ) , 2 ) , c( 0.59 , 0.27 , 0.09 , 0.05 ) , check.attributes = FALSE ) ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for POF users, this code calculates the gini coefficient on complex sample survey data: library(convey) pof_design <- convey_prep( pof_design ) svygini( ~ renda_total , pof_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for POF users, this code replicates previously-presented examples: library(srvyr) pof_srvyr_design <- as_survey( pof_design ) Calculate the mean (average) of a linear variable, overall and by groups: pof_srvyr_design %>% summarize( mean = survey_mean( renda_total ) ) pof_srvyr_design %>% group_by( sex ) %>% summarize( mean = survey_mean( renda_total ) ) "],["residential-energy-consumption-survey-recs.html", "Residential Energy Consumption Survey (RECS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Residential Energy Consumption Survey (RECS) A periodic study conducted to provide detailed information about energy usage in U.S. homes. One table with one row per sampled housing unit. A complex sample survey designed to generalize to U.S. homes occupied as primary residences. Released approximately every five years since 1979. Prepared by the Energy Information Administration, with help from IMG-Crown and RTI International. 
Please skim before you begin: Annual Energy Outlook 2023 Household Characteristics Technical Documentation Summary A haiku regarding this microdata: # housing code dogma # even satan ceased sweat since # eighth sin: central air Download, Import, Preparation Download and import the most recent sas file: library(haven) sas_tf <- tempfile() sas_url <- "https://www.eia.gov/consumption/residential/data/2020/sas/recs2020_public_v2.zip" download.file( sas_url , sas_tf , mode = 'wb' ) recs_tbl <- read_sas( sas_tf ) recs_df <- data.frame( recs_tbl ) names( recs_df ) <- tolower( names( recs_df ) ) recs_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # recs_fn <- file.path( path.expand( "~" ) , "RECS" , "this_file.rds" ) # saveRDS( recs_df , file = recs_fn , compress = FALSE ) Load the same object: # recs_df <- readRDS( recs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) recs_design <- svrepdesign( data = recs_df , weight = ~ nweight , repweights = 'nweight[1-9]+' , type = 'JK1' , combined.weights = TRUE , scale = 59 / 60 , mse = TRUE ) Variable Recoding Add new columns to the data set: recs_design <- update( recs_design , main_heating_fuel = factor( fuelheat , levels = c( -2 , 5 , 1 , 2 , 3 , 7 , 99 ) , labels = c( 'Not applicable' , 'Electricity' , 'Natural gas from underground pipes' , 'Propane (bottled gas)' , 'Fuel oil' , 'Wood or pellets' , 'Other' ) ) , rooftype = factor( rooftype , levels = c( -2 , 1:6 , 99 ) , labels = c( 'Not applicable' , 'Ceramic or clay tiles' , 'Wood shingles/shakes' , 'Metal' , 'Slate or synthetic slate' , 'Shingles (composition or asphalt)' , 'Concrete tiles' , 'Other' ) ) , swimpool_binary = ifelse( swimpool %in% 0:1 , swimpool , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( recs_design , "sampling" ) != 0 ) svyby( ~ one , ~ main_heating_fuel , recs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , recs_design ) svyby( ~ one , ~ main_heating_fuel , recs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ totsqft_en , recs_design ) svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ rooftype , recs_design ) svyby( ~ rooftype , ~ main_heating_fuel , recs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ totsqft_en , recs_design ) svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ rooftype , recs_design ) svyby( ~ rooftype , ~ main_heating_fuel , recs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ totsqft_en , recs_design , 0.5 ) svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ totcsqft , denominator = ~ totsqft_en , recs_design ) Subsetting Restrict the survey design to households that cook three or more hot meals per day: sub_recs_design <- subset( recs_design , nummeal == 1 ) Calculate the mean (average) of this subset: svymean( ~ totsqft_en , sub_recs_design ) Measures of Uncertainty Extract the coefficient, standard error, 
confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ totsqft_en , recs_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( recs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ totsqft_en , recs_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ totsqft_en , recs_design , deff = TRUE ) # SRS with replacement svymean( ~ totsqft_en , recs_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ swimpool_binary , recs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( totsqft_en ~ swimpool_binary , recs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ swimpool_binary + rooftype , recs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( totsqft_en ~ swimpool_binary + rooftype , recs_design ) summary( glm_result ) Replication Example This example matches the statistic, standard error, and relative standard error shown on PDF page 8 of Using the microdata file to compute estimates and relative standard errors (RSEs) sas_v1_tf <- tempfile() sas_v1_url <- "https://www.eia.gov/consumption/residential/data/2020/sas/recs2020_public_v1.zip" download.file( sas_v1_url , sas_v1_tf , mode = 'wb' ) recs_v1_tbl <- read_sas( sas_v1_tf ) recs_v1_df <- data.frame( recs_v1_tbl ) names( recs_v1_df ) <- tolower( names( recs_v1_df ) ) recs_v1_design <- svrepdesign( data = recs_v1_df , weight = ~ nweight , repweights = 'nweight[1-9]+' , type = 'JK1' , combined.weights = TRUE , scale = 59 / 60 , mse = TRUE ) recs_v1_design <- update( recs_v1_design , natural_gas_mainspace_heat = as.numeric( fuelheat == 1 ) ) result <- svytotal( ~ natural_gas_mainspace_heat , recs_v1_design ) stopifnot( round( coef( result ) , 0 ) == 56245389 ) stopifnot( round( SE( result ) , 0 ) == 545591 ) stopifnot( round( 100 * SE( result ) / coef( result ) , 2 ) == 0.97 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
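srvyr mirrors quantile estimation as well. As a brief sketch, not drawn from the original example set, survey_median parallels the svyquantile call shown earlier, again assuming the recs_design object defined above:
library(srvyr)
# design-based median square footage, the srvyr analogue of svyquantile at 0.5
as_survey( recs_design ) %>%
    summarize( median_sqft = survey_median( totsqft_en ) )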
As a starting point for RECS users, this code replicates previously-presented examples: library(srvyr) recs_srvyr_design <- as_survey( recs_design ) Calculate the mean (average) of a linear variable, overall and by groups: recs_srvyr_design %>% summarize( mean = survey_mean( totsqft_en ) ) recs_srvyr_design %>% group_by( main_heating_fuel ) %>% summarize( mean = survey_mean( totsqft_en ) ) "],["rapid-surveys-system-rss.html", "Rapid Surveys System (RSS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Rapid Surveys System (RSS) The standardized platform to answer time-sensitive questions about emerging and priority health issues. One table with one row per AmeriSpeak or KnowledgePanel respondent. A cross-sectional survey generalizing to the noninstitutionalized adult population of the U.S. Releases expected four times per year. Conducted by the National Center for Health Statistics at the Centers for Disease Control. Please skim before you begin: NCHS Rapid Surveys System (RSS): Round 1 Survey Description Quality Profile, Rapid Surveys System Round 1 A haiku regarding this microdata: # first response heroes # question design thru publish # time 'doxed by zeno Download, Import, Preparation Download and import the first round: library(haven) sas_url <- "https://www.cdc.gov/nchs/data/rss/rss1_puf_t1.sas7bdat" rss_tbl <- read_sas( sas_url ) rss_df <- data.frame( rss_tbl ) names( rss_df ) <- tolower( names( rss_df ) ) rss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # rss_fn <- file.path( path.expand( "~" ) , "RSS" , "this_file.rds" ) # saveRDS( rss_df , file = rss_fn , compress = FALSE ) Load the same object: # rss_df <- readRDS( rss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) rss_design <- svydesign( ~ p_psu , strata = ~ p_strata , data = rss_df , weights = ~ weight_m1 , nest = TRUE ) Variable Recoding Add new columns to the data set: rss_design <- update( rss_design , how_often_use_cleaner_purifier = factor( ven_use , levels = c( -9:-6 , 0:3 ) , labels = c( "Don't Know" , "Question not asked" , "Explicit refusal/REF" , "Skipped/Implied refusal" , "Never" , "Rarely" , "Sometimes" , "Always" ) ) , has_health_insurance = ifelse( p_insur >= 0 , p_insur , NA ) , metropolitan = factor( as.numeric( p_metro_r == 1 ) , levels = 0:1 , labels = c( 'No' , 'Yes' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( rss_design , "sampling" ) != 0 ) svyby( ~ one , ~ metropolitan , rss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , rss_design ) svyby( ~ one , ~ metropolitan , rss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ p_hhsize_r , rss_design ) svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ how_often_use_cleaner_purifier , rss_design ) svyby( ~ how_often_use_cleaner_purifier , ~ metropolitan , rss_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ p_hhsize_r , rss_design ) svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and 
by groups: svytotal( ~ how_often_use_cleaner_purifier , rss_design ) svyby( ~ how_often_use_cleaner_purifier , ~ metropolitan , rss_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ p_hhsize_r , rss_design , 0.5 ) svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ p_agec_r , denominator = ~ p_hhsize_r , rss_design ) Subsetting Restrict the survey design to adults that most of the time or always wear sunscreen: sub_rss_design <- subset( rss_design , sun_useface >= 3 ) Calculate the mean (average) of this subset: svymean( ~ p_hhsize_r , sub_rss_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ p_hhsize_r , rss_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( rss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ p_hhsize_r , rss_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ p_hhsize_r , rss_design , deff = TRUE ) # SRS with replacement svymean( ~ p_hhsize_r , rss_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ has_health_insurance , rss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( p_hhsize_r ~ has_health_insurance , rss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ has_health_insurance + how_often_use_cleaner_purifier , rss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( p_hhsize_r ~ has_health_insurance + how_often_use_cleaner_purifier , rss_design ) summary( glm_result ) Replication Example This example matches the statistic and confidence intervals from the “Ever uses a portable air cleaner or purifier in home” page of the Air cleaners and purifiers dashboard: result <- svymean( ~ as.numeric( ven_use > 0 ) , subset( rss_design , ven_use >= 0 ) ) stopifnot( round( coef( result ) , 3 ) == .379 ) stopifnot( round( confint( result )[1] , 3 ) == 0.366 ) stopifnot( round( confint( result )[2] , 3 ) == 0.393 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
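For instance, grouped quantiles follow the same pipe-able pattern as grouped means. This is a minimal sketch, not part of the original walkthrough, and it assumes the rss_srvyr_design object constructed in the next code block:
# grouped median household size with a srvyr design
# (a sketch: assumes library(srvyr) is loaded and rss_srvyr_design exists, as in the next block)
rss_srvyr_design %>%
    group_by( metropolitan ) %>%
    summarize( median_hhsize = survey_median( p_hhsize_r , vartype = "ci" ) )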
As a starting point for RSS users, this code replicates previously-presented examples: library(srvyr) rss_srvyr_design <- as_survey( rss_design ) Calculate the mean (average) of a linear variable, overall and by groups: rss_srvyr_design %>% summarize( mean = survey_mean( p_hhsize_r ) ) rss_srvyr_design %>% group_by( metropolitan ) %>% summarize( mean = survey_mean( p_hhsize_r ) ) "],["survey-of-business-owners-sbo.html", "Survey of Business Owners (SBO) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Survey of Business Owners (SBO) Before its replacement in 2018 by the Annual Business Survey, nearly every tax-filing sole proprietorship, partnership, and corporation nationwide completed this questionnaire, with 2007 the only microdata year. One table with one row per firm per state per industry, except eight collapsed geographies. A complex sample survey designed to generalize to most firms in the United States; public microdata includes classifiable (non-identifiable) firms, i.e. nearly all businesses but only about half of workers. Released quinquennially from 1972 until 2012 in the Economic Census with no updates expected. Administered by the U.S. Census Bureau. Annual Business Survey now conducted jointly with the National Center for Science and Engineering Statistics within the National Science Foundation. Please skim before you begin: 2007 Survey of Business Owners (SBO) Public Use Microdata Sample (PUMS) Data Users Guide Comparability to the Annual Business Survey (ABS), the Nonemployer Statistics by Demographics (NES-D) series, and the Annual Survey of Entrepreneurs (ASE) At a Glance A haiku regarding this microdata: # butchers, chandlers, baked # sea shanty, filial pie # call your mom and pop Function Definitions This survey uses a dual design variance estimation technique described in the Data Users Guide. Most users do not need to study these functions carefully. Define functions specific to only this dataset: sbo_MIcombine <- function( x , adjustment = 1.992065 ){ # pull the structure of a variance-covariance matrix variance.shell <- suppressWarnings( vcov( x$var[[1]] ) ) # initiate a function that will overwrite the diagonals. diag.replacement <- function( z ){ diag( variance.shell ) <- coef( z ) variance.shell } # overwrite all the diagonals in each variance object coef.variances <- lapply( x$var , diag.replacement ) # add the ten variance objects, then divide by ten midpoint <- Reduce( '+' , coef.variances ) / 10 # initiate another function that takes some object, # subtracts the midpoint, squares it, divides by ninety midpoint.var <- function( z ){ 1/10 * ( ( midpoint - z )^2 / 9 ) } # sum up all the differences into a single object variance <- Reduce( '+' , lapply( coef.variances , midpoint.var ) ) # adjust every number with the factor in the user guide adj_var <- adjustment * variance # construct a result that looks like other MIcombine results rval <- list( coefficients = coef( x$coef ) , variance = adj_var ) # assign the MIresult class, like other MIcombine results class( rval ) <- 'MIresult' rval } sbo_with <- function( this_design , expr , ... ){ pf <- parent.frame() expr <- substitute( expr ) expr$design <- as.name(".design") # this pulls in means, medians, totals, etc.
# notice it uses this_design$coef results <- eval( expr , list( .design = this_design$coef ) ) # this is used to calculate the variance, adjusted variance, standard error # notice it uses this_design$var variances <- lapply( this_design$var$designs , function( .design ){ eval( expr , list( .design = .design ) , enclos = pf ) } ) # combine both results.. rval <- list( coef = results , var = variances ) # ..into a brand new object class class( rval ) <- 'imputationResultList' rval } sbo_subset <- function( x , ... ){ # subset the survey object coef.sub <- subset( x$coef , ... ) # replicate `var.sub` so it's got all the same attributes as `x$var` var.sub <- x$var # but then overwrite the `designs` attribute with a subset var.sub$designs <- lapply( x$var$designs , subset , ... ) # now re-create the `sbosvyimputationList` just as before.. sub.svy <- list( coef = coef.sub , var = var.sub ) # ..and give it the same class class( sub.svy ) <- 'sbosvyimputationList' sub.svy$call <- sys.call(-1) sub.svy } sbo_update <- function( x , ... ){ # update the survey object that's going to be used for # means, medians, totals, etc. coef.upd <- update( x$coef , ... ) # replicate `var.upd` so it's got all the same attributes as `x$var` var.upd <- x$var # but then overwrite the `designs` attribute with an update var.upd$designs <- lapply( x$var$designs , update , ... ) # now re-create the `sbosvyimputationList` just as before upd.svy <- list( coef = coef.upd , var = var.upd ) # ..and give it the same class class( upd.svy ) <- 'sbosvyimputationList' upd.svy } sbo_degf <- function( x ) degf( x$coef ) Download, Import, Preparation Download and import the file containing records for both coefficient estimates and variance estimation: library(httr) library(readr) tf <- tempfile() this_url <- "https://www2.census.gov/programs-surveys/sbo/datasets/2007/pums_csv.zip" GET( this_url , write_disk( tf ) , progress() ) sbo_tbl <- read_csv( tf ) sbo_df <- data.frame( sbo_tbl ) names( sbo_df ) <- tolower( names( sbo_df ) ) sbo_df[ , 'one' ] <- 1 Calculate the weights used for variance estimation: sbo_df[ , 'newwgt' ] <- 10 * sbo_df[ , 'tabwgt' ] * sqrt( 1 - 1 / sbo_df[ , 'tabwgt' ] ) Add business ownership percentages for both gender and ethnicity: # replace percent missings with zeroes for( i in 1:4 ) sbo_df[ is.na( sbo_df[ , paste0( 'pct' , i ) ] ) , paste0( 'pct' , i ) ] <- 0 # sum up ownership ethnicity and gender sbo_df[ , 'hispanic_pct' ] <- sbo_df[ , 'nonhispanic_pct' ] <- 0 sbo_df[ , 'male_pct' ] <- sbo_df[ , 'female_pct' ] <- 0 # loop through the first four owners' ethnicity and sex variables for( i in 1:4 ) { sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'H' , 'hispanic_pct' ] <- sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'H' , 'hispanic_pct' ] + sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'H' , paste0( 'pct' , i ) ] sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'N' , 'nonhispanic_pct' ] <- sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'N' , 'nonhispanic_pct' ] + sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'N' , paste0( 'pct' , i ) ] sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'M' , 'male_pct' ] <- sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'M' , 'male_pct' ] + sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'M' , paste0( 'pct' , i ) ] sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'F' , 'female_pct' ] <- sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'F' , 'female_pct' ] + sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'F' , paste0( 'pct' , i ) ] } Save Locally   Save the object at any point: # sbo_fn <- file.path( path.expand( "~" ) , "SBO" , "this_file.rds" ) # saveRDS( sbo_df , file = sbo_fn , compress = FALSE ) Load the
same object: # sbo_df <- readRDS( sbo_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: library(survey) library(mitools) # break random groups into ten separate data.frame objects within a list var_list <- NULL for( i in 1:10 ) { var_list <- c( var_list , list( subset( sbo_df , rg == i ) ) ) } sbo_coef <- svydesign( id = ~ 1 , weight = ~ tabwgt , data = sbo_df ) sbo_var <- svydesign( id = ~ 1 , weight = ~ newwgt , data = imputationList( var_list ) ) sbo_design <- list( coef = sbo_coef , var = sbo_var ) class( sbo_design ) <- 'sbosvyimputationList' Variable Recoding Add new columns to the data set: sbo_design <- sbo_update( sbo_design , established_before_2000 = ifelse( established %in% c( '0' , 'A' ) , NA , as.numeric( established < 4 ) ) , healthins = factor( healthins , levels = 1:2 , labels = c( "offered health insurance" , "did not offer health insurance" ) ) , hispanic_ownership = factor( ifelse( hispanic_pct == nonhispanic_pct , 2 , ifelse( hispanic_pct > nonhispanic_pct , 1 , ifelse( nonhispanic_pct > hispanic_pct , 3 , NA ) ) ) , levels = 1:3 , labels = c( 'hispanic' , 'equally hisp/non' , 'non-hispanic' ) ) , gender_ownership = factor( ifelse( male_pct == female_pct , 2 , ifelse( male_pct > female_pct , 1 , ifelse( female_pct > male_pct , 3 , NA ) ) ) , levels = 1:3 , labels = c( 'male' , 'equally male/female' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svyby( ~ one , ~ one , unwtd.count ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ one , ~ gender_ownership , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svytotal( ~ one ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ one , ~ gender_ownership , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svymean( ~ n07_employer , na.rm = TRUE ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ n07_employer , ~ gender_ownership , svymean , na.rm = TRUE ) ) ) Calculate the sum of a linear variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svytotal( ~ receipts_noisy ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svytotal( ~ n07_employer , na.rm = TRUE ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ n07_employer , ~ gender_ownership , svytotal , na.rm = TRUE ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svyquantile( ~ receipts_noisy , 0.5 , se = TRUE ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: sbo_MIcombine( sbo_with( sbo_design , svyratio( numerator = ~ receipts_noisy , denominator = ~ employment_noisy ) ) ) Subsetting Restrict the survey design to firms jointly owned by a husband and wife: sub_sbo_design <-
sbo_subset( sbo_design , husbwife %in% 1:3 ) Calculate the mean (average) of this subset: sbo_MIcombine( sbo_with( sub_sbo_design , svymean( ~ receipts_noisy ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: sbo_degf( sbo_design ) Calculate the complex sample survey-adjusted variance of any statistic: sbo_MIcombine( sbo_with( sbo_design , svyvar( ~ receipts_noisy ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy , deff = TRUE ) ) ) # SRS with replacement sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # # sbo_MIsvyciprop( ~ established_before_2000 , sbo_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # # sbo_MIsvyttest( receipts_noisy ~ established_before_2000 , sbo_design ) Perform a chi-squared test of association for survey data: # # sbo_MIsvychisq( ~ established_before_2000 + n07_employer , sbo_design ) Perform a survey-weighted generalized linear model: glm_result <- sbo_MIcombine( sbo_with( sbo_design , svyglm( receipts_noisy ~ established_before_2000 + n07_employer ) ) ) glm_result Replication Example This example matches the statistics and relative standard errors from three Appendix B columns: hispanic_receipts_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ hispanic_ownership , svytotal ) ) ) hispanic_payroll_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ payroll_noisy , ~ hispanic_ownership , svytotal ) ) ) hispanic_employment_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ employment_noisy , ~ hispanic_ownership , svytotal ) ) ) Estimates at the U.S. Level using the PUMS Tables for: stopifnot( round( coef( hispanic_receipts_result )[ 'hispanic' ] , 0 ) == 350763923 ) stopifnot( round( coef( hispanic_receipts_result )[ 'equally hisp/non' ] , 0 ) == 56166354 ) stopifnot( round( coef( hispanic_receipts_result )[ 'non-hispanic' ] , 0 ) == 10540609303 ) stopifnot( round( coef( hispanic_payroll_result )[ 'hispanic' ] , 0 ) == 54367702 ) stopifnot( round( coef( hispanic_payroll_result )[ 'equally hisp/non' ] , 0 ) == 11083148 ) stopifnot( round( coef( hispanic_payroll_result )[ 'non-hispanic' ] , 0 ) == 1875353228 ) stopifnot( round( coef( hispanic_employment_result )[ 'hispanic' ] , 0 ) == 2026406 ) stopifnot( round( coef( hispanic_employment_result )[ 'equally hisp/non' ] , 0 ) == 400152 ) stopifnot( round( coef( hispanic_employment_result )[ 'non-hispanic' ] , 0 ) == 56889606 ) Relative Standard Errors of Estimates at the U.S. 
Level using the PUMS Tables for: stopifnot( round( cv( hispanic_receipts_result )[ 'hispanic' ] , 2 ) == 0.02 ) stopifnot( round( cv( hispanic_receipts_result )[ 'equally hisp/non' ] , 2 ) == 0.06 ) stopifnot( round( cv( hispanic_receipts_result )[ 'non-hispanic' ] , 2 ) == 0 ) stopifnot( round( cv( hispanic_payroll_result )[ 'hispanic' ] , 2 ) == 0.01 ) stopifnot( round( cv( hispanic_payroll_result )[ 'equally hisp/non' ] , 2 ) == 0.06 ) stopifnot( round( cv( hispanic_payroll_result )[ 'non-hispanic' ] , 2 ) == 0 ) stopifnot( round( cv( hispanic_employment_result )[ 'hispanic' ] , 2 ) == 0.01 ) stopifnot( round( cv( hispanic_employment_result )[ 'equally hisp/non' ] , 2 ) == 0.05 ) stopifnot( round( cv( hispanic_employment_result )[ 'non-hispanic' ] , 2 ) == 0 ) "],["survey-of-consumer-finances-scf.html", "Survey of Consumer Finances (SCF) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey  ", " Survey of Consumer Finances (SCF) A comprehensive survey of household wealth, the U.S. central bank studies net worth across the country by asking about both active and passive income, mortgages, pensions, credit card debt, even car leases. Five implicates, each containing one row per sampled household to account for statistical uncertainty. A complex sample survey designed to generalize to the civilian non-institutional U.S. population. Released triennially since 1989. Administered by the Board of Governors of the Federal Reserve System. Please skim before you begin: Measuring Income and Wealth at the Top Using Administrative and Survey Data Wikipedia Entry A haiku regarding this microdata: # incomes, assets, debts # high net worth oversample # pig bank laproscope Function Definitions This survey uses a multiply-imputed variance estimation technique described in the 2004 Codebook. Most users do not need to study this function carefully. Define a function specific to only this dataset: scf_MIcombine <- function (results, variances, call = sys.call(), df.complete = Inf, ...) 
{ m <- length(results) oldcall <- attr(results, "call") if (missing(variances)) { variances <- suppressWarnings(lapply(results, vcov)) results <- lapply(results, coef) } vbar <- variances[[1]] cbar <- results[[1]] for (i in 2:m) { cbar <- cbar + results[[i]] # MODIFICATION: # vbar <- vbar + variances[[i]] } cbar <- cbar/m # MODIFICATION: # vbar <- vbar/m evar <- var(do.call("rbind", results)) r <- (1 + 1/m) * evar/vbar df <- (m - 1) * (1 + 1/r)^2 if (is.matrix(df)) df <- diag(df) if (is.finite(df.complete)) { dfobs <- ((df.complete + 1)/(df.complete + 3)) * df.complete * vbar/(vbar + evar) if (is.matrix(dfobs)) dfobs <- diag(dfobs) df <- 1/(1/dfobs + 1/df) } if (is.matrix(r)) r <- diag(r) rval <- list(coefficients = cbar, variance = vbar + evar * (m + 1)/m, call = c(oldcall, call), nimp = m, df = df, missinfo = (r + 2/(df + 3))/(r + 1)) class(rval) <- "MIresult" rval } Define a function to download and import each stata file: library(haven) scf_dta_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) this_tbl <- read_dta( this_tf ) this_df <- data.frame( this_tbl ) file.remove( this_tf ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the full, summary extract, and replicate weights tables: scf_df <- scf_dta_import( "https://www.federalreserve.gov/econres/files/scf2022s.zip" ) ext_df <- scf_dta_import( "https://www.federalreserve.gov/econres/files/scfp2022s.zip" ) scf_rw_df <- scf_dta_import( "https://www.federalreserve.gov/econres/files/scf2022rw1s.zip" ) Confirm both the full public data and the summary extract contain five records per family: stopifnot( nrow( scf_df ) == nrow( scf_rw_df ) * 5 ) stopifnot( nrow( scf_df ) == nrow( ext_df ) ) Confirm only the primary economic unit and the five implicate identifiers overlap: stopifnot( all( sort( intersect( names( scf_df ) , names( ext_df ) ) ) == c( 'y1' , 'yy1' ) ) ) stopifnot( all( sort( intersect( names( scf_df ) , names( scf_rw_df ) ) ) == c( 'y1' , 'yy1' ) ) ) stopifnot( all( sort( intersect( names( ext_df ) , names( scf_rw_df ) ) ) == c( 'y1' , 'yy1' ) ) ) Remove the implicate identifier from the replicate weights table, add a column of fives for weighting: scf_rw_df[ , 'y1' ] <- NULL scf_df[ , 'five' ] <- 5 Save Locally   Save the object at any point: # scf_fn <- file.path( path.expand( "~" ) , "SCF" , "this_file.rds" ) # saveRDS( scf_df , file = scf_fn , compress = FALSE ) Load the same object: # scf_df <- readRDS( scf_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Break the main table into five different implicates based on the final character of the column y1: library(stringr) s1_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 1 , ] s2_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 2 , ] s3_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 3 , ] s4_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 4 , ] s5_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 5 , ] Combine these into a single list, then merge each implicate with the summary extract: scf_imp <- list( s1_df , s2_df , s3_df , s4_df , s5_df ) scf_list <- lapply( scf_imp , merge , ext_df ) Replace all missing values in the replicate weights table with zeroes, multiply the replicate weights by the multiplication factor, then only keep the unique identifier and the final (combined) replicate weights: scf_rw_df[ is.na( scf_rw_df ) ] <- 0 scf_rw_df[ , paste0( 'wgt' , 1:999 ) ] <- scf_rw_df[ , 
paste0( 'wt1b' , 1:999 ) ] * scf_rw_df[ , paste0( 'mm' , 1:999 ) ] scf_rw_df <- scf_rw_df[ , c( 'yy1' , paste0( 'wgt' , 1:999 ) ) ] Sort both the five implicates and also the replicate weights table by the unique identifier: scf_list <- lapply( scf_list , function( w ) w[ order( w[ , 'yy1' ] ) , ] ) scf_rw_df <- scf_rw_df[ order( scf_rw_df[ , 'yy1' ] ) , ] Define the design: library(survey) library(mitools) scf_design <- svrepdesign( weights = ~wgt , repweights = scf_rw_df[ , -1 ] , data = imputationList( scf_list ) , scale = 1 , rscales = rep( 1 / 998 , 999 ) , mse = FALSE , type = "other" , combined.weights = TRUE ) Variable Recoding Add new columns to the data set: scf_design <- update( scf_design , hhsex = factor( hhsex , levels = 1:2 , labels = c( "male" , "female" ) ) , married = as.numeric( married == 1 ) , edcl = factor( edcl , levels = 1:4 , labels = c( "less than high school" , "high school or GED" , "some college" , "college degree" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: scf_MIcombine( with( scf_design , svyby( ~ five , ~ five , unwtd.count ) ) ) scf_MIcombine( with( scf_design , svyby( ~ five , ~ hhsex , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: scf_MIcombine( with( scf_design , svytotal( ~ five ) ) ) scf_MIcombine( with( scf_design , svyby( ~ five , ~ hhsex , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: scf_MIcombine( with( scf_design , svymean( ~ networth ) ) ) scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: scf_MIcombine( with( scf_design , svymean( ~ edcl ) ) ) scf_MIcombine( with( scf_design , svyby( ~ edcl , ~ hhsex , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: scf_MIcombine( with( scf_design , svytotal( ~ networth ) ) ) scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: scf_MIcombine( with( scf_design , svytotal( ~ edcl ) ) ) scf_MIcombine( with( scf_design , svyby( ~ edcl , ~ hhsex , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: scf_MIcombine( with( scf_design , svyquantile( ~ networth , 0.5 , se = TRUE , interval.type = 'quantile' ) ) ) scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svyquantile , 0.5 , se = TRUE , interval.type = 'quantile' , ci = TRUE ) ) ) Estimate a ratio: scf_MIcombine( with( scf_design , svyratio( numerator = ~ income , denominator = ~ networth ) ) ) Subsetting Restrict the survey design to labor force participants: sub_scf_design <- subset( scf_design , lf == 1 ) Calculate the mean (average) of this subset: scf_MIcombine( with( sub_scf_design , svymean( ~ networth ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- scf_MIcombine( with( scf_design , svymean( ~ networth ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) 
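These standard errors come from the modified scf_MIcombine() defined earlier: it keeps the first implicate's full sampling variance and adds the between-implicate variance inflated by ( m + 1 ) / m, rather than averaging the five sampling variances the way the stock mitools::MIcombine() does. A minimal sketch verifying that decomposition by hand, assuming the scf_design object defined above:
# five per-implicate estimates of mean net worth, with their sampling variances
imp_results <- with( scf_design , svymean( ~ networth ) )
ests <- sapply( imp_results , coef )
vars <- sapply( imp_results , vcov )
m <- length( ests )
# first implicate's sampling variance plus the inflated between-implicate variance
by_hand_variance <- vars[[1]] + var( ests ) * ( m + 1 ) / m
combined_result <- scf_MIcombine( with( scf_design , svymean( ~ networth ) ) )
stopifnot( all.equal( as.numeric( SE( combined_result )^2 ) , as.numeric( by_hand_variance ) ) )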
Calculate the degrees of freedom of any survey design object: degf( scf_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: scf_MIcombine( with( scf_design , svyvar( ~ networth ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement scf_MIcombine( with( scf_design , svymean( ~ networth , deff = TRUE ) ) ) # SRS with replacement scf_MIcombine( with( scf_design , svymean( ~ networth , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ married , scf_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( networth ~ married , scf_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ married + edcl , scf_design ) Perform a survey-weighted generalized linear model: glm_result <- scf_MIcombine( with( scf_design , svyglm( networth ~ married + edcl ) ) ) summary( glm_result ) Replication Example This example matches the "Table 4" tab's cell Y6 of the Excel Based on Public Data: mean_net_worth <- scf_MIcombine( with( scf_design , svymean( ~ networth ) ) ) stopifnot( round( coef( mean_net_worth ) / 1000 , 1 ) == 1059.5 ) This example comes within $500 of the standard error of mean net worth from Table 2 of the Federal Reserve Bulletin, displaying the minor differences between the Internal Data and Public Data: stopifnot( abs( 23.2 - round( SE( mean_net_worth ) / 1000 , 1 ) ) < 0.5 ) This example matches the "Table 4" tab's cell X6 of the Excel Based on Public Data: # compute quantile with all five implicates stacked (not the recommended technique) fake_design <- svydesign( ~ 1 , data = ext_df[ c( 'networth' , 'wgt' ) ] , weights = ~ wgt ) median_net_worth_incorrect_errors <- svyquantile( ~ networth , fake_design , 0.5 ) stopifnot( round( coef( median_net_worth_incorrect_errors ) / 1000 , 2 ) == 192.7 ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for SCF users, this code calculates the gini coefficient on complex sample survey data: library(convey) scf_design$designs <- lapply( scf_design$designs , convey_prep ) scf_MIcombine( with( scf_design , svygini( ~ networth ) ) ) "],["survey-of-income-and-program-participation-sipp.html", "Survey of Income and Program Participation (SIPP) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Survey of Income and Program Participation (SIPP) The primary longitudinal assessment of income fluctuation, labor force participation, and social programs. Annual tables with one record per month per person per sampled household, plus time period weights. A complex sample generalizing to the U.S. civilian non-institutionalized population across varying time periods. Multi-year panels since the 1980s, with its current and now permanent four-year rotation beginning in 2018. Administered and financed by the US Census Bureau.
Please skim before you begin: 2023 Survey of Income and Program Participation Users’ Guide 2023 Data User Notes A haiku regarding this microdata: # federal programs # poverty oversample # monthly dynamics Download, Import, Preparation Determine which variables from the main table to import: variables_to_keep <- c( 'ssuid' , 'pnum' , 'monthcode' , 'spanel' , 'swave' , 'erelrpe' , 'tlivqtr' , 'wpfinwgt' , 'rmesr' , 'thcyincpov' , 'tfcyincpov' , 'tehc_st' , 'rhicovann' , 'rfpov' , 'thnetworth' , 'tftotinc' ) Download and import the latest main file: library(httr) library(data.table) main_tf <- tempfile() main_url <- paste0( "https://www2.census.gov/programs-surveys/sipp/" , "data/datasets/2023/pu2023_csv.zip" ) GET( main_url , write_disk( main_tf ) , progress() ) main_csv <- unzip( main_tf , exdir = tempdir() ) sipp_main_dt <- fread( main_csv , sep = "|" , select = toupper( variables_to_keep ) ) sipp_main_df <- data.frame( sipp_main_dt ) names( sipp_main_df ) <- tolower( names( sipp_main_df ) ) Download and import the appropriate replicate weights file: rw_tf <- tempfile() rw_url <- paste0( "https://www2.census.gov/programs-surveys/sipp/" , "data/datasets/2023/rw2023_csv.zip" ) GET( rw_url , write_disk( rw_tf ) , progress() ) rw_csv <- unzip( rw_tf , exdir = tempdir() ) sipp_rw_dt <- fread( rw_csv , sep = "|" ) sipp_rw_df <- data.frame( sipp_rw_dt ) names( sipp_rw_df ) <- tolower( names( sipp_rw_df ) ) Limit both files to December records for a point-in-time estimate, then merge: sipp_df <- merge( sipp_main_df[ sipp_main_df[ , 'monthcode' ] %in% 12 , ] , sipp_rw_df[ sipp_rw_df[ , 'monthcode' ] %in% 12 , ] , by = c( 'ssuid' , 'pnum' , 'monthcode' , 'spanel' , 'swave' ) ) stopifnot( nrow( sipp_df ) == sum( sipp_rw_df[ , 'monthcode' ] %in% 12 ) ) Save Locally   Save the object at any point: # sipp_fn <- file.path( path.expand( "~" ) , "SIPP" , "this_file.rds" ) # saveRDS( sipp_df , file = sipp_fn , compress = FALSE ) Load the same object: # sipp_df <- readRDS( sipp_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) sipp_design <- svrepdesign( data = sipp_df , weights = ~ wpfinwgt , repweights = "repwgt([1-9]+)" , type = "Fay" , rho = 0.5 ) Variable Recoding Add new columns to the data set: rmesr_values <- c( "With a job entire month, worked all weeks", "With a job all month, absent from work without pay 1+ weeks, absence not due to layoff", "With a job all month, absent from work without pay 1+ weeks, absence due to layoff", "With a job at least 1 but not all weeks, no time on layoff and no time looking for work", "With a job at least 1 but not all weeks, some weeks on layoff or looking for work", "No job all month, on layoff or looking for work all weeks", "No job all month, at least one but not all weeks on layoff or looking for work", "No job all month, no time on layoff and no time looking for work" ) sipp_design <- update( sipp_design , one = 1 , employment_status = factor( rmesr , levels = 1:8 , labels = rmesr_values ) , household_below_poverty = as.numeric( thcyincpov < 1 ) , family_below_poverty = as.numeric( tfcyincpov < 1 ) , state_name = factor( as.numeric( tehc_st ) , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L, 60L, 61L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", 
"District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico", "Foreign Country") ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( sipp_design , "sampling" ) != 0 ) svyby( ~ one , ~ state_name , sipp_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , sipp_design ) svyby( ~ one , ~ state_name , sipp_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ tftotinc , sipp_design , na.rm = TRUE ) svyby( ~ tftotinc , ~ state_name , sipp_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ employment_status , sipp_design , na.rm = TRUE ) svyby( ~ employment_status , ~ state_name , sipp_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ tftotinc , sipp_design , na.rm = TRUE ) svyby( ~ tftotinc , ~ state_name , sipp_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ employment_status , sipp_design , na.rm = TRUE ) svyby( ~ employment_status , ~ state_name , sipp_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ tftotinc , sipp_design , 0.5 , na.rm = TRUE ) svyby( ~ tftotinc , ~ state_name , sipp_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ tftotinc , denominator = ~ rfpov , sipp_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals ever covered by health insurance during the year: sub_sipp_design <- subset( sipp_design , rhicovann == 1 ) Calculate the mean (average) of this subset: svymean( ~ tftotinc , sub_sipp_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ tftotinc , sipp_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ tftotinc , ~ state_name , sipp_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( sipp_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ tftotinc , sipp_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ tftotinc , sipp_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ tftotinc , sipp_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate 
near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ family_below_poverty , sipp_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( tftotinc ~ family_below_poverty , sipp_design ) Perform a chi-squared test of association for survey data: svychisq( ~ family_below_poverty + employment_status , sipp_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( tftotinc ~ family_below_poverty + employment_status , sipp_design ) summary( glm_result ) Replication Example This example matches statistics and standard errors from the Wealth and Asset Ownership for Households, by Type of Asset and Selected Characteristics: 2022: Restrict the design to permanent residence-based householders to match the count in Table 4: sipp_household_design <- subset( sipp_design , erelrpe %in% 1:2 & tlivqtr %in% 1:2 ) stopifnot( round( coef( svytotal( ~ one , sipp_household_design ) ) / 1000 , -2 ) == 134100 ) Compute Household Net Worth distribution and standard errors across the Total row of Tables 4 and 4A: sipp_household_design <- update( sipp_household_design , thnetworth_category = factor( findInterval( thnetworth , c( 1 , 5000 , 10000 , 25000 , 50000 , 100000 , 250000 , 500000 ) ) , levels = 0:8 , labels = c( "Zero or Negative" , "$1 to $4,999" , "$5,000 to $9,999" , "$10,000 to $24,999" , "$25,000 to $49,999" , "$50,000 to $99,999" , "$100,000 to $249,999" , "$250,000 to $499,999" , "$500,000 or over" ) ) ) results <- svymean( ~ thnetworth_category , sipp_household_design ) stopifnot( all.equal( as.numeric( round( coef( results ) * 100 , 1 ) ) , c( 11.1 , 6.8 , 3.5 , 5.7 , 5.6 , 7.8 , 15.9 , 14.4 , 29.2 ) ) ) stopifnot( all.equal( as.numeric( round( SE( results ) * 100 , 1 ) ) , c( 0.3 , 0.2 , 0.2 , 0.2 , 0.2 , 0.2 , 0.3 , 0.3 , 0.3 ) ) ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for SIPP users, this code calculates the gini coefficient on complex sample survey data: library(convey) sipp_design <- convey_prep( sipp_design ) svygini( ~ tftotinc , sipp_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
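For example, the income-to-poverty-threshold ratio estimated earlier with svyratio() has a direct srvyr analogue. A minimal sketch, not part of the original walkthrough, assuming the sipp_srvyr_design object constructed in the next code block:
# survey-weighted ratio of family income to the family poverty threshold
# (a sketch: assumes library(srvyr) is loaded and sipp_srvyr_design exists, as in the next block)
sipp_srvyr_design %>%
    summarize( inc_pov_ratio = survey_ratio( tftotinc , rfpov , na.rm = TRUE ) )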
As a starting point for SIPP users, this code replicates previously-presented examples: library(srvyr) sipp_srvyr_design <- as_survey( sipp_design ) Calculate the mean (average) of a linear variable, overall and by groups: sipp_srvyr_design %>% summarize( mean = survey_mean( tftotinc , na.rm = TRUE ) ) sipp_srvyr_design %>% group_by( state_name ) %>% summarize( mean = survey_mean( tftotinc , na.rm = TRUE ) ) "],["social-security-public-use-data-files-ssa.html", "Social Security Public-Use Data Files (SSA) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Social Security Public-Use Data Files (SSA) Microdata from administrative sources like the Master Beneficiary Record, Supplemental Security Record. Tables contain either one record per person or one record per person per year. A systematic sample of either social security number holders (most americans) or program recipients (current beneficiaries). Multiply 1% samples by 100 to get weighted statistics, 5% samples by 20. No expected release timeline. Released by the Office of Research, Evaluation, and Statistics, US Social Security Administration. Recommended Reading Two Methodology Documents: The 2006 Earnings Public-Use Microdata File: An Introduction Comparing Earnings Estimates from the 2006 Public-Use File and the Annual Statistical Supplement One Haiku: # annual earnings. # for pensioner payouts, see # the '04 extract Download, Import, Preparation Download and import the 1951-2006 one percent files with one record per person and per person-year: library(haven) library(httr) tf <- tempfile() ssa_url <- "https://www.ssa.gov/policy/docs/microdata/epuf/epuf2006_sas_files.zip" GET( ssa_url , write_disk( tf ) , progress() ) ssa_files <- unzip( tf , exdir = tempdir() ) ssa_fn <- grep( 'demographic' , ssa_files , value = TRUE ) annual_fn <- grep( 'annual' , ssa_files , value = TRUE ) ssa_tbl <- read_sas( ssa_fn ) annual_tbl <- read_sas( annual_fn ) ssa_df <- data.frame( ssa_tbl ) annual_df <- data.frame( annual_tbl ) names( ssa_df ) <- tolower( names( ssa_df ) ) names( annual_df ) <- tolower( names( annual_df ) ) Sum up 1951-1952 and 1953-2006 earnings, and also 1953-2006 credits, copying the naming convention: summed_earnings_5152 <- with( subset( annual_df , year_earn %in% 1951:1952 ) , aggregate( annual_earnings , list( id ) , sum ) ) names( summed_earnings_5152 ) <- c( 'id' , 'tot_cov_earn5152' ) summed_earnings_5306 <- with( subset( annual_df , year_earn > 1952 ) , aggregate( annual_earnings , list( id ) , sum ) ) names( summed_earnings_5306 ) <- c( 'id' , 'tot_cov_earn5306' ) summed_quarters_5306 <- with( subset( annual_df , year_earn > 1952 ) , aggregate( annual_qtrs , list( id ) , sum ) ) names( summed_quarters_5306 ) <- c( 'id' , 'qc5306' ) Isolate a single year of earnings: earnings_2006 <- annual_df[ annual_df[ , 'year_earn' ] == 2006 , c( 'id' , 'annual_earnings' ) ] names( earnings_2006 ) <- c( 'id' , 'tot_cov_earn06' ) Merge each new column on to the person-level table, then add zeroes to person-years without earnings: stopifnot( all( !is.na( ssa_df ) ) ) before_nrow <- nrow( ssa_df ) ssa_df <- merge( ssa_df , summed_earnings_5152 , all.x = TRUE ) ssa_df <- merge( ssa_df , summed_earnings_5306 , all.x = TRUE ) ssa_df <- merge( ssa_df , summed_quarters_5306 , all.x = TRUE ) ssa_df <- merge( ssa_df , earnings_2006 , all.x = TRUE ) ssa_df[ is.na( ssa_df ) ] <- 0 stopifnot( nrow( 
ssa_df ) == before_nrow ) Save Locally   Save the object at any point: # ssa_fn <- file.path( path.expand( "~" ) , "SSA" , "this_file.rds" ) # saveRDS( ssa_df , file = ssa_fn , compress = FALSE ) Load the same object: # ssa_df <- readRDS( ssa_fn ) Variable Recoding Add new columns to the data set: ssa_df <- transform( ssa_df , decade_of_birth = floor( yob / 10 ) * 10 , sex = factor( sex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , tot_cov_earn3706 = ( tot_cov_earn3750 + tot_cov_earn5152 + tot_cov_earn5306 ) , qc3706 = ( qc3750 + qc5152 + qc5306 ) , any_earnings_2006 = ( tot_cov_earn06 > 0 ) , earnings_periods = factor( ifelse( ( tot_cov_earn5152 + tot_cov_earn5306 > 0 ) & tot_cov_earn3750 > 0 , 1 , ifelse( tot_cov_earn5152 > 0 | tot_cov_earn5306 > 0 , 2 , ifelse( tot_cov_earn3750 > 0 , 3 , 4 ) ) ) , levels = 1:4 , labels = c( 'Earnings in both periods' , 'Earnings during 1951-2006 only' , 'Earnings during 1937-1950 only' , 'No earnings' ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( ssa_df ) table( ssa_df[ , "sex" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( ssa_df[ , "tot_cov_earn3706" ] ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , mean ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( ssa_df[ , "decade_of_birth" ] ) ) prop.table( table( ssa_df[ , c( "decade_of_birth" , "sex" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( ssa_df[ , "tot_cov_earn3706" ] ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , sum ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( ssa_df[ , "tot_cov_earn3706" ] , 0.5 ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , quantile , 0.5 ) Subsetting Limit your data.frame to individuals with at least forty lifetime credits: sub_ssa_df <- subset( ssa_df , qc3706 >= 40 ) Calculate the mean (average) of this subset: mean( sub_ssa_df[ , "tot_cov_earn3706" ] ) Measures of Uncertainty Calculate the variance, overall and by groups: var( ssa_df[ , "tot_cov_earn3706" ] ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , var ) Regression Models and Tests of Association Perform a t-test: t.test( tot_cov_earn3706 ~ any_earnings_2006 , ssa_df ) Perform a chi-squared test of association: this_table <- table( ssa_df[ , c( "any_earnings_2006" , "decade_of_birth" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( tot_cov_earn3706 ~ any_earnings_2006 + decade_of_birth , data = ssa_df ) summary( glm_result ) Replication Example This example matches statistics in The 2006 Earnings Public-Use Microdata File: An Introduction: Chart 5. Percentage distribution of individuals in EPUF, by capped Social Security taxable earnings status: chart_five_results <- prop.table( table( ssa_df[ , 'earnings_periods' ] ) ) chart_five_results <- round( 100 * chart_five_results ) stopifnot( chart_five_results[ 'Earnings in both periods' ] == 16 ) stopifnot( chart_five_results[ 'Earnings during 1951-2006 only' ] == 55 ) stopifnot( chart_five_results[ 'Earnings during 1937-1950 only' ] == 4 ) stopifnot( chart_five_results[ 'No earnings' ] == 25 ) Table 4. 
Average and median Social Security taxable earnings in EPUF, by sex, 1951–2006 (in dollars): nonzero_2006_earners <- ssa_df[ ssa_df[ , 'tot_cov_earn06' ] > 0 , 'tot_cov_earn06' ] stopifnot( round( mean( nonzero_2006_earners ) , 0 ) == 30953 ) stopifnot( round( quantile( nonzero_2006_earners )[ 3 ] , 0 ) == 24000 ) Table A1. Number and percentage distribution of individuals with Social Security taxable earnings records in EPUF, by sex, 1951–2006: nonzero_2006_earners <- ssa_df[ ssa_df[ , 'tot_cov_earn06' ] > 0 , ] stopifnot( round( mean( nonzero_2006_earners[ , 'tot_cov_earn06' ] ) , 0 ) == 30953 ) stopifnot( round( quantile( nonzero_2006_earners[ , 'tot_cov_earn06' ] )[ 3 ] , 0 ) == 24000 ) This example matches statistics in Comparing Earnings Estimates from the 2006 Earnings Public-Use File and the Annual Statistical Supplement: Table 4. Comparing Supplement and EPUF estimates: Number of all, male, and female workers with any earnings during the year, 1951–2006: stopifnot( round( nrow( nonzero_2006_earners ) * 100 , -3 ) == 156280000 ) earners_in_2006_by_sex <- table( nonzero_2006_earners[ , 'sex' ] ) * 100 stopifnot( round( earners_in_2006_by_sex[ 'male' ] , -3 ) == 81576000 ) stopifnot( round( earners_in_2006_by_sex[ 'female' ] , -3 ) == 74681000 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for SSA users, this code replicates previously-presented examples: library(dplyr) ssa_tbl <- as_tibble( ssa_df ) Calculate the mean (average) of a linear variable, overall and by groups: ssa_tbl %>% summarize( mean = mean( tot_cov_earn3706 ) ) ssa_tbl %>% group_by( sex ) %>% summarize( mean = mean( tot_cov_earn3706 ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for SSA users, this code replicates previously-presented examples: library(data.table) ssa_dt <- data.table( ssa_df ) Calculate the mean (average) of a linear variable, overall and by groups: ssa_dt[ , mean( tot_cov_earn3706 ) ] ssa_dt[ , mean( tot_cov_earn3706 ) , by = sex ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. 
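DuckDB also ships aggregate functions beyond the ANSI basics, such as median(). A minimal sketch, not part of the original walkthrough, assuming the con connection and the 'ssa' table registered in the next code block:
# grouped median lifetime earnings computed inside the database
# (a sketch: assumes con and the 'ssa' table exist, as created in the next block)
dbGetQuery( con , 'SELECT sex , MEDIAN( tot_cov_earn3706 ) AS median_earn FROM ssa GROUP BY sex' )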
As a starting point for SSA users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'ssa' , ssa_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( tot_cov_earn3706 ) FROM ssa' ) dbGetQuery( con , 'SELECT sex , AVG( tot_cov_earn3706 ) FROM ssa GROUP BY sex' ) "],["trends-in-international-mathematics-and-science-study-timss.html", "Trends in International Mathematics and Science Study (TIMSS) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Trends in International Mathematics and Science Study (TIMSS) A comparative study of student achievement in math and science across more than 50 nations. Grade-specific tables with one record per school, student, teacher, plus files containing student achievement, home background, student-teacher linkage, and within-country scoring reliability. A complex survey generalizing to fourth- and eighth-grade populations of participating countries. Released quadrennially since 1995. Funded by the International Association for the Evaluation of Educational Achievement, run at Boston College. Please skim before you begin: TIMSS 2019 User Guide for the International Database, 2nd Edition Methods and Procedures: TIMSS 2019 Technical Report A haiku regarding this microdata: # brando for stella, # gump's jenny, rock's adrian, # students toward math test Function Definitions This survey uses a multiply-imputed variance estimation technique described in Methods Chapter 14. Most users do not need to study this function carefully. Define a function specific to only this dataset: timss_MIcombine <- function (results, variances, call = sys.call(), df.complete = Inf, ...)
{ m <- length(results) oldcall <- attr(results, "call") if (missing(variances)) { variances <- suppressWarnings(lapply(results, vcov)) results <- lapply(results, coef) } vbar <- variances[[1]] cbar <- results[[1]] for (i in 2:m) { cbar <- cbar + results[[i]] vbar <- vbar + variances[[i]] } cbar <- cbar/m vbar <- vbar/m # MODIFICATION # evar <- var(do.call("rbind", results)) evar <- sum( ( unlist( results ) - cbar )^2 / 4 ) r <- (1 + 1/m) * evar/vbar df <- (m - 1) * (1 + 1/r)^2 if (is.matrix(df)) df <- diag(df) if (is.finite(df.complete)) { dfobs <- ((df.complete + 1)/(df.complete + 3)) * df.complete * vbar/(vbar + evar) if (is.matrix(dfobs)) dfobs <- diag(dfobs) df <- 1/(1/dfobs + 1/df) } if (is.matrix(r)) r <- diag(r) rval <- list(coefficients = cbar, variance = vbar + evar * (m + 1)/m, call = c(oldcall, call), nimp = m, df = df, missinfo = (r + 2/(df + 3))/(r + 1)) class(rval) <- "MIresult" rval } Download, Import, Preparation Download and unzip the 2019 fourth grade international database: library(httr) tf <- tempfile() this_url <- "https://timss2019.org/international-database/downloads/T19_G4_SPSS%20Data.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import and stack each of the student context data files for Albania through Canada: library(haven) # limit unzipped files to those starting with `asg` followed by three letters followed by `m7` asg_fns <- unzipped_files[ grepl( '^asg[a-z][a-z][a-z]m7' , basename( unzipped_files ) ) ] # further limit asg files to the first ten countries countries_thru_canada <- c("alb", "arm", "aus", "aut", "aze", "bhr", "bfl", "bih", "bgr", "can") fns_thru_canada <- paste0( paste0( '^asg' , countries_thru_canada , 'm7' ) , collapse = "|" ) asg_alb_can_fns <- asg_fns[ grepl( fns_thru_canada , basename( asg_fns ) ) ] timss_df <- NULL for( spss_fn in asg_alb_can_fns ){ this_tbl <- read_spss( spss_fn ) this_tbl <- zap_labels( this_tbl ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) timss_df <- rbind( timss_df , this_df ) } # order the data.frame by unique student id timss_df <- timss_df[ with( timss_df , order( idcntry , idstud ) ) , ] Save Locally   Save the object at any point: # timss_fn <- file.path( path.expand( "~" ) , "TIMSS" , "this_file.rds" ) # saveRDS( timss_df , file = timss_fn , compress = FALSE ) Load the same object: # timss_df <- readRDS( timss_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: From among possibly plausible values, determine all columns that are multiply-imputed plausible values: # identify all columns ending with `01` thru `05` ppv <- grep( "(.*)0[1-5]$" , names( timss_df ) , value = TRUE ) # remove those ending digits ppv_prefix <- gsub( "0[1-5]$" , "" , ppv ) # identify each of the possibilities with exactly five matches (five implicates) pv <- names( table( ppv_prefix )[ table( ppv_prefix ) == 5 ] ) # identify each of the `01` thru `05` plausible value columns pv_columns <- grep( paste0( "^" , pv , "0[1-5]$" , collapse = "|" ) , names( timss_df ) , value = TRUE ) Extract those multiply-imputed columns into a separate data.frame, then remove them from the source: pv_wide_df <- timss_df[ c( 'idcntry' , 'idstud' , pv_columns ) ] timss_df[ pv_columns ] <- NULL Reshape these columns from one record per student to one record per student per implicate: pv_long_df <- reshape( pv_wide_df , varying = lapply( paste0( pv , '0' ) , paste0 , 1:5 ) , direction = 'long' , timevar = 'implicate' , idvar = c( 
'idcntry' , 'idstud' ) ) names( pv_long_df ) <- gsub( "01$" , "" , names( pv_long_df ) ) Merge the columns from the source data.frame onto the one record per student per implicate data.frame: timss_long_df <- merge( timss_df , pv_long_df ) timss_long_df <- timss_long_df[ with( timss_long_df , order( idcntry , idstud ) ) , ] stopifnot( nrow( timss_long_df ) == nrow( pv_long_df ) ) stopifnot( nrow( timss_long_df ) / 5 == nrow( timss_df ) ) Divide the five plausible value implicates into a list with five data.frames based on the implicate number: timss_list <- split( timss_long_df , timss_long_df[ , 'implicate' ] ) Construct a replicate weights table following the estimation technique described in Methods Chapter 14: weights_df <- timss_df[ c( 'jkrep' , 'jkzone' ) ] for( j in 1:75 ){ for( i in 0:1 ){ weights_df[ weights_df[ , 'jkzone' ] != j , paste0( 'rw' , i , j ) ] <- 1 weights_df[ weights_df[ , 'jkzone' ] == j , paste0( 'rw' , i , j ) ] <- 2 * ( weights_df[ weights_df[ , 'jkzone' ] == j , 'jkrep' ] == i ) } } weights_df[ c( 'jkrep' , 'jkzone' ) ] <- NULL Define the design: library(survey) library(mitools) timss_design <- svrepdesign( weights = ~totwgt , repweights = weights_df , data = imputationList( timss_list ) , type = "other" , scale = 0.5 , rscales = rep( 1 , 150 ) , combined.weights = FALSE , mse = TRUE ) Variable Recoding Add new columns to the data set: timss_design <- update( timss_design , one = 1 , countries_thru_canada = factor( as.numeric( idcntry ) , levels = c(8L, 51L, 36L, 40L, 31L, 48L, 956L, 70L, 100L, 124L) , labels = c("Albania", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahrain", "Belgium (Flemish)", "Bosnia and Herzegovina", "Bulgaria", "Canada") ) , sex = factor( asbg01 , levels = 1:2 , labels = c( "female" , "male" ) ) , born_in_country = ifelse( asbg07 %in% 1:2 , as.numeric( asbg07 == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: timss_MIcombine( with( timss_design , svyby( ~ one , ~ one , unwtd.count ) ) ) timss_MIcombine( with( timss_design , svyby( ~ one , ~ sex , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: timss_MIcombine( with( timss_design , svytotal( ~ one ) ) ) timss_MIcombine( with( timss_design , svyby( ~ one , ~ sex , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) ) timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svymean , na.rm = TRUE ) ) ) Calculate the distribution of a categorical variable, overall and by groups: timss_MIcombine( with( timss_design , svymean( ~ countries_thru_canada ) ) ) timss_MIcombine( with( timss_design , svyby( ~ countries_thru_canada , ~ sex , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: timss_MIcombine( with( timss_design , svytotal( ~ asmmat , na.rm = TRUE ) ) ) timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svytotal , na.rm = TRUE ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: timss_MIcombine( with( timss_design , svytotal( ~ countries_thru_canada ) ) ) timss_MIcombine( with( timss_design , svyby( ~ countries_thru_canada , ~ sex , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: timss_MIcombine( with( timss_design , svyquantile( ~ asmmat , 0.5 , se = 
TRUE , na.rm = TRUE ) ) ) timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svyquantile , 0.5 , se = TRUE , ci = TRUE , na.rm = TRUE ) ) ) Estimate a ratio: timss_MIcombine( with( timss_design , svyratio( numerator = ~ asssci , denominator = ~ asmmat ) ) ) Subsetting Restrict the survey design to Australia, Austria, Azerbaijan, Belgium (Flemish): sub_timss_design <- subset( timss_design , idcntry %in% c( 36 , 40 , 31 , 956 ) ) Calculate the mean (average) of this subset: timss_MIcombine( with( sub_timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svymean , na.rm = TRUE ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( timss_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: timss_MIcombine( with( timss_design , svyvar( ~ asmmat , na.rm = TRUE ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE , deff = TRUE ) ) ) # SRS with replacement timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: # MIsvyciprop( ~ born_in_country , timss_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( asmmat ~ born_in_country , timss_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ born_in_country + countries_thru_canada , timss_design ) Perform a survey-weighted generalized linear model: glm_result <- timss_MIcombine( with( timss_design , svyglm( asmmat ~ born_in_country + countries_thru_canada ) ) ) summary( glm_result ) Replication Example This example matches the mean proficiency and standard error of the Australia row of the Summary Statistics and Standard Errors for Proficiency in Overall Mathematics-Grade 4 table from the Appendix 14A: Summary Statistics and Standard Errors for Proficiency in Grade 4 Mathematics: australia_design <- subset( timss_design , countries_thru_canada %in% "Australia" ) stopifnot( nrow( australia_design ) == 5890 ) result <- timss_MIcombine( with( australia_design , svymean( ~ asmmat ) ) ) stopifnot( round( coef( result ) , 3 ) == 515.880 ) stopifnot( round( SE( result ) , 3 ) == 2.776 ) This example matches the jackknife sampling, imputation, and total variances of the same row: australia_fn <- unzipped_files[ grepl( 'asgaus' , basename( unzipped_files ) ) ] australia_tbl <- read_spss( australia_fn ) australia_tbl <- zap_labels( australia_tbl ) australia_df <- data.frame( australia_tbl ) names( australia_df ) <- tolower( names( australia_df ) ) estimate <- mean( c( with( australia_df , weighted.mean( asmmat01 , totwgt ) ) , with( australia_df , weighted.mean( asmmat02 , totwgt ) ) , with( australia_df , weighted.mean( asmmat03 , totwgt ) ) , with( australia_df , weighted.mean( asmmat04 , totwgt ) ) , with( australia_df , weighted.mean( asmmat05 , totwgt ) ) ) ) stopifnot( round( estimate , 3 ) == 515.880 ) for( k in 1:5 ){ this_variance <- 0 for( j in 1:75 ){ for( i in 0:1 ){ this_variance <- this_variance + ( weighted.mean( australia_df[ , paste0( 'asmmat0' , k ) ] , ifelse( j == australia_df[ , 'jkzone' ] , australia_df[ , 'totwgt' ] * 2 * ( australia_df[ , 'jkrep' ] == i ) , australia_df[ , 'totwgt' ] ) ) - weighted.mean( australia_df[ , paste0( 'asmmat0' , k ) ] , australia_df[ , 'totwgt' ] ) )^2 } } assign( paste0( 'v' , k ) , this_variance * 0.5 ) } sampling_variance <- mean( c( v1 , v2 , v3 , v4 , v5 ) ) stopifnot( round( sampling_variance , 3 ) == 7.397 ) imputation_variance <- ( 6 / 5 ) * ( ( ( with( australia_df , weighted.mean( asmmat01 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat02 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat03 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat04 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat05 , totwgt ) ) - estimate )^2 / 4 ) ) stopifnot( round( imputation_variance , 3 ) == 0.309 ) stopifnot( round( sampling_variance + imputation_variance , 3 ) == 7.706 ) "],["violence-against-children-and-youth-surveys-vacs.html", "Violence Against Children And Youth Surveys (VACS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Violence Against Children And Youth Surveys (VACS) The global surveillance system to track and monitor the burden of violence against children. One table per country with one row per sampled respondent. 
Nationally representative cross-sectional household surveys of children and youth ages 13–24. No listed update frequency across the participating nations. Led by the CDC through funding from PEPFAR, in partnership with Together for Girls. Please skim before you begin: Sampling design and methodology of the Violence Against Children and Youth Surveys Violence Against Children Surveys (VACS): Towards a global surveillance system A haiku regarding this microdata: # enable us to # lodge cane between each spoke of # cycles of abuse Download, Import, Preparation Request public VACS data at https://www.togetherforgirls.org/en/analyzing-public-vacs-data. Select the Mozambique 2019 dataset and Stata option. Download and unzip the Mozambique VACS Public Use Dataset files: library(haven) vacs_tbl <- read_stata( file.path( path.expand( "~" ) , "mozambique_public use data.dta" ) ) vacs_df <- data.frame( vacs_tbl ) names( vacs_df ) <- tolower( names( vacs_df ) ) Save Locally   Save the object at any point: # vacs_fn <- file.path( path.expand( "~" ) , "VACS" , "this_file.rds" ) # saveRDS( vacs_df , file = vacs_fn , compress = FALSE ) Load the same object: # vacs_df <- readRDS( vacs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) vacs_design <- svydesign( ids = ~cluster , strata = ~strata , weights = ~sampleweight , data = subset( vacs_df , sampleweight > 0 ) , nest = TRUE ) Variable Recoding Add new columns to the data set: vacs_design <- update( vacs_design , one = 1 , age_sex_group = factor( ifelse( agegrp == 1 , sex , sex + 2 ) , levels = 1:4 , labels = c( 'male 13-17' , 'female 13-17' , 'male 18-24' , 'female 18-24' ) ) , sex = factor( sex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , agegrp = factor( agegrp , levels = 1:2 , labels = c( '13-17' , '18-24' ) ) , ever_attended_school = ifelse( eversch %in% 1:2 , as.numeric( eversch == 1 ) , NA ) , childhood_physical_violence = as.numeric( pv18 == 1 ) , marry = factor( marry , levels = 1:3 , labels = c( 'Yes, ever married' , 'Yes, ever lived with a partner' , 'No, never married or lived with a partner' ) ) , age_at_first_pregnancy = ifelse( prage < 98 , prage , NA ) , age_at_first_cohabitation = ifelse( marage < 98 , marage , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( vacs_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_sex_group , vacs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , vacs_design ) svyby( ~ one , ~ age_sex_group , vacs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ marry , vacs_design ) svyby( ~ marry , ~ age_sex_group , vacs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ marry , vacs_design ) svyby( ~ marry , ~ age_sex_group , vacs_design , 
svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ age_at_first_cohabitation , vacs_design , 0.5 , na.rm = TRUE ) svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ age_at_first_pregnancy , denominator = ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) Subsetting Restrict the survey design to respondents who experienced physical violence during childhood: sub_vacs_design <- subset( vacs_design , childhood_physical_violence == 1 ) Calculate the mean (average) of this subset: svymean( ~ age_at_first_cohabitation , sub_vacs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( vacs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ ever_attended_school , vacs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( age_at_first_cohabitation ~ ever_attended_school , vacs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ ever_attended_school + marry , vacs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( age_at_first_cohabitation ~ ever_attended_school + marry , vacs_design ) summary( glm_result ) Replication Example This example matches statistics and confidence intervals within 0.1% from the Final Report of the Mozambique Violence Against Children and Youth Survey (VACS), 2019, Table 4.1.1. 
Prevalence of different types of sexual violence[1] before age 18, among 18-24-year-olds: females_18_to_24_design <- subset( vacs_design , sex == 'female' & agegrp == '18-24' ) # define a function to check unweighted N, prevalence, confidence interval for each estimate check_sv <- function( this_variable , this_design = females_18_to_24_design , N , prevalence , lb , ub ){ this_formula <- as.formula( paste( "~ as.numeric(" , this_variable , "== 1 )" ) ) stopifnot( coef( unwtd.count( this_formula , this_design ) ) == N ) this_result <- svymean( this_formula , this_design , na.rm = TRUE ) stopifnot( round( coef( this_result ) , 3 ) == prevalence ) stopifnot( abs( confint( this_result )[1] - lb ) < 0.0015 ) stopifnot( abs( confint( this_result )[2] - ub ) < 0.0015 ) invisible( TRUE ) } # sexual touching in childhood check_sv( "sv1_only18" , N = 1232 , prevalence = 0.066 , lb = 0.039 , ub = 0.093 ) # unwanted attempted sex in childhood check_sv( "sv2_only18" , N = 1232 , prevalence = 0.061 , lb = 0.035 , ub = 0.087 ) # pressured or coerced sex in childhood check_sv( "sv4_only18" , N = 1221 , prevalence = 0.056 , lb = 0.035 , ub = 0.077 ) # physically forced sex in childhood check_sv( "sv3_only18" , N = 1231 , prevalence = 0.035 , lb = 0.020 , ub = 0.051 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for VACS users, this code replicates previously-presented examples: library(srvyr) vacs_srvyr_design <- as_survey( vacs_design ) Calculate the mean (average) of a linear variable, overall and by groups: vacs_srvyr_design %>% summarize( mean = survey_mean( age_at_first_cohabitation , na.rm = TRUE ) ) vacs_srvyr_design %>% group_by( age_sex_group ) %>% summarize( mean = survey_mean( age_at_first_cohabitation , na.rm = TRUE ) ) "],["youth-risk-behavior-surveillance-system-yrbss.html", "Youth Risk Behavior Surveillance System (YRBSS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Youth Risk Behavior Surveillance System (YRBSS) The high school edition of the Behavioral Risk Factor Surveillance System (BRFSS). One table with one row per sampled youth respondent. A complex sample survey designed to generalize to all public and private school students in grades 9-12 in the United States. Released biennially since 1993. Administered by the Centers for Disease Control and Prevention. Please skim before you begin: Methodology of the Youth Risk Behavior Surveillance System Wikipedia Entry A haiku regarding this microdata: # maladolescence # epidemiology # sex, drugs, rock and roll Download, Import, Preparation Load the SAScii library to interpret a SAS input program, and also re-arrange the SAS input program: library(SAScii) sas_url <- "https://www.cdc.gov/yrbs/files/2023/2023XXH_SAS_Input_Program.sas" sas_text <- tolower( readLines( sas_url ) ) # find the (out of numerical order) # `site` location variable's position # within the SAS input program site_location <- which( sas_text == '@1 site $3.' 
) # find the start field's position # within the SAS input program input_location <- which( sas_text == "input" ) # create a vector from 1 to the length of the text file sas_length <- seq( length( sas_text ) ) # remove the site_location sas_length <- sas_length[ -site_location ] # re-insert the site variable's location # immediately after the starting position sas_reorder <- c( sas_length[ seq( input_location ) ] , site_location , sas_length[ seq( input_location + 1 , length( sas_length ) ) ] ) # re-order the sas text file sas_text <- sas_text[ sas_reorder ] sas_tf <- tempfile() writeLines( sas_text , sas_tf ) Download and import the national file: dat_tf <- tempfile() dat_url <- "https://www.cdc.gov/yrbs/files/2023/XXH2023_YRBS_Data.dat" download.file( dat_url , dat_tf , mode = 'wb' ) yrbss_df <- read.SAScii( dat_tf , sas_tf ) names( yrbss_df ) <- tolower( names( yrbss_df ) ) yrbss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # yrbss_fn <- file.path( path.expand( "~" ) , "YRBSS" , "this_file.rds" ) # saveRDS( yrbss_df , file = yrbss_fn , compress = FALSE ) Load the same object: # yrbss_df <- readRDS( yrbss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) yrbss_design <- svydesign( ~ psu , strata = ~ stratum , data = yrbss_df , weights = ~ weight , nest = TRUE ) Variable Recoding Add new columns to the data set: yrbss_design <- update( yrbss_design , did_not_always_wear_seat_belt = as.numeric( qn8 == 1 ) , ever_used_marijuana = as.numeric( qn46 == 1 ) , tried_to_quit_tobacco_past_year = as.numeric( qn40 == 1 ) , used_tobacco_past_year = as.numeric( q40 > 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( yrbss_design , "sampling" ) != 0 ) svyby( ~ one , ~ ever_used_marijuana , yrbss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , yrbss_design ) svyby( ~ one , ~ ever_used_marijuana , yrbss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ bmipct , yrbss_design , na.rm = TRUE ) svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ q2 , yrbss_design , na.rm = TRUE ) svyby( ~ q2 , ~ ever_used_marijuana , yrbss_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ bmipct , yrbss_design , na.rm = TRUE ) svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ q2 , yrbss_design , na.rm = TRUE ) svyby( ~ q2 , ~ ever_used_marijuana , yrbss_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ bmipct , yrbss_design , 0.5 , na.rm = TRUE ) svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ tried_to_quit_tobacco_past_year , denominator = ~ used_tobacco_past_year , yrbss_design , na.rm = TRUE ) Subsetting Restrict the survey design to youths who ever drank alcohol: sub_yrbss_design <- subset( yrbss_design , qn40 > 1 ) Calculate the mean (average) of this subset: svymean( ~ bmipct , sub_yrbss_design , na.rm = TRUE ) 
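One way to sanity-check a subset before analyzing it further (a minimal sketch, not part of the original walkthrough): the survey package’s subset method treats records with a missing value in the subsetting expression as excluded, so the subset’s record count should match a direct tally of the underlying data.frame: stopifnot( nrow( sub_yrbss_design ) == sum( yrbss_df[ , 'qn40' ] > 1 , na.rm = TRUE ) ) 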
Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ bmipct , yrbss_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( yrbss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ bmipct , yrbss_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ bmipct , yrbss_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ bmipct , yrbss_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ did_not_always_wear_seat_belt , yrbss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( bmipct ~ did_not_always_wear_seat_belt , yrbss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ did_not_always_wear_seat_belt + q2 , yrbss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( bmipct ~ did_not_always_wear_seat_belt + q2 , yrbss_design ) summary( glm_result ) Replication Example This example matches statistics, standard errors, and confidence intervals from the “did not always wear a seat belt” row of PDF page 29 of this CDC analysis software document: unwtd_count_result <- unwtd.count( ~ did_not_always_wear_seat_belt , yrbss_design ) stopifnot( coef( unwtd_count_result ) == 15071 ) wtd_n_result <- svytotal( ~ one , subset( yrbss_design , !is.na( did_not_always_wear_seat_belt ) ) ) stopifnot( round( coef( wtd_n_result ) , 0 ) == 16917 ) share_result <- svymean( ~ did_not_always_wear_seat_belt , yrbss_design , na.rm = TRUE ) stopifnot( round( coef( share_result ) , 4 ) == .3958 ) stopifnot( round( SE( share_result ) , 4 ) == .0172 ) ci_result <- svyciprop( ~ did_not_always_wear_seat_belt , yrbss_design , na.rm = TRUE ) stopifnot( round( confint( ci_result )[1] , 4 ) == 0.3621 ) stopifnot( round( confint( ci_result )[2] , 4 ) == 0.4304 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for YRBSS users, this code replicates previously-presented examples: library(srvyr) yrbss_srvyr_design <- as_survey( yrbss_design ) Calculate the mean (average) of a linear variable, overall and by groups: yrbss_srvyr_design %>% summarize( mean = survey_mean( bmipct , na.rm = TRUE ) ) yrbss_srvyr_design %>% group_by( ever_used_marijuana ) %>% summarize( mean = survey_mean( bmipct , na.rm = TRUE ) ) "]] +[["index.html", "Analyze Survey Data for Free Forty-Nine Public Microdatasets, One Easy To Type Website", " Analyze Survey Data for Free Forty-Nine Public Microdatasets, One Easy To Type Website Please ask questions about this book on cross validated for survey statistics or stackoverflow for R. This textbook replaces my archived blog, prior code, and the no longer maintained lodown package. A work of R is never finished, merely abandoned. - Anthony Damico "],["trend-analysis-of-complex-survey-data.html", "Trend Analysis of Complex Survey Data Download, Import, Preparation Append Polynomials to Each Year Unadjusted Analysis Examples Calculate Joinpoints Needed Calculate Predicted Marginals Identify Joinpoint(s) or Breakpoint(s) Interpret and Conclude", " Trend Analysis of Complex Survey Data The purpose of this analysis is to make statistically valid statements such as, “there was a significant linear decrease in the prevalence of high school aged americans who have ever smoked a cigarette across the period 1999-2011” with complex sample survey data. This step-by-step walkthrough exactly reproduces the statistics presented in the Center for Disease Control & Prevention’s (CDC) linear trend analysis. This analysis may complement qualitative evaluation on prevalence changes observed from surveillance data by providing quantitative evidence, such as when a joinpoint (also called breakpoint or changepoint) occurred; however, this analysis does not explain why or how changes in trends occur. Download, Import, Preparation Download and import the multi-year stacked file: library(SAScii) library(readr) sas_url <- "https://www.cdc.gov/healthyyouth/data/yrbs/sadc_2019/2019-SADC-SAS-Input-Program.sas" dat_url <- "https://www.cdc.gov/healthyyouth/data/yrbs/sadc_2019/sadc_2019_national.dat" sas_positions <- parse.SAScii( sas_url ) sas_positions[ , 'varname' ] <- tolower( sas_positions[ , 'varname' ] ) variables_to_keep <- c( "sex" , "grade" , "race4" , "q30" , "year" , "psu" , "stratum" , "weight" ) sas_positions[ , 'column_types' ] <- ifelse( !( sas_positions[ , 'varname' ] %in% variables_to_keep ) , "_" , ifelse( sas_positions[ , 'char' ] , "c" , "d" ) ) yrbss_tbl <- read_fwf( dat_url , fwf_widths( abs( sas_positions[ , 'width' ] ) , col_names = sas_positions[ , 'varname' ] ) , col_types = paste0( sas_positions[ , 'column_types' ] , collapse = "" ) , na = c( "" , "." 
) ) yrbss_df <- data.frame( yrbss_tbl ) Restrict the dataset to only years shown in the original analysis and re-name the main variable: yrbss_df <- subset( yrbss_df , year %in% seq( 1991 , 2011 , 2 ) ) yrbss_df[ , 'ever_smoked' ] <- as.numeric( yrbss_df[ , 'q30' ] == 1 ) yrbss_df[ , 'q30' ] <- NULL Recode each categorical variable to factor class: yrbss_df[ , 'sex' ] <- relevel( factor( yrbss_df[ , 'sex' ] ) , ref = "2" ) for ( i in c( 'race4' , 'grade' ) ){ yrbss_df[ , i ] <- relevel( factor( yrbss_df[ , i ] ) , ref = "1" ) } Append Polynomials to Each Year “The polynomials we have used as predictors to this point are natural polynomials, generated from the linear predictor by centering and then powering the linear predictor.” For more detail on this subject, see page 216 of Applied Multiple Regression/Correlation Analysis for the Behavioral Sciences By Jacob Cohen, Patricia Cohen, Stephen G. West, Leona S. Aiken distinct_years_available <- length( seq( 1991 , 2011 , 2 ) ) # store the linear polynomials c11l <- contr.poly( distinct_years_available )[ , ".L" ] # store the quadratic polynomials c11q <- contr.poly( distinct_years_available )[ , ".Q" ] # store the cubic polynomials c11c <- contr.poly( distinct_years_available )[ , ".C" ] For each record in the data set, tack on the linear, quadratic, and cubic contrast value, these contrast values will serve as replacement for the linear year variable in any regression: # year^1 term (linear) yrbss_df[ , "t11l" ] <- c11l[ match( yrbss_df[ , "year" ] , seq( 1991 , 2011 , 2 ) ) ] # year^2 term (quadratic) yrbss_df[ , "t11q" ] <- c11q[ match( yrbss_df[ , "year" ] , seq( 1991 , 2011 , 2 ) ) ] # year^3 term (cubic) yrbss_df[ , "t11c" ] <- c11c[ match( yrbss_df[ , "year" ] , seq( 1991 , 2011 , 2 ) ) ] Unadjusted Analysis Examples Construct a complex sample survey design and match the published unadjusted prevalence rates: options( survey.lonely.psu = "adjust" ) library(survey) des <- svydesign( id = ~psu , strata = ~interaction( stratum , year ) , data = yrbss_df , weights = ~weight , nest = TRUE ) prevalence_over_time <- svyby( ~ ever_smoked , ~ year , des , svymean , na.rm = TRUE ) # confirm prevalence rates match published estimates # of high school students that ever smoked stopifnot( all.equal( round( coef( prevalence_over_time ) , 3 ) , c( .701 , .695 , .713 , .702 , .704 , .639 , .584 , .543 , .503 , .463 , .447 ) , check.attributes = FALSE ) ) Calculate Joinpoints Needed Using the orthogonal coefficients (linear, quadratic, cubic terms) that we previously added to our yrbss_df object before constructing the multi-year stacked survey design, determine how many joinpoints will be needed for a trend analysis. Epidemiological models typically control for possible confounding variables such as age, sex, and race/ethnicity, so those have been included alongside the linear, cubic, and quadratic year terms. Calculate the “ever smoked” regression, adjusted by sex, grade, race/ethnicity, and linear year contrast: linyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l , design = des , family = quasibinomial ) summary( linyear ) The linear year-contrast variable t11l is significant. Therefore, there is probably going to be some sort of trend. A linear trend by itself does not need joinpoints. Not one, just zero joinpoints. If the linear term were the only significant term (out of linear, quadratic, cubic), then we would not need to calculate a joinpoint. 
In other words, we would not need to figure out where to best break our time trend into two, three, or even four segments. Since the linear trend is significant, we know there is at least one change across the entire 1991 to 2011 period. Interpretation note about segments of time: The linear term t11l was significant, so we probably have a significant linear trend somewhere to report. Now we need to figure out when that significant linear trend started and when it ended. It might be semantically true that there was a significant linear decrease in high school aged smoking over the entire period of our data 1991-2011; however, it’s inexact to end this analysis after only detecting a linear trend. The purpose of the following few steps is to cordon off different time points from one another. As you’ll see later, there actually was not any detectable decrease from 1991-1999. The entirety of the decline in smoking occurred over the period from 1999-2011. So these next (methodologically tricky) steps serve to provide you and your audience with a more careful statement of statistical significance. It’s not technically wrong to conclude that smoking declined over the period of 1991-2011, it’s just verbose. Think of it as the difference between “humans first walked on the moon in the sixties” and “humans first walked on the moon in 1969” - both statements are correct, but the latter exhibits greater scientific precision. Calculate the “ever smoked” binomial regression, adjusted by sex, grade, race/ethnicity, and both linear and quadratic year contrasts. Notice the addition of t11q: quadyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l + t11q , design = des , family = quasibinomial ) summary( quadyear ) The linear year-contrast variable is significant but the quadratic year-contrast variable is also significant. Therefore, we should use joinpoint software (the segmented package) for this analysis. A significant quadratic trend needs one joinpoint. Since both linear and quadratic terms are significant, we can also move ahead and test whether the cubic term is also significant. Calculate the “ever smoked” binomial regression, adjusted by sex, grade, race/ethnicity, and linear, quadratic, and cubic year contrasts. Notice the addition of t11c: cubyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l + t11q + t11c , design = des , family = quasibinomial ) summary( cubyear ) The cubic year-contrast term is also significant in this model. Therefore, we might potentially evaluate this trend using two joinpoints. In other words, a significant result for all linear, quadratic, and cubic year contrasts at this point means we might be able to evaluate three distinct trends (separated by our two joinpoints) across the broader 1991 - 2011 time period of analysis. Although we might now have the statistical ability to analyze three distinct time periods (separated by two joinpoints) across our data, the utility of this depends on the circumstances. Cubic and higher polynomials account for not only the direction of change but also the pace of that change, allowing statistical statements that might not be of interest to an audience: While it might be an exercise in precision to conclude that smoking rates dropped quickest across 1999-2003 and less quickly across 2003-2011, that scientific pair of findings may not be as compelling as the simpler (quadratic but not cubic) statement that smoking rates have dropped across the period of 1999-2011. 
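For audiences that do want the pace-of-change detail, the same contrast pattern extends one degree further. A minimal sketch, not part of the CDC walkthrough (t11f and quartyear are hypothetical names; contr.poly labels the fourth-degree column "^4"): # store the quartic polynomials c11f <- contr.poly( distinct_years_available )[ , "^4" ] # tack the quartic contrast onto the survey design object des <- update( des , t11f = c11f[ match( year , seq( 1991 , 2011 , 2 ) ) ] ) quartyear <- svyglm( ever_smoked ~ sex + race4 + grade + t11l + t11q + t11c + t11f , design = des , family = quasibinomial ) summary( quartyear ) A non-significant quartic term would justify stopping at the cubic model above. 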
Calculate Predicted Marginals Calculate the survey-year-independent predictor effects and store these results: marginals <- svyglm( formula = ever_smoked ~ sex + race4 + grade , design = des , family = quasibinomial ) Run these marginals through the svypredmeans function. For archaeology fans out there, this function emulates the PREDMARG statement in the ancient language of SUDAAN: ( means_for_joinpoint <- svypredmeans( marginals , ~factor( year ) ) ) Clean up these results a bit in preparation for a joinpoint analysis: # coerce the results to a data.frame object means_for_joinpoint <- as.data.frame( means_for_joinpoint ) # extract the row names as the survey year means_for_joinpoint[ , "year" ] <- as.numeric( rownames( means_for_joinpoint ) ) # must be sorted, just in case it's not already means_for_joinpoint <- means_for_joinpoint[ order( means_for_joinpoint[ , "year" ] ) , ] Identify Joinpoint(s) or Breakpoint(s) Let’s take a look at how confident we are in the value at each adjusted timepoint. Carrying out a trend analysis requires creating new weights to fit a piecewise linear regression. First, create that weight variable: means_for_joinpoint[ , "wgt" ] <- with( means_for_joinpoint, ( mean / SE ) ^ 2 ) Second, fit a piecewise linear regression, estimating the ‘starting’ linear model with the usual lm function using the log values and the weights: o <- lm( log( mean ) ~ year , weights = wgt , data = means_for_joinpoint ) Now that the regression has been structured correctly, estimate the year that our complex survey trend should be broken into two or more segments: library(segmented) # find only one joinpoint os <- segmented( o , ~year ) summary( os ) Look for the Estimated Break-Point(s) in that result - that’s the critical number from this joinpoint analysis. The segmented package uses an iterative procedure (described in the article below); between-year solutions are returned and should be rounded to the nearest time point in the analysis. The joinpoint software implements two estimating algorithms: the grid-search and the Hudson algorithm. For more detail about these methods, see Muggeo V. (2003) Estimating regression models with unknown break-points. Statistics in Medicine, 22: 3055-3071. Obtain the annual percent change estimates for each time point: slope( os , APC = TRUE ) The confidence intervals for the annual percent change (APC) may be different from the ones returned by NCI’s Joinpoint Software; for further details, check out Muggeo V. (2010) A Comment on ‘Estimating average annual per cent change in trend analysis’ by Clegg et al., Statistics in Medicine, 28, 3670-3682. Statistics in Medicine, 29, 1958-1960. This analysis returned similar results to the NCI’s Joinpoint Regression Program by estimating a joinpoint at year=1999 - and, more precisely, that the start of that decreasing trend in smoking prevalence happened at an APC of -3.92 percent. That is, slope2 from the output above. Remember that the cubic-year model above had significant terms as well. Therefore, it would be statistically defensible to calculate two joinpoints rather than only one. However, for this analysis, breaking the 1999-2011 trend into two separate downward trends might not be of interest to the audience. Looking at the slope2 and slope3 estimates and confidence intervals, we might be able to conclude that “ever smoking” decreased across 1999-2003 and also decreased (albeit less rapidly) across 2003-2011. 
However, communicating two consecutive downward trends might not be of much interest to a lay audience. Forgoing a second possible joinpoint makes sense when the direction of change is more compelling than the pace of change: # find two joinpoints rather than only one os2 <- segmented( o , ~year , npsi = 2 ) summary( os2 ) slope( os2 , APC = TRUE ) Interpret and Conclude After identifying the joinpoint for smoking prevalence, we can create two regression models (one for each time segment - if we had two joinpoints, we would need three regression models). The first model covers the years leading up to (and including) the joinpoint (i.e., 1991 to 1999). The second model includes the years from the joinpoint forward (i.e., 1999 to 2011). So start with 1991, 1993, 1995, 1997, 1999, the five year-points before (and including) 1999: # calculate a five-timepoint linear contrast vector c5l <- contr.poly( 5 )[ , 1 ] # tack the five-timepoint linear contrast vectors onto the current survey design object des <- update( des , t5l = c5l[ match( year , seq( 1991 , 1999 , 2 ) ) ] ) pre_91_99 <- svyglm( ever_smoked ~ sex + race4 + grade + t5l , design = subset( des , year <= 1999 ) , family = quasibinomial ) summary( pre_91_99 ) # confirm 1991-1999 trend coefficient matches published estimates stopifnot( round( pre_91_99$coefficients['t5l'] , 5 ) == .03704 ) This reproduces the calculations behind the sentence on pdf page 6 of the original document: In this example, T5L_L had a p-value=0.52261 and beta=0.03704. Therefore, there was “no significant change in the prevalence of ever smoking a cigarette during 1991-1999.” Then move on to 1999, 2001, 2003, 2005, 2007, 2009, and 2011, the seven year-points after (and including) 1999: # calculate a seven-timepoint linear contrast vector c7l <- contr.poly( 7 )[ , 1 ] # tack the seven-timepoint linear contrast vectors onto the current survey design object des <- update( des , t7l = c7l[ match( year , seq( 1999 , 2011 , 2 ) ) ] ) post_99_11 <- svyglm( ever_smoked ~ sex + race4 + grade + t7l , design = subset( des , year >= 1999 ) , family = quasibinomial ) summary( post_99_11 ) # confirm 1999-2011 trend coefficient matches published estimates stopifnot( round( post_99_11$coefficients['t7l'] , 5 ) == -0.99165 ) This reproduces the calculations behind the sentence on pdf page 6 of the original document: In this example, T7L_R had a p-value<0.0001 and beta=-0.99165. Therefore, there was a “significant linear decrease in the prevalence of ever smoking a cigarette during 1999-2011.” "],["american-community-survey-acs.html", "American Community Survey (ACS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " American Community Survey (ACS) The US Census Bureau’s annual replacement for the long-form decennial census. Two tables per state, the first with one row per household and the second with one row per individual. The civilian population of the United States. Released annually since 2005. Administered and financed by the US Census Bureau. Recommended Reading Four Example Strengths & Limitations: ✔️ Large sample size and sub-national geographies ✔️ Large userbase and supporting projects ❌ Short questionnaire ❌ Program participation undercount Three Example Findings: Life expectancy in adulthood fell between 1990 and 2018 for those without a university degree. 
In 2021, 8 million families lived with non-relatives and were not renters nor homeowners themselves. More than half of rural physicians were at least 50 years old in 2017, more than a quarter at least 60. Two Methodology Documents: Guidance for Data Users Wikipedia Entry One Haiku: # one percent sample # the decennial census # in miniature Download, Import, Preparation Choose either the entire United States with sas_hus.zip, or use a state’s abbreviation like sas_hal.zip for Alabama or sas_hak.zip for Alaska. This imports the Alabama household file: library(haven) tf_household <- tempfile() this_url_household <- "https://www2.census.gov/programs-surveys/acs/data/pums/2023/1-Year/sas_hal.zip" download.file( this_url_household , tf_household , mode = 'wb' ) unzipped_files_household <- unzip( tf_household , exdir = tempdir() ) acs_sas_household <- grep( '\\\\.sas7bdat$' , unzipped_files_household , value = TRUE ) acs_df_household <- read_sas( acs_sas_household ) names( acs_df_household ) <- tolower( names( acs_df_household ) ) Choose either the entire United States with sas_pus.zip, or use a state’s abbreviation like sas_pal.zip for Alabama or sas_pak.zip for Alaska. This imports the Alabama person file: tf_person <- tempfile() this_url_person <- "https://www2.census.gov/programs-surveys/acs/data/pums/2023/1-Year/sas_pal.zip" download.file( this_url_person , tf_person , mode = 'wb' ) unzipped_files_person <- unzip( tf_person , exdir = tempdir() ) acs_sas_person <- grep( '\\\\.sas7bdat$' , unzipped_files_person , value = TRUE ) acs_df_person <- read_sas( acs_sas_person ) names( acs_df_person ) <- tolower( names( acs_df_person ) ) Remove overlapping column and merge household + person files: acs_df_household[ , 'rt' ] <- NULL acs_df_person[ , 'rt' ] <- NULL acs_df <- merge( acs_df_household , acs_df_person ) stopifnot( nrow( acs_df ) == nrow( acs_df_person ) ) acs_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # acs_fn <- file.path( path.expand( "~" ) , "ACS" , "this_file.rds" ) # saveRDS( acs_df , file = acs_fn , compress = FALSE ) Load the same object: # acs_df <- readRDS( acs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) acs_design <- svrepdesign( weight = ~pwgtp , repweights = 'pwgtp[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = acs_df ) Variable Recoding Add new columns to the data set: acs_design <- update( acs_design , state_name = factor( as.numeric( state ) , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L, 72L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico") ) , cit = factor( cit , levels = 1:5 , labels = c( 'born in the u.s.' 
, 'born in the territories' , 'born abroad to american parents' , 'naturalized citizen' , 'non-citizen' ) ) , poverty_level = as.numeric( povpip ) , married = as.numeric( mar %in% 1 ) , sex = factor( sex , labels = c( 'male' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( acs_design , "sampling" ) != 0 ) svyby( ~ one , ~ cit , acs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , acs_design ) svyby( ~ one , ~ cit , acs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ poverty_level , acs_design , na.rm = TRUE ) svyby( ~ poverty_level , ~ cit , acs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ sex , acs_design ) svyby( ~ sex , ~ cit , acs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ poverty_level , acs_design , na.rm = TRUE ) svyby( ~ poverty_level , ~ cit , acs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ sex , acs_design ) svyby( ~ sex , ~ cit , acs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ poverty_level , acs_design , 0.5 , na.rm = TRUE ) svyby( ~ poverty_level , ~ cit , acs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ ssip , denominator = ~ pincp , acs_design , na.rm = TRUE ) Subsetting Restrict the survey design to senior citizens: sub_acs_design <- subset( acs_design , agep >= 65 ) Calculate the mean (average) of this subset: svymean( ~ poverty_level , sub_acs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ poverty_level , acs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ poverty_level , ~ cit , acs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( acs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ poverty_level , acs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ poverty_level , acs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ poverty_level , acs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ married , acs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( poverty_level ~ married , acs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ married + sex , acs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( poverty_level ~ married + sex , acs_design ) summary( glm_result ) Replication Example This matches statistics, standard errors, and margins of error from Alabama’s 2023 PUMS tallies: Match the sum of the weights: stopifnot( round( coef( svytotal( ~ one , acs_design ) ) , 0 ) == 5108468 ) Compute the population by age: pums_estimate <- c(287689L, 306458L, 325713L, 355557L, 334520L, 640995L, 649985L, 621783L, 307747L, 344812L, 553817L, 289119L, 90273L) pums_standard_error <- c(2698L, 5964L, 5865L, 5081L, 4427L, 5202L, 4615L, 4804L, 4947L, 4804L, 2166L, 3600L, 3080L) pums_margin_of_error <- c(4439L, 9811L, 9647L, 8358L, 7282L, 8557L, 7592L, 7903L, 8137L, 7902L, 3563L, 5922L, 5067L) results <- svytotal( ~ as.numeric( agep %in% 0:4 ) + as.numeric( agep %in% 5:9 ) + as.numeric( agep %in% 10:14 ) + as.numeric( agep %in% 15:19 ) + as.numeric( agep %in% 20:24 ) + as.numeric( agep %in% 25:34 ) + as.numeric( agep %in% 35:44 ) + as.numeric( agep %in% 45:54 ) + as.numeric( agep %in% 55:59 ) + as.numeric( agep %in% 60:64 ) + as.numeric( agep %in% 65:74 ) + as.numeric( agep %in% 75:84 ) + as.numeric( agep %in% 85:100 ) , acs_design ) stopifnot( all( round( coef( results ) , 0 ) == pums_estimate ) ) stopifnot( all( round( SE( results ) , 0 ) == pums_standard_error ) ) stopifnot( all( round( SE( results ) * 1.645 , 0 ) == pums_margin_of_error ) ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for ACS users, this code calculates the gini coefficient on complex sample survey data: library(convey) acs_design <- convey_prep( acs_design ) svygini( ~ hincp , acs_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for ACS users, this code replicates previously-presented examples: library(srvyr) acs_srvyr_design <- as_survey( acs_design ) Calculate the mean (average) of a linear variable, overall and by groups: acs_srvyr_design %>% summarize( mean = survey_mean( poverty_level , na.rm = TRUE ) ) acs_srvyr_design %>% group_by( cit ) %>% summarize( mean = survey_mean( poverty_level , na.rm = TRUE ) ) "],["area-health-resources-files-ahrf.html", "Area Health Resources Files (AHRF) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Area Health Resources Files (AHRF) National, state, and county-level data on health care professions, health facilities, population characteristics, health workforce training, hospital utilization and expenditure, and the environment. 
One table with one row per county and a second table with one row per state. Replaced annually with the latest available county- and state-level statistics. Compiled by the Bureau of Health Workforce at the Health Resources and Services Administration. Recommended Reading Two Methodology Documents: User Documentation for the County Area Health Resources File (AHRF) 2021-2022 Release Frequently Asked Questions One Haiku: # local aggregates # to spread merge join spline regress # like fresh buttered bread Download, Import, Preparation Download and import the most current county-level file: library(haven) tf <- tempfile() ahrf_url <- "https://data.hrsa.gov//DataDownload/AHRF/AHRF_2021-2022_SAS.zip" download.file( ahrf_url , tf , mode = 'wb' ) unzipped_files <- unzip( tf , exdir = tempdir() ) sas_fn <- grep( "\\\\.sas7bdat$" , unzipped_files , value = TRUE ) ahrf_tbl <- read_sas( sas_fn ) ahrf_df <- data.frame( ahrf_tbl ) names( ahrf_df ) <- tolower( names( ahrf_df ) ) Save Locally   Save the object at any point: # ahrf_fn <- file.path( path.expand( "~" ) , "AHRF" , "this_file.rds" ) # saveRDS( ahrf_df , file = ahrf_fn , compress = FALSE ) Load the same object: # ahrf_df <- readRDS( ahrf_fn ) Variable Recoding Add new columns to the data set: ahrf_df <- transform( ahrf_df , cbsa_indicator_code = factor( as.numeric( f1406720 ) , levels = 0:2 , labels = c( "not metro" , "metro" , "micro" ) ) , mhi_2020 = f1322620 , whole_county_hpsa_2022 = as.numeric( f0978722 ) == 1 , census_region = factor( as.numeric( f04439 ) , levels = 1:4 , labels = c( "northeast" , "midwest" , "south" , "west" ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( ahrf_df ) table( ahrf_df[ , "cbsa_indicator_code" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( ahrf_df[ , "census_region" ] ) ) prop.table( table( ahrf_df[ , c( "census_region" , "cbsa_indicator_code" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( ahrf_df[ , "mhi_2020" ] , 0.5 , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to California: sub_ahrf_df <- subset( ahrf_df , f12424 == "CA" ) Calculate the mean (average) of this subset: mean( sub_ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( ahrf_df[ , "mhi_2020" ] , na.rm = TRUE ) tapply( ahrf_df[ , "mhi_2020" ] , ahrf_df[ , "cbsa_indicator_code" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( mhi_2020 ~ whole_county_hpsa_2022 , ahrf_df ) Perform a chi-squared test of association: this_table <- table( ahrf_df[ , c( "whole_county_hpsa_2022" , "census_region" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( mhi_2020 ~ whole_county_hpsa_2022 + census_region , data = ahrf_df ) summary( glm_result ) Replication 
Example Match the record count in row number 8,543 of AHRF 2021-2022 Technical Documentation.xlsx: stopifnot( nrow( ahrf_df ) == 3232 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for AHRF users, this code replicates previously-presented examples: library(dplyr) ahrf_tbl <- as_tibble( ahrf_df ) Calculate the mean (average) of a linear variable, overall and by groups: ahrf_tbl %>% summarize( mean = mean( mhi_2020 , na.rm = TRUE ) ) ahrf_tbl %>% group_by( cbsa_indicator_code ) %>% summarize( mean = mean( mhi_2020 , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for AHRF users, this code replicates previously-presented examples: library(data.table) ahrf_dt <- data.table( ahrf_df ) Calculate the mean (average) of a linear variable, overall and by groups: ahrf_dt[ , mean( mhi_2020 , na.rm = TRUE ) ] ahrf_dt[ , mean( mhi_2020 , na.rm = TRUE ) , by = cbsa_indicator_code ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for AHRF users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'ahrf' , ahrf_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( mhi_2020 ) FROM ahrf' ) dbGetQuery( con , 'SELECT cbsa_indicator_code , AVG( mhi_2020 ) FROM ahrf GROUP BY cbsa_indicator_code' ) "],["american-housing-survey-ahs.html", "American Housing Survey (AHS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " American Housing Survey (AHS) The nationwide assessment of housing stock, with information on physical condition and neighborhood, costs of financing and maintenance, owner and renter characteristics, and changes over time. Nationally-representative and metropolitan flat files with one row per household, plus relational files. A complex sample survey of occupied and vacant housing units designed to generalize to all structures in the United States, both nationally and also for about thirty-five metropolitan areas. Released more or less biennially since 1973, with longitudinal samples redrawn in 1985 and 2015. Sponsored by the Department of Housing and Urban Development, run by the Census Bureau. 
Recommended Reading Four Example Strengths & Limitations: ✔️ National, state, and metro area geographies ✔️ Housing unit-focused questionnaire provides greater detail on housing stock ❌ Housing unit-focused questionnaire asks fewer detailed questions of occupants on some topics ❌ Underreported estimate of adjustable rate mortgages Three Example Findings: In 2017, 21% of residences nationwide did not have adequate space for COVID-19 isolation. From 1991 to 2017, single men earned 1.5% higher housing investment returns vs. women. More than a quarter of a million households receiving HUD rental assistance lacked accessibility features but had a member using a mobility device (like a wheelchair or walker) in 2019. Two Methodology Documents: Getting Started with the Public Use File: 2015 to 2023 Wikipedia Entry One Haiku: # real estate supply # half bath addition, raised roof # vent, rent too damn high Download, Import, Preparation Download and import the national 2023 flat file: library(haven) library(httr) tf <- tempfile() this_url <- paste0( "https://www2.census.gov/programs-surveys/ahs/" , "2023/AHS%202023%20National%20PUF%20v1.0%20Flat%20SAS.zip" ) GET( this_url , write_disk( tf ) , progress() ) ahs_tbl <- read_sas( tf ) ahs_df <- data.frame( ahs_tbl ) names( ahs_df ) <- tolower( names( ahs_df ) ) Save Locally   Save the object at any point: # ahs_fn <- file.path( path.expand( "~" ) , "AHS" , "this_file.rds" ) # saveRDS( ahs_df , file = ahs_fn , compress = FALSE ) Load the same object: # ahs_df <- readRDS( ahs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) ahs_design <- svrepdesign( weights = ~ weight , repweights = "repweight[1-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , mse = TRUE , data = ahs_df ) Variable Recoding Add new columns to the data set: ahs_design <- update( ahs_design , one = 1 , tenure = factor( ifelse( tenure %in% c( -6 , 'N' ) , 4 , tenure ) , levels = 1:4 , labels = c( 'Owned or being bought' , 'Rented for cash rent' , 'Occupied without payment of cash rent' , 'Not occupied' ) ) , lotsize = factor( lotsize , levels = 1:7 , labels = c( "Less than 1/8 acre" , "1/8 up to 1/4 acre" , "1/4 up to 1/2 acre" , "1/2 up to 1 acre" , "1 up to 5 acres" , "5 up to 10 acres" , "10 acres or more" ) ) , below_poverty = as.numeric( perpovlvl < 100 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( ahs_design , "sampling" ) != 0 ) svyby( ~ one , ~ tenure , ahs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , ahs_design ) svyby( ~ one , ~ tenure , ahs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ totrooms , ahs_design , na.rm = TRUE ) svyby( ~ totrooms , ~ tenure , ahs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ lotsize , ahs_design , na.rm = TRUE ) svyby( ~ lotsize , ~ tenure , ahs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ totrooms , ahs_design , na.rm = TRUE ) svyby( ~ totrooms , ~ tenure , ahs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ lotsize , ahs_design , na.rm = TRUE ) svyby( ~ lotsize , ~ tenure , ahs_design , svytotal , na.rm = TRUE ) 
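Because this design uses Fay’s method, the replicate-based standard error of any estimate can be reproduced by hand. A minimal sketch, assuming the ahs_design object above with rho = 0.5, mse = TRUE, and combined replicate weights: full_est <- coef( svytotal( ~ one , ahs_design ) ) # replicate estimates of the weighted total are the column sums of the analysis weights rep_ests <- colSums( weights( ahs_design , "analysis" ) ) # fay standard error: root mean squared deviation of the replicate estimates from the full-sample estimate, scaled by 1 / ( 1 - rho ) manual_se <- sqrt( mean( ( rep_ests - full_est )^2 ) ) / ( 1 - 0.5 ) stopifnot( all.equal( manual_se , as.numeric( SE( svytotal( ~ one , ahs_design ) ) ) ) ) 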
Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ totrooms , ahs_design , 0.5 , na.rm = TRUE ) svyby( ~ totrooms , ~ tenure , ahs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ totrooms , denominator = ~ rent , ahs_design , na.rm = TRUE ) Subsetting Restrict the survey design to homes with a garage or carport: sub_ahs_design <- subset( ahs_design , garage == 1 ) Calculate the mean (average) of this subset: svymean( ~ totrooms , sub_ahs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ totrooms , ahs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ totrooms , ~ tenure , ahs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ahs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ totrooms , ahs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ totrooms , ahs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ totrooms , ahs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ below_poverty , ahs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( totrooms ~ below_poverty , ahs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ below_poverty + lotsize , ahs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( totrooms ~ below_poverty + lotsize , ahs_design ) summary( glm_result ) Replication Example This example matches the estimate and margin of error of the Total row of the General Housing tab from the AHS 2023 Table Specifications and PUF Estimates for User Verification: result <- svytotal( ~ as.numeric( intstatus == 1 ) , ahs_design ) stopifnot( round( coef( result ) / 1000 , 0 ) == 133231 ) ci_results <- confint( result , level = 0.9 ) stopifnot( round( ( ci_results[ 2 ] - coef( result ) ) / 1000 , 0 ) == 381 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for AHS users, this code replicates previously-presented examples: library(srvyr) ahs_srvyr_design <- as_survey( ahs_design ) Calculate the mean (average) of a linear variable, overall and by groups: ahs_srvyr_design %>% summarize( mean = survey_mean( totrooms , na.rm = TRUE ) ) ahs_srvyr_design %>% group_by( tenure ) %>% summarize( mean = survey_mean( totrooms , na.rm = TRUE ) ) "],["american-national-election-studies-anes.html", "American National Election Studies (ANES) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " American National Election Studies (ANES) A time series recording belief, public opinion, and political participation back to Dewey vs. Truman. Most tables contain one row per sampled eligible voter, varying weights like pre- and post-election. A complex sample generalizing to eligible voters in the U.S. with some panels to follow individuals. Core studies released quadrennially (presidential elections), plus pilot studies (often at midterms). Administered by a consortium of universities and funded by the National Science Foundation. Recommended Reading Four Example Strengths & Limitations: ✔️ Time series studies interview both before and after quadrennial elections ✔️ Instrument design tested in smaller study prior to inclusion ❌ Turnout errors are part of a long-standing problem of turnout over-estimation in surveys ❌ Prior survey questions not always asked again Three Example Findings: Younger Americans were less politically polarized than older Americans in 2020. In 2020, 90% of Biden and Trump voters also opted for a congressional candidate of the same party. Between 1996 and 2016, demographic groups least likely to use the Internet and social media experienced larger changes in political polarization than those more likely to use the Internet. 
Two Methodology Documents: ANES 2020 Time Series Study Full Release: User Guide and Codebook How to Analyze ANES Survey Data One Haiku: # chess sacrificed queen # quadrennial bloodless coup # knight churchill's least worst Function Definitions Define a function to import a stata file as a data.frame: library(haven) anes_import_dta <- function( this_fn ){ this_tbl <- read_dta( this_fn ) this_tbl <- zap_labels( this_tbl ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Register for the ANES Data Center at https://electionstudies.org/ Choose 2020 Time Series Study Download the STATA version of the February 10, 2022 file: library(haven) anes_fn <- file.path( path.expand( "~" ) , "anes_timeseries_2020_stata_20220210.dta" ) anes_df <- anes_import_dta( anes_fn ) Save Locally   Save the object at any point: # anes_fn <- file.path( path.expand( "~" ) , "ANES" , "this_file.rds" ) # saveRDS( anes_df , file = anes_fn , compress = FALSE ) Load the same object: # anes_df <- readRDS( anes_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) anes_design <- svydesign( ids = ~ v200010c , strata = ~ v200010d , weights = ~ v200010a , data = subset( anes_df , v200010a > 0 ) , nest = TRUE ) Variable Recoding Add new columns to the data set: anes_design <- update( anes_design , one = 1 , democratic_party_rating = ifelse( v201156 %in% 0:100 , v201156 , NA ) , republican_party_rating = ifelse( v201157 %in% 0:100 , v201157 , NA ) , primary_voter = ifelse( v201020 %in% 1:2 , as.numeric( v201020 == 1 ) , NA ) , think_gov_spend_least = factor( v201645 , levels = 1:4 , labels = c( 'foreign aid (correct)' , 'medicare' , 'national defense' , 'social security' ) ) , undoc_kids = factor( v201423x , levels = 1:6 , labels = c( 'should be sent back - favor a great deal' , 'should be sent back - favor a moderate amount' , 'should be sent back - favor a little' , 'should be allowed to stay - favor a little' , 'should be allowed to stay - favor a moderate amount' , 'should be allowed to stay - favor a great deal' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( anes_design , "sampling" ) != 0 ) svyby( ~ one , ~ undoc_kids , anes_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , anes_design ) svyby( ~ one , ~ undoc_kids , anes_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ republican_party_rating , anes_design , na.rm = TRUE ) svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ think_gov_spend_least , anes_design , na.rm = TRUE ) svyby( ~ think_gov_spend_least , ~ undoc_kids , anes_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ republican_party_rating , anes_design , na.rm = TRUE ) svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ think_gov_spend_least , anes_design , na.rm = TRUE ) svyby( ~ think_gov_spend_least , ~ undoc_kids , anes_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups:
svyquantile( ~ republican_party_rating , anes_design , 0.5 , na.rm = TRUE ) svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ republican_party_rating , denominator = ~ democratic_party_rating , anes_design , na.rm = TRUE ) Subsetting Restrict the survey design to party id: independent: sub_anes_design <- subset( anes_design , v201231x == 4 ) Calculate the mean (average) of this subset: svymean( ~ republican_party_rating , sub_anes_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ republican_party_rating , anes_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ republican_party_rating , ~ undoc_kids , anes_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( anes_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ republican_party_rating , anes_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ republican_party_rating , anes_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ republican_party_rating , anes_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ primary_voter , anes_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( republican_party_rating ~ primary_voter , anes_design ) Perform a chi-squared test of association for survey data: svychisq( ~ primary_voter + think_gov_spend_least , anes_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( republican_party_rating ~ primary_voter + think_gov_spend_least , anes_design ) summary( glm_result ) Replication Example This example matches statistics and standard errors in the Age rows of the ANES respondents (weighted) column of Table 1A from Benchmark and Attrition Report for the ANES 2016 Time Series Study: Log in to the ANES Data Center at https://electionstudies.org/ Choose 2016 Time Series Study. 
Download the DTA version of the September 4, 2019 file Download the DTA version of the Methodology File December 10, 2018 anes2016_fn <- file.path( path.expand( "~" ) , "anes_timeseries_2016.dta" ) anes2016_df <- anes_import_dta( anes2016_fn ) method2016_fn <- file.path( path.expand( "~" ) , "anes_timeseries_2016_methodology_dta.dta" ) method2016_df <- anes_import_dta( method2016_fn ) before_nrow <- nrow( anes2016_df ) anes2016_df <- merge( anes2016_df , method2016_df , by = 'v160001' ) stopifnot( nrow( anes2016_df ) == before_nrow ) anes2016_df[ , 'age_categories' ] <- factor( findInterval( anes2016_df[ , 'v161267' ] , c( 18 , seq( 30 , 70 , 10 ) ) ) , levels = 1:6 , labels = c( '18-29' , '30-39' , '40-49' , '50-59' , '60-69' , '70 or older' ) ) anes2016_design <- svrepdesign( data = subset( anes2016_df , v160101f > 0 ) , weights = ~ v160101f , repweights = 'weight_ftf_rkwt([0-9]+)' , type = 'JK1' , scale = 32 / 33 ) ( results <- svymean( ~ age_categories , anes2016_design , na.rm = TRUE ) ) published_results <- c( 0.21 , 0.158 , 0.156 , 0.2 , 0.147 , 0.129 ) published_standard_errors <- c( 0.0091 , 0.009 , 0.0094 , 0.0122 , 0.0069 , 0.0083 ) stopifnot( all( round( coef( results ) , 3 ) == published_results ) ) stopifnot( all( round( SE( results ) , 4 ) == published_standard_errors ) ) This example matches statistics and standard errors in the Age rows of the Design-consistent, with published strata column of Table 1 from How to Analyze ANES Survey Data: Log in to the ANES Data Center at https://electionstudies.org/ Choose 2004 Time Series Study Download the DTA version of the Full Release August 16, 2005 file Choose 2006 Pilot Study Download the DTA version of the April 26, 2007 file anes2004_fn <- file.path( path.expand( "~" ) , "anes2004TS.dta" ) anes2004_df <- anes_import_dta( anes2004_fn ) pilot2006_fn <- file.path( path.expand( "~" ) , "anes2006pilot.dta" ) pilot2006_df <- anes_import_dta( pilot2006_fn ) before_nrow <- nrow( pilot2006_df ) pilot2006_df <- merge( pilot2006_df , anes2004_df , by.x = 'v06p001' , by.y = 'v040001' ) stopifnot( nrow( pilot2006_df ) == before_nrow ) pilot2006_df[ , 'age_categories' ] <- factor( findInterval( pilot2006_df[ , 'v043250' ] , c( 18 , seq( 30 , 70 , 10 ) ) ) , levels = 1:6 , labels = c( '18-29' , '30-39' , '40-49' , '50-59' , '60-69' , '70 or older' ) ) pilot2006_design <- svydesign( id = ~v06p007b , strata = ~v06p007a , data = pilot2006_df , weights = ~v06p002 , nest = TRUE ) ( results <- svymean( ~ age_categories , pilot2006_design , na.rm = TRUE ) ) published_results <- c( 0.207 , 0.162 , 0.218 , 0.175 , 0.111 , 0.126 ) published_standard_errors <- c( 0.0251 , 0.024 , 0.022 , 0.0149 , 0.0125 , 0.0287 ) stopifnot( all( round( coef( results ) , 3 ) == published_results ) ) stopifnot( all( round( SE( results ) , 4 ) == published_standard_errors ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features.
As a starting point for ANES users, this code replicates previously-presented examples: library(srvyr) anes_srvyr_design <- as_survey( anes_design ) Calculate the mean (average) of a linear variable, overall and by groups: anes_srvyr_design %>% summarize( mean = survey_mean( republican_party_rating , na.rm = TRUE ) ) anes_srvyr_design %>% group_by( undoc_kids ) %>% summarize( mean = survey_mean( republican_party_rating , na.rm = TRUE ) ) "],["american-time-use-survey-atus.html", "American Time Use Survey (ATUS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " American Time Use Survey (ATUS) Sampled individuals write down everything they do for a single twenty-four hour period, in ten minute intervals. Time use data allows for the study of uncompensated work like cooking, chores, childcare. Many tables with structures described in the user guide, linkable to the Current Population Survey. A complex survey generalizing to person-hours among civilian non-institutional americans aged 15+. Released annually since 2003. Administered by the Bureau of Labor Statistics. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed respondent activity information ✔️ Network of international time use researchers ❌ Each individual respondent contributes only 24 hours of activity on “diary day” ❌ Limited sample sizes do not represent smaller geographic areas Three Example Findings: On average during 2021 and 2022, 37.1 million people in the US provided unpaid eldercare. Approximately 15% of working hours were performed at home in the US from 2011 to 2018. Low physical activity during 2014-2016 cannot be broadly attributed to limited leisure time. 
Two Methodology Documents: American Time Use Survey User’s Guide Wikipedia Entry One Haiku: # don't judge me bruno # eat one hour, sleep the rest # it's my lazy day Function Definitions Define a function to download, unzip, and import each comma-separated value dat file: library(httr) atus_csv_import <- function( this_url ){ this_tf <- tempfile() this_dl <- GET( this_url , user_agent( "email@address.com") ) writeBin( content( this_dl ) , this_tf ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) this_dat <- grep( '\\\\.dat$' , unzipped_files , value = TRUE ) this_df <- read.csv( this_dat ) file.remove( c( this_tf , unzipped_files ) ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the activity, respondent, roster, and weights tables: act_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusact-2023.zip" ) resp_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusresp-2023.zip" ) rost_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusrost-2023.zip" ) wgts_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atuswgts-2023.zip" ) Specify which variables to keep in each of the data.frame objects: act_df <- act_df[ c( 'tucaseid' , 'tutier1code' , 'tutier2code' , 'tuactdur24' ) ] resp_df <- resp_df[ c( 'tucaseid' , 'tufinlwgt' , 'tulineno' ) ] rost_df <- rost_df[ , c( 'tucaseid' , 'tulineno' , 'teage' , 'tesex' ) ] Distribute travel-related activities (tutier1code == 18 from the lexicon) based on their second tier code: act_df[ act_df[ , 'tutier1code' ] == 18 & act_df[ , 'tutier2code' ] == 99 , 'tutier1code' ] <- 50 act_df[ act_df[ , 'tutier1code' ] == 18 , 'tutier1code' ] <- act_df[ act_df[ , 'tutier1code' ] == 18 , 'tutier2code' ] Sum up all durations at the (respondent x major activity category)-level: act_long_df <- aggregate( tuactdur24 ~ tucaseid + tutier1code , data = act_df , sum ) act_wide_df <- reshape( act_long_df , idvar = 'tucaseid' , timevar = 'tutier1code' , direction = 'wide' ) # for individuals not engaging in an activity category, replace missings with zero minutes act_wide_df[ is.na( act_wide_df ) ] <- 0 # for all columns except the respondent identifier, convert minutes to hours act_wide_df[ , -1 ] <- act_wide_df[ , -1 ] / 60 Merge the respondent and summed activity tables, then the roster table, and finally the replicate weights: resp_act_df <- merge( resp_df , act_wide_df ) stopifnot( nrow( resp_act_df ) == nrow( resp_df ) ) resp_act_rost_df <- merge( resp_act_df , rost_df ) stopifnot( nrow( resp_act_rost_df ) == nrow( resp_df ) ) atus_df <- merge( resp_act_rost_df , wgts_df ) stopifnot( nrow( atus_df ) == nrow( resp_df ) ) # remove dots from column names names( atus_df ) <- gsub( "\\\\." 
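# note: the reshape above created columns like 'tuactdur24.3'; renaming them to 'tuactdur24_3' matches the recodes used later in this chapter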
, "_" , names( atus_df ) ) atus_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # atus_fn <- file.path( path.expand( "~" ) , "ATUS" , "this_file.rds" ) # saveRDS( atus_df , file = atus_fn , compress = FALSE ) Load the same object: # atus_df <- readRDS( atus_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) atus_design <- svrepdesign( weights = ~ tufinlwgt , repweights = "finlwgt[0-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , mse = TRUE , data = atus_df ) Variable Recoding Add new columns to the data set: # caring for and helping household members is top level 03 from the lexicon # https://www.bls.gov/tus/lexicons/lexiconwex2023.pdf atus_design <- update( atus_design , any_care = as.numeric( tuactdur24_3 > 0 ) , tesex = factor( tesex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , age_category = factor( 1 + findInterval( teage , c( 18 , 35 , 65 ) ) , labels = c( "under 18" , "18 - 34" , "35 - 64" , "65 or older" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( atus_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_category , atus_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , atus_design ) svyby( ~ one , ~ age_category , atus_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ tuactdur24_1 , atus_design ) svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ tesex , atus_design ) svyby( ~ tesex , ~ age_category , atus_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ tuactdur24_1 , atus_design ) svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ tesex , atus_design ) svyby( ~ tesex , ~ age_category , atus_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ tuactdur24_1 , atus_design , 0.5 ) svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ tuactdur24_5 , denominator = ~ tuactdur24_12 , atus_design ) Subsetting Restrict the survey design to any time volunteering: sub_atus_design <- subset( atus_design , tuactdur24_15 > 0 ) Calculate the mean (average) of this subset: svymean( ~ tuactdur24_1 , sub_atus_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ tuactdur24_1 , atus_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ tuactdur24_1 , ~ age_category , atus_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( atus_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ tuactdur24_1 , atus_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ tuactdur24_1 , atus_design , deff = TRUE ) # SRS with replacement 
svymean( ~ tuactdur24_1 , atus_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ any_care , atus_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( tuactdur24_1 ~ any_care , atus_design ) Perform a chi-squared test of association for survey data: svychisq( ~ any_care + tesex , atus_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( tuactdur24_1 ~ any_care + tesex , atus_design ) summary( glm_result ) Replication Example This example matches the “Caring for and helping household members” row of Table A-1: hours_per_day_civilian_population <- svymean( ~ tuactdur24_3 , atus_design ) stopifnot( round( coef( hours_per_day_civilian_population ) , 2 ) == 0.5 ) percent_engaged_per_day <- svymean( ~ any_care , atus_design ) stopifnot( round( coef( percent_engaged_per_day ) , 3 ) == 0.22 ) hours_per_day_among_engaged <- svymean( ~ tuactdur24_3 , subset( atus_design , any_care ) ) stopifnot( round( coef( hours_per_day_among_engaged ) , 2 ) == 2.29 ) This example matches the average hours and SE from Section 7.5 of the User’s Guide: Download and import the activity, activity summary, respondent, and weights tables: actsum07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atussum_2007.zip" ) resp07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusresp_2007.zip" ) act07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atusact_2007.zip" ) wgts07_df <- atus_csv_import( "https://www.bls.gov/tus/datafiles/atuswgts_2007.zip" ) Option 1. Sum the two television fields from the activity summary file, removing zeroes: television_per_person <- data.frame( tucaseid = actsum07_df[ , 'tucaseid' ] , tuactdur24 = rowSums( actsum07_df[ , c( 't120303' , 't120304' ) ] ) ) television_per_person <- television_per_person[ television_per_person[ , 'tuactdur24' ] > 0 , ] Option 2. 
Limit the activity file to television watching records according to the 2007 Lexicon: television_activity <- subset( act07_df , tutier1code == 12 & tutier2code == 3 & tutier3code %in% 3:4 ) television_activity_summed <- aggregate( tuactdur24 ~ tucaseid , data = television_activity , sum ) Confirm both aggregation options yield the same results: stopifnot( all( television_per_person[ , 'tucaseid' ] == television_activity_summed[ , 'tucaseid' ] ) ) stopifnot( all( television_per_person[ , 'tuactdur24' ] == television_activity_summed[ , 'tuactdur24' ] ) ) Merge the respondent and summed activity tables, then the replicate weights: resp07_tpp_df <- merge( resp07_df[ , c( 'tucaseid' , 'tufinlwgt' ) ] , television_per_person , all.x = TRUE ) stopifnot( nrow( resp07_tpp_df ) == nrow( resp07_df ) ) # for individuals without television time, replace missings with zero minutes resp07_tpp_df[ is.na( resp07_tpp_df[ , 'tuactdur24' ] ) , 'tuactdur24' ] <- 0 # convert minutes to hours resp07_tpp_df[ , 'tuactdur24_hour' ] <- resp07_tpp_df[ , 'tuactdur24' ] / 60 atus07_df <- merge( resp07_tpp_df , wgts07_df ) stopifnot( nrow( atus07_df ) == nrow( resp07_df ) ) Construct a complex sample survey design: atus07_design <- svrepdesign( weights = ~ tufinlwgt , repweights = "finlwgt[0-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , data = atus07_df ) Match the statistic and SE of the number of hours daily that americans older than 14 watch tv: result <- svymean( ~ tuactdur24_hour , atus07_design ) stopifnot( round( coef( result ) , 2 ) == 2.62 ) stopifnot( round( SE( result ) , 4 ) == 0.0293 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for ATUS users, this code replicates previously-presented examples: library(srvyr) atus_srvyr_design <- as_survey( atus_design ) Calculate the mean (average) of a linear variable, overall and by groups: atus_srvyr_design %>% summarize( mean = survey_mean( tuactdur24_1 ) ) atus_srvyr_design %>% group_by( age_category ) %>% summarize( mean = survey_mean( tuactdur24_1 ) ) "],["behavioral-risk-factor-surveillance-system-brfss.html", "Behavioral Risk Factor Surveillance System (BRFSS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Behavioral Risk Factor Surveillance System (BRFSS) A health behavior telephone interview survey with enough sample size to examine all fifty states. One table with one row per telephone respondent. A complex survey designed to generalize to the civilian non-institutional adult population of the U.S. Released annually since 1984 but all states did not participate until 1994. Administered by the Centers for Disease Control and Prevention. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Wide variety of disease surveillance utilities across every state ✔️ Sufficient sample to examine selected cities and counties in addition to all states ❌ Not every topical module asked in every state ❌ Kentucky and Pennsylvania were unable to collect enough data for the public use file in 2023 Three Example Findings: Among adults in 2000, 52% of ever smokers had quit smoking, and this ratio rose to 61% by 2019. By 2030, 49% of US adults will have obesity, and in every state this rate will be above 35%. Disabled Iowan adults in 2019 were more than three times more likely to indicate having depression. Two Methodology Documents: BRFSS Data User Guide Wikipedia Entry One Haiku: # a cellphone vibrates # it's the cdc! asking # if you ate veggies Download, Import, Preparation Download and import the national file: library(haven) zip_tf <- tempfile() zip_url <- "https://www.cdc.gov/brfss/annual_data/2023/files/LLCP2023XPT.zip" download.file( zip_url , zip_tf , mode = 'wb' ) brfss_tbl <- read_xpt( zip_tf ) brfss_df <- data.frame( brfss_tbl ) names( brfss_df ) <- tolower( names( brfss_df ) ) brfss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # brfss_fn <- file.path( path.expand( "~" ) , "BRFSS" , "this_file.rds" ) # saveRDS( brfss_df , file = brfss_fn , compress = FALSE ) Load the same object: # brfss_df <- readRDS( brfss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) variables_to_keep <- c( 'one' , 'x_psu' , 'x_ststr' , 'x_llcpwt' , 'genhlth' , 'medcost1' , 'x_state' , 'x_age80' , 'physhlth' , 'menthlth' , 'x_hlthpl1' ) brfss_df <- brfss_df[ variables_to_keep ] brfss_national_design <- svydesign( id = ~ x_psu , strata = ~ x_ststr , data = brfss_df , weight = ~ x_llcpwt , nest = TRUE ) Since large linearized survey designs execute slowly, a replication design might be preferable for exploratory analysis. Coefficients (such as means and medians) do not change, but standard errors and confidence intervals differ slightly.
The initial conversion with as.svrepdesign requires an extended period of processing time (perhaps run once overnight), subsequent analyses will finish much faster: # brfss_replication_design <- # as.svrepdesign( # brfss_national_design , # type = 'bootstrap' # ) # system.time( print( svymean( ~ x_age80 , brfss_national_design ) ) ) # system.time( print( svymean( ~ x_age80 , brfss_replication_design ) ) ) In this example, limit the national design to only Alaska for quicker processing: brfss_design <- subset( brfss_national_design , x_state %in% 2 ) Variable Recoding Add new columns to the data set: brfss_design <- update( brfss_design , fair_or_poor_health = ifelse( genhlth %in% 1:5 , as.numeric( genhlth > 3 ) , NA ) , no_doc_visit_due_to_cost = factor( medcost1 , levels = c( 1 , 2 , 7 , 9 ) , labels = c( "yes" , "no" , "dk" , "rf" ) ) , physhlth_days_not_good = ifelse( physhlth <= 30 , physhlth , ifelse( physhlth == 88 , 0 , NA ) ) , menthlth_days_not_good = ifelse( menthlth <= 30 , menthlth , ifelse( menthlth == 88 , 0 , NA ) ) , state_name = factor( x_state , levels = c(1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 66, 72, 78) , labels = c("ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", "DELAWARE", "DISTRICT OF COLUMBIA", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", "WISCONSIN", "WYOMING", "GUAM", "PUERTO RICO", "U.S. 
VIRGIN ISLANDS") ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( brfss_design , "sampling" ) != 0 ) svyby( ~ one , ~ state_name , brfss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , brfss_design ) svyby( ~ one , ~ state_name , brfss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ x_age80 , brfss_design ) svyby( ~ x_age80 , ~ state_name , brfss_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ no_doc_visit_due_to_cost , brfss_design , na.rm = TRUE ) svyby( ~ no_doc_visit_due_to_cost , ~ state_name , brfss_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ x_age80 , brfss_design ) svyby( ~ x_age80 , ~ state_name , brfss_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ no_doc_visit_due_to_cost , brfss_design , na.rm = TRUE ) svyby( ~ no_doc_visit_due_to_cost , ~ state_name , brfss_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ x_age80 , brfss_design , 0.5 ) svyby( ~ x_age80 , ~ state_name , brfss_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ physhlth_days_not_good , denominator = ~ menthlth_days_not_good , brfss_design , na.rm = TRUE ) Subsetting Restrict the survey design to persons without health insurance: sub_brfss_design <- subset( brfss_design , x_hlthpl1 == 2 ) Calculate the mean (average) of this subset: svymean( ~ x_age80 , sub_brfss_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ x_age80 , brfss_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ x_age80 , ~ state_name , brfss_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( brfss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ x_age80 , brfss_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ x_age80 , brfss_design , deff = TRUE ) # SRS with replacement svymean( ~ x_age80 , brfss_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ fair_or_poor_health , brfss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( x_age80 ~ fair_or_poor_health , brfss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ fair_or_poor_health + no_doc_visit_due_to_cost , brfss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( x_age80 ~ fair_or_poor_health + no_doc_visit_due_to_cost , brfss_design ) summary( glm_result ) Replication Example This example matches Alaska’s confidence intervals from the BRFSS Prevalence & Trends Data: result <- svymean( ~ no_doc_visit_due_to_cost , subset( brfss_design , no_doc_visit_due_to_cost %in% c( 'yes' , 'no' ) ) , na.rm = TRUE ) stopifnot( round( coef( result )[1] , 3 ) == 0.111 ) stopifnot( round( confint( result )[ 1 , 1 ] , 3 ) == 0.098 ) stopifnot( round( confint( result )[ 1 , 2 ] , 3 ) == 0.123 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for BRFSS users, this code replicates previously-presented examples: library(srvyr) brfss_srvyr_design <- as_survey( brfss_design ) Calculate the mean (average) of a linear variable, overall and by groups: brfss_srvyr_design %>% summarize( mean = survey_mean( x_age80 ) ) brfss_srvyr_design %>% group_by( state_name ) %>% summarize( mean = survey_mean( x_age80 ) ) "],["consumer-expenditure-survey-ces.html", "Consumer Expenditure Survey (CES) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Consumer Expenditure Survey (CES) A household budget survey designed to guide major economic indicators like the Consumer Price Index. One table of survey responses per quarter with one row per sampled household (consumer unit). Additional tables containing one record per expenditure. A complex sample survey designed to generalize to the civilian non-institutional U.S. population. Released annually since 1996. Administered by the Bureau of Labor Statistics. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed expenditure categories ✔️ Respondents diary spending for two consecutive 1-week periods ❌ Measures purchases but not consumption ❌ Consumer unit definition differs from households or families in other surveys Three Example Findings: In 2022, one third of total nationwide expenditures were attributed to housing-related expenses. Between 2015 and early 2022, male household heads consumed a greater proportion of resources (33%) compared to female household heads (28%), who, in turn, consumed more than children (23%). In 2020, if income increased by $100, spending on all food and alcohol increased by $14 on average. Two Methodology Documents: Consumer Expenditure Surveys Public Use Microdata Getting Started Guide Wikipedia Entry One Haiku: # price indices and # you spent how much on beans, jack?
# pocketbook issues Download, Import, Preparation Download both the prior and current year of interview microdata: library(httr) tf_prior_year <- tempfile() this_url_prior_year <- "https://www.bls.gov/cex/pumd/data/stata/intrvw22.zip" dl_prior_year <- GET( this_url_prior_year , user_agent( "email@address.com" ) ) writeBin( content( dl_prior_year ) , tf_prior_year ) unzipped_files_prior_year <- unzip( tf_prior_year , exdir = tempdir() ) tf_current_year <- tempfile() this_url_current_year <- "https://www.bls.gov/cex/pumd/data/stata/intrvw23.zip" dl_current_year <- GET( this_url_current_year , user_agent( "email@address.com" ) ) writeBin( content( dl_current_year ) , tf_current_year ) unzipped_files_current_year <- unzip( tf_current_year , exdir = tempdir() ) unzipped_files <- c( unzipped_files_current_year , unzipped_files_prior_year ) Import and stack all 2023 quarterly files plus 2024’s first quarter: library(haven) fmli_files <- grep( "fmli2[3-4]" , unzipped_files , value = TRUE ) fmli_tbls <- lapply( fmli_files , read_dta ) fmli_dfs <- lapply( fmli_tbls , data.frame ) fmli_dfs <- lapply( fmli_dfs , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) fmli_cols <- lapply( fmli_dfs , names ) intersecting_cols <- Reduce( intersect , fmli_cols ) fmli_dfs <- lapply( fmli_dfs , function( w ) w[ intersecting_cols ] ) ces_df <- do.call( rbind , fmli_dfs ) Scale the weight columns based on the number of months in 2023: ces_df[ , c( 'qintrvyr' , 'qintrvmo' ) ] <- sapply( ces_df[ , c( 'qintrvyr' , 'qintrvmo' ) ] , as.numeric ) weight_columns <- grep( 'wt' , names( ces_df ) , value = TRUE ) ces_df <- transform( ces_df , mo_scope = ifelse( qintrvyr %in% 2023 & qintrvmo %in% 1:3 , qintrvmo - 1 , ifelse( qintrvyr %in% 2024 , 4 - qintrvmo , 3 ) ) ) for ( this_column in weight_columns ){ ces_df[ is.na( ces_df[ , this_column ] ) , this_column ] <- 0 ces_df[ , paste0( 'popwt_' , this_column ) ] <- ( ces_df[ , this_column ] * ces_df[ , 'mo_scope' ] / 12 ) } Combine previous quarter and current quarter variables into a single variable: expenditure_variables <- gsub( "pq$" , "" , grep( "pq$" , names( ces_df ) , value = TRUE ) ) # confirm that for every variable ending in pq, # there's the same variable ending in cq stopifnot( all( paste0( expenditure_variables , 'cq' ) %in% names( ces_df ) ) ) # confirm none of the variables without the pq or cq suffix exist if( any( expenditure_variables %in% names( ces_df ) ) ) stop( "variable conflict" ) for( this_column in expenditure_variables ){ ces_df[ , this_column ] <- rowSums( ces_df[ , paste0( this_column , c( 'pq' , 'cq' ) ) ] , na.rm = TRUE ) # annualize the quarterly spending ces_df[ , this_column ] <- 4 * ces_df[ , this_column ] ces_df[ is.na( ces_df[ , this_column ] ) , this_column ] <- 0 } Append any interview survey UCC found at https://www.bls.gov/cex/ce_source_integrate.xlsx: ucc_exp <- c( "450110" , "450210" ) mtbi_files <- grep( "mtbi2[3-4]" , unzipped_files , value = TRUE ) mtbi_tbls <- lapply( mtbi_files , read_dta ) mtbi_dfs <- lapply( mtbi_tbls , data.frame ) mtbi_dfs <- lapply( mtbi_dfs , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) mtbi_dfs <- lapply( mtbi_dfs , function( w ) w[ c( 'newid' , 'cost' , 'ucc' , 'ref_yr' ) ] ) mtbi_df <- do.call( rbind , mtbi_dfs ) mtbi_df <- subset( mtbi_df , ( ref_yr %in% 2023 ) & ( ucc %in% ucc_exp ) ) mtbi_agg <- aggregate( cost ~ newid , data = mtbi_df , sum ) names( mtbi_agg ) <- c( 'newid' , 'new_car_truck_exp' ) before_nrow <- nrow( ces_df ) ces_df <- merge( ces_df , mtbi_agg , all.x = TRUE 
) stopifnot( nrow( ces_df ) == before_nrow ) ces_df[ is.na( ces_df[ , 'new_car_truck_exp' ] ) , 'new_car_truck_exp' ] <- 0 Save Locally   Save the object at any point: # ces_fn <- file.path( path.expand( "~" ) , "CES" , "this_file.rds" ) # saveRDS( ces_df , file = ces_fn , compress = FALSE ) Load the same object: # ces_df <- readRDS( ces_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Separate the ces_df data.frame into five implicates, each differing from the others only in the multiply-imputed variables: library(survey) library(mitools) # create a vector containing all of the multiply-imputed variables # (leaving the numbers off the end) mi_vars <- gsub( "5$" , "" , grep( "[a-z]5$" , names( ces_df ) , value = TRUE ) ) # loop through each of the five implicates.. for ( i in 1:5 ){ # copy the 'ces_df' table over to a new temporary data frame 'x' x <- ces_df # loop through each of the multiply-imputed variables.. for ( j in mi_vars ){ # copy the contents of the current column (for example 'welfare1') # over to a new column ending in 'mi' (for example 'welfaremi') x[ , paste0( j , 'mi' ) ] <- x[ , paste0( j , i ) ] # delete all five of the imputed variable columns x <- x[ , !( names( x ) %in% paste0( j , 1:5 ) ) ] } assign( paste0( 'imp' , i ) , x ) } ces_design <- svrepdesign( weights = ~ finlwt21 , repweights = "^wtrep[0-9][0-9]$" , data = imputationList( list( imp1 , imp2 , imp3 , imp4 , imp5 ) ) , type = "BRR" , combined.weights = TRUE , mse = TRUE ) Variable Recoding Add new columns to the data set: ces_design <- update( ces_design , one = 1 , any_food_stamp = as.numeric( jfs_amtmi > 0 ) , bls_urbn = factor( bls_urbn , levels = 1:2 , labels = c( 'urban' , 'rural' ) ) , sex_ref = factor( sex_ref , levels = 1:2 , labels = c( 'male' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( ces_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( ces_design , svyby( ~ one , ~ bls_urbn , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: MIcombine( with( ces_design , svytotal( ~ one ) ) ) MIcombine( with( ces_design , svyby( ~ one , ~ bls_urbn , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( ces_design , svymean( ~ totexp ) ) ) MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( ces_design , svymean( ~ sex_ref ) ) ) MIcombine( with( ces_design , svyby( ~ sex_ref , ~ bls_urbn , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( ces_design , svytotal( ~ totexp ) ) ) MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( ces_design , svytotal( ~ sex_ref ) ) ) MIcombine( with( ces_design , svyby( ~ sex_ref , ~ bls_urbn , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: MIcombine( with( ces_design , svyquantile( ~ totexp , 0.5 , se = TRUE ) ) ) MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: MIcombine( with( ces_design , svyratio( numerator = ~ totexp , denominator = ~ fincbtxmi )
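# fincbtxmi: the 'mi'-suffixed income before taxes column built from the five imputed 'fincbtx' variables in the implicate loop above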
) ) Subsetting Restrict the survey design to california residents: sub_ces_design <- subset( ces_design , state == '06' ) Calculate the mean (average) of this subset: MIcombine( with( sub_ces_design , svymean( ~ totexp ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( ces_design , svymean( ~ totexp ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( ces_design , svyby( ~ totexp , ~ bls_urbn , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ces_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: MIcombine( with( ces_design , svyvar( ~ totexp ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( ces_design , svymean( ~ totexp , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( ces_design , svymean( ~ totexp , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ any_food_stamp , ces_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( totexp ~ any_food_stamp , ces_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ any_food_stamp + sex_ref , ces_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( ces_design , svyglm( totexp ~ any_food_stamp + sex_ref ) ) ) summary( glm_result ) Replication Example This example matches the number of consumer units and the Cars and trucks, new rows of Table R-1: result <- MIcombine( with( ces_design , svytotal( ~ as.numeric( popwt_finlwt21 / finlwt21 ) ) ) ) stopifnot( round( coef( result ) , -3 ) == 134556000 ) results <- sapply( weight_columns , function( this_column ){ sum( ces_df[ , 'new_car_truck_exp' ] * ces_df[ , this_column ] ) / sum( ces_df[ , paste0( 'popwt_' , this_column ) ] ) } ) stopifnot( round( results[1] , 2 ) == 2896.03 ) standard_error <- sqrt( ( 1 / 44 ) * sum( ( results[-1] - results[1] )^2 ) ) stopifnot( round( standard_error , 2 ) == 225.64 ) # note the minor differences MIcombine( with( ces_design , svymean( ~ cartkn ) ) ) "],["california-health-interview-survey-chis.html", "California Health Interview Survey (CHIS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " California Health Interview Survey (CHIS) California’s National Health Interview Survey (NHIS), a healthcare survey for the nation’s largest state. One adult, one teenage (12-17), and one child table, each with one row per sampled respondent. A complex survey designed to generalize to the civilian non-institutionalized population of California. Released annually since 2011, and biennially since 2001. Administered by the UCLA Center for Health Policy Research. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Neighborhood-level estimates ✔️ Oversamples allow targeted research questions ❌ Low response rates compared to nationwide surveys ❌ Two-year data periods reduce precision of trend analyses Three Example Findings: In 2021, adults with limited English proficiency were less likely to use video or telephone telehealth. The share of non-citizen kids reporting excellent health increased from 2013-2015 to 2017-2019. Adults working from home had worse health behaviors and mental health than other workers in 2021. Two Methodology Documents: CHIS 2021-2022 Methodology Report Series, Report 1: Sample Design CHIS 2021-2022 Methodology Report Series, Report 5: Weighting and Variance Estimation One Haiku: # strike gold, movie star # play, third wish cali genie # statewide health survey Function Definitions Define a function to unzip and import each Stata file: library(haven) chis_import <- function( this_fn ){ these_files <- unzip( this_fn , exdir = tempdir() ) stata_fn <- grep( "ADULT\\\\.|CHILD\\\\.|TEEN\\\\." , these_files , value = TRUE ) this_tbl <- read_stata( stata_fn ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) # remove labelled classes labelled_cols <- sapply( this_df , function( w ) class( w )[1] == 'haven_labelled' ) this_df[ labelled_cols ] <- sapply( this_df[ labelled_cols ] , as.numeric ) this_df } Download, Import, Preparation Register at the UCLA Center for Health Policy Research at https://healthpolicy.ucla.edu/user/register. Choose Year: 2022, Age Group: Adult and Teen and Child, File Type: Stata. Download the 2022 Adult, Teen, and Child Stata files (version Oct 2023). Import the adult, teen, and child stata tables into data.frame objects: chis_adult_df <- chis_import( file.path( path.expand( "~" ) , "adult_stata_2022.zip" ) ) chis_teen_df <- chis_import( file.path( path.expand( "~" ) , "teen_stata_2022.zip" ) ) chis_child_df <- chis_import( file.path( path.expand( "~" ) , "child_stata_2022.zip" ) ) Harmonize the general health condition variable across the three data.frame objects: chis_adult_df[ , 'general_health' ] <- c( 1 , 2 , 3 , 4 , 4 )[ chis_adult_df[ , 'ab1' ] ] chis_teen_df[ , 'general_health' ] <- chis_teen_df[ , 'tb1_p1' ] chis_child_df[ , 'general_health' ] <- c( 1 , 2 , 3 , 4 , 4 )[ chis_child_df[ , 'ca6' ] ] Add four age categories across the three data.frame objects: chis_adult_df[ , 'age_categories' ] <- ifelse( chis_adult_df[ , 'srage_p1' ] >= 65 , 4 , 3 ) chis_teen_df[ , 'age_categories' ] <- 2 chis_child_df[ , 'age_categories' ] <- 1 Harmonize the usual source of care variable across the three data.frame objects: chis_adult_df[ , 'no_usual_source_of_care' ] <- as.numeric( chis_adult_df[ , 'ah1v2' ] == 2 ) chis_teen_df[ , 'no_usual_source_of_care' ] <- as.numeric( chis_teen_df[ , 'tf1v2' ] == 2 ) chis_child_df[ , 'no_usual_source_of_care' ] <- as.numeric( chis_child_df[ , 'cd1v2' ] == 2 ) Add monthly fruit and vegetable counts to the adult data.frame object, blanking the other two: chis_adult_df[ , 'adult_fruits_past_month' ] <- chis_adult_df[ , 'ae2' ] chis_adult_df[ , 'adult_veggies_past_month' ] <- chis_adult_df[ , 'ae7' ] chis_teen_df[ , c( 'adult_fruits_past_month' , 'adult_veggies_past_month' ) ] <- NA chis_child_df[ , c( 'adult_fruits_past_month' , 'adult_veggies_past_month' ) ] <- NA Specify which variables to keep in each of the data.frame objects, then stack them: variables_to_keep <- c( grep( '^rakedw' , names( chis_adult_df ) , value = TRUE ) ,
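# rakedw0 is the full-sample raked weight; the other rakedw columns are its replicate weights, used in the survey design definition below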
'general_health' , 'age_categories' , 'adult_fruits_past_month' , 'adult_veggies_past_month' , 'srsex' , 'povll2_p1v2' , 'no_usual_source_of_care' ) chis_df <- rbind( chis_child_df[ variables_to_keep ] , chis_teen_df[ variables_to_keep ] , chis_adult_df[ variables_to_keep ] ) Save Locally   Save the object at any point: # chis_fn <- file.path( path.expand( "~" ) , "CHIS" , "this_file.rds" ) # saveRDS( chis_df , file = chis_fn , compress = FALSE ) Load the same object: # chis_df <- readRDS( chis_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) chis_design <- svrepdesign( data = chis_df , weights = ~ rakedw0 , repweights = "rakedw[1-9]" , type = "other" , scale = 1 , rscales = 1 , mse = TRUE ) Variable Recoding Add new columns to the data set: chis_design <- update( chis_design , one = 1 , gender = factor( srsex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , age_categories = factor( age_categories , levels = 1:4 , labels = c( 'children under 12' , 'teens age 12-17' , 'adults age 18-64' , 'seniors' ) ) , general_health = factor( general_health , levels = 1:4 , labels = c( 'Excellent' , 'Very good' , 'Good' , 'Fair/Poor' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( chis_design , "sampling" ) != 0 ) svyby( ~ one , ~ general_health , chis_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , chis_design ) svyby( ~ one , ~ general_health , chis_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ povll2_p1v2 , chis_design ) svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ gender , chis_design ) svyby( ~ gender , ~ general_health , chis_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ povll2_p1v2 , chis_design ) svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ gender , chis_design ) svyby( ~ gender , ~ general_health , chis_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ povll2_p1v2 , chis_design , 0.5 ) svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ adult_fruits_past_month , denominator = ~ adult_veggies_past_month , chis_design , na.rm = TRUE ) Subsetting Restrict the survey design to seniors: sub_chis_design <- subset( chis_design , age_categories == 'seniors' ) Calculate the mean (average) of this subset: svymean( ~ povll2_p1v2 , sub_chis_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ povll2_p1v2 , chis_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ povll2_p1v2 , ~ general_health , chis_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( chis_design ) Calculate the complex sample survey-adjusted variance of any 
statistic: svyvar( ~ povll2_p1v2 , chis_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ povll2_p1v2 , chis_design , deff = TRUE ) # SRS with replacement svymean( ~ povll2_p1v2 , chis_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ no_usual_source_of_care , chis_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( povll2_p1v2 ~ no_usual_source_of_care , chis_design ) Perform a chi-squared test of association for survey data: svychisq( ~ no_usual_source_of_care + gender , chis_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( povll2_p1v2 ~ no_usual_source_of_care + gender , chis_design ) summary( glm_result ) Replication Example This matches the proportions and counts from AskCHIS. The standard errors do not match precisely, but the team at UCLA confirmed this survey design definition to be correct, and that the minor standard error and confidence interval differences should not impact any analyses from a statistical perspective: chis_adult_design <- svrepdesign( data = chis_adult_df , weights = ~ rakedw0 , repweights = "rakedw[1-9]" , type = "other" , scale = 1 , rscales = 1 , mse = TRUE ) chis_adult_design <- update( chis_adult_design , ab1 = factor( ab1 , levels = 1:5 , labels = c( 'Excellent' , 'Very good' , 'Good' , 'Fair' , 'Poor' ) ) ) this_proportion <- svymean( ~ ab1 , chis_adult_design ) stopifnot( round( coef( this_proportion ) , 3 ) == c( 0.183 , 0.340 , 0.309 , 0.139 , 0.029 ) ) this_count <- svytotal( ~ ab1 , chis_adult_design ) stopifnot( round( coef( this_count ) , -3 ) == c( 5414000 , 10047000 , 9138000 , 4106000 , 855000 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for CHIS users, this code replicates previously-presented examples: library(srvyr) chis_srvyr_design <- as_survey( chis_design ) Calculate the mean (average) of a linear variable, overall and by groups: chis_srvyr_design %>% summarize( mean = survey_mean( povll2_p1v2 ) ) chis_srvyr_design %>% group_by( general_health ) %>% summarize( mean = survey_mean( povll2_p1v2 ) ) "],["census-of-governments-cog.html", "Census of Governments (COG) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Census of Governments (COG) Location, employment, and payroll for state and local (but not federal) government agencies in the U.S. One record per agency, one per agency function, plus the government units master address file. Complete enumeration of civilian employment in state and local governments in the 50 states + D.C. The Annual Survey of Public Employment & Payroll becomes a census in years ending with 2 and 7. Administered and financed by the US Census Bureau. 
Recommended Reading Two Methodology Documents: 2022 Census of Governments, Survey of Public Employment & Payroll Methodology Government Units Survey Methodology One Haiku: # courthouse steps wedding # schools police fire water # no fed mail invite Download, Import, Preparation Download, import, and stack the government units listing file: library(readxl) tf_gus <- tempfile() gus_url <- "https://www2.census.gov/programs-surveys/gus/datasets/2022/govt_units_2022.ZIP" download.file( gus_url , tf_gus , mode = 'wb' ) unzipped_files_gus <- unzip( tf_gus , exdir = tempdir() ) xlsx_gus_fn <- grep( "\\\\.xlsx$" , unzipped_files_gus , value = TRUE ) xlsx_sheets <- excel_sheets( xlsx_gus_fn ) # read all sheets into a list of tibbles gus_tbl_list <- lapply( xlsx_sheets , function( w ) read_excel( xlsx_gus_fn , sheet = w ) ) # convert all tibbles to data.frame objects gus_df_list <- lapply( gus_tbl_list , data.frame ) # lowercase all column names gus_df_list <- lapply( gus_df_list , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) # add the excel tab source to each data.frame for( i in seq( xlsx_sheets ) ) gus_df_list[[ i ]][ , 'source_tab' ] <- xlsx_sheets[ i ] # determine which columns are in all tables column_intersect <- Reduce( intersect , lapply( gus_df_list , names ) ) # determine which columns are in some but not all tables column_union <- unique( unlist( lapply( gus_df_list , names ) ) ) # these columns will be discarded by stacking: unique( unlist( lapply( lapply( gus_df_list , names ) , function( w ) column_union[ !column_union %in% w ] ) ) ) # stack all excel sheets, keeping only the columns that all tables have in common gus_df <- Reduce( rbind , lapply( gus_df_list , function( w ) w[ column_intersect ] ) ) Download and import the survey of public employment & payroll, one record per function (not per unit): tf_apes <- tempfile() apes_url <- paste0( "https://www2.census.gov/programs-surveys/apes/datasets/" , "2022/2022%20COG-E%20Individual%20Unit%20Files.zip" ) download.file( apes_url , tf_apes , mode = 'wb' ) unzipped_files_apes <- unzip( tf_apes , exdir = tempdir() ) xlsx_apes_fn <- grep( "\\\\.xlsx$" , unzipped_files_apes , value = TRUE ) apes_tbl <- read_excel( xlsx_apes_fn ) apes_df <- data.frame( apes_tbl ) names( apes_df ) <- tolower( names( apes_df ) ) Review the non-matching records between these two tables, then merge: # all DEP School Districts and a third of Special Districts are not in the `apes_df` table( gus_df[ , 'census_id_gidid' ] %in% apes_df[ , 'individual.unit.id' ] , gus_df[ , 'source_tab' ] , useNA = 'always' ) # state governments are not in the `gus_df` table( apes_df[ , 'individual.unit.id' ] %in% gus_df[ , 'census_id_gidid' ] , apes_df[ , 'type.of.government' ] , useNA = 'always' ) # check for overlapping field names: ( overlapping_names <- intersect( names( apes_df ) , names( gus_df ) ) ) # rename the state column in `gus_df` to state abbreviation names( gus_df )[ names( gus_df ) == 'state' ] <- 'stateab' double_df <- merge( apes_df , gus_df , by.x = 'individual.unit.id' , by.y = 'census_id_gidid' , all.x = TRUE ) stopifnot( nrow( double_df ) == nrow( apes_df ) ) # replace dots with underscores names( double_df ) <- gsub( "\\\\." 
, "_" , names( double_df ) ) Keep either the one record per agency rows or the one record per function rows: # `Total - All Government Employment Functions` records sum to the same as all other records: with( double_df , tapply( full_time_employees , grepl( "Total" , government_function ) , sum ) ) with( double_df , tapply( part_time_payroll , grepl( "Total" , government_function ) , sum ) ) # keep one record per government function (multiple records per agency): cog_df <- subset( double_df , !grepl( "Total" , government_function ) ) # keep one record per government agency: # cog_df <- subset( double_df , grepl( "Total" , government_function ) ) Save Locally   Save the object at any point: # cog_fn <- file.path( path.expand( "~" ) , "COG" , "this_file.rds" ) # saveRDS( cog_df , file = cog_fn , compress = FALSE ) Load the same object: # cog_df <- readRDS( cog_fn ) Variable Recoding Add new columns to the data set: cog_df <- transform( cog_df , one = 1 , total_payroll = full_time_payroll + part_time_payroll , total_employees = full_time_employees + part_time_employees , any_full_time_employees = full_time_employees > 0 ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( cog_df ) table( cog_df[ , "type_of_government" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( cog_df[ , "full_time_employees" ] ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , mean ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( cog_df[ , "census_region" ] ) ) prop.table( table( cog_df[ , c( "census_region" , "type_of_government" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( cog_df[ , "full_time_employees" ] ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , sum ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( cog_df[ , "full_time_employees" ] , 0.5 ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , quantile , 0.5 ) Subsetting Limit your data.frame to Elementary, Secondary, Higher, and Other Educational Government Agencies: sub_cog_df <- subset( cog_df , grepl( 'Education' , government_function ) ) Calculate the mean (average) of this subset: mean( sub_cog_df[ , "full_time_employees" ] ) Measures of Uncertainty Calculate the variance, overall and by groups: var( cog_df[ , "full_time_employees" ] ) tapply( cog_df[ , "full_time_employees" ] , cog_df[ , "type_of_government" ] , var ) Regression Models and Tests of Association Perform a t-test: t.test( full_time_employees ~ any_full_time_employees , cog_df ) Perform a chi-squared test of association: this_table <- table( cog_df[ , c( "any_full_time_employees" , "census_region" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( full_time_employees ~ any_full_time_employees + census_region , data = cog_df ) summary( glm_result ) Replication Example This example matches excel cell “C17” of Employment & Payroll Data by State and by Function: financial_admin_df <- subset( cog_df , government_function == 'Financial Administration' ) stopifnot( sum( financial_admin_df[ , 'full_time_employees' ] ) == 401394 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. 
dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for COG users, this code replicates previously-presented examples: library(dplyr) cog_tbl <- as_tibble( cog_df ) Calculate the mean (average) of a linear variable, overall and by groups: cog_tbl %>% summarize( mean = mean( full_time_employees ) ) cog_tbl %>% group_by( type_of_government ) %>% summarize( mean = mean( full_time_employees ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for COG users, this code replicates previously-presented examples: library(data.table) cog_dt <- data.table( cog_df ) Calculate the mean (average) of a linear variable, overall and by groups: cog_dt[ , mean( full_time_employees ) ] cog_dt[ , mean( full_time_employees ) , by = type_of_government ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for COG users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'cog' , cog_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( full_time_employees ) FROM cog' ) dbGetQuery( con , 'SELECT type_of_government , AVG( full_time_employees ) FROM cog GROUP BY type_of_government' ) "],["current-population-survey-cps.html", "Current Population Survey (CPS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Current Population Survey (CPS) The principal labor force survey, providing income, poverty, and health insurance coverage estimates. One table with one row per sampled household, a second table with one row per family within each sampled household, and a third table with one row per individual within each of those families. A complex sample designed to generalize to the civilian non-institutional population of the US. Released annually since 1998, linkable to the Basic Monthly releases. Administered jointly by the US Census Bureau and the Bureau of Labor Statistics. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed labor force categorizations ✔️ Transparent methodological changes ❌ Retirement and investment income undercount ❌ Informal worker undercount Three Example Findings: In 2024, 57% of 18 to 24 year olds and 16% of 25 to 34 year olds lived in their parental home. The ratio of working poor to all individuals in the labor force for at least 27 weeks was 4% in 2022. Between 2022 and 2023, the share of children without health coverage rose from 5.4% to 5.8%. 
Two Methodology Documents: Current Population Survey 2024 Annual Social and Economic (ASEC) Supplement Wikipedia Entry One Haiku: # jobs robbed by robot # luddite rebellion looming # blue, due to red pill Download, Import, Preparation Download and unzip the 2024 file: library(httr) tf <- tempfile() this_url <- "https://www2.census.gov/programs-surveys/cps/datasets/2024/march/asecpub24sas.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import all four files: library(haven) four_tbl <- lapply( unzipped_files , read_sas ) four_df <- lapply( four_tbl , data.frame ) four_df <- lapply( four_df , function( w ){ names( w ) <- tolower( names( w ) ) ; w } ) household_df <- four_df[[ grep( 'hhpub' , basename( unzipped_files ) ) ]] family_df <- four_df[[ grep( 'ffpub' , basename( unzipped_files ) ) ]] person_df <- four_df[[ grep( 'pppub' , basename( unzipped_files ) ) ]] repwgts_df <- four_df[[ grep( 'repwgt' , basename( unzipped_files ) ) ]] Divide weights: household_df[ , 'hsup_wgt' ] <- household_df[ , 'hsup_wgt' ] / 100 family_df[ , 'fsup_wgt' ] <- family_df[ , 'fsup_wgt' ] / 100 for ( j in c( 'marsupwt' , 'a_ernlwt' , 'a_fnlwgt' ) ) person_df[ , j ] <- person_df[ , j ] / 100 Merge these four files: names( family_df )[ names( family_df ) == 'fh_seq' ] <- 'h_seq' names( person_df )[ names( person_df ) == 'ph_seq' ] <- 'h_seq' names( person_df )[ names( person_df ) == 'phf_seq' ] <- 'ffpos' hh_fm_df <- merge( household_df , family_df ) hh_fm_pr_df <- merge( hh_fm_df , person_df ) cps_df <- merge( hh_fm_pr_df , repwgts_df ) stopifnot( nrow( cps_df ) == nrow( person_df ) ) Save Locally   Save the object at any point: # cps_fn <- file.path( path.expand( "~" ) , "CPS" , "this_file.rds" ) # saveRDS( cps_df , file = cps_fn , compress = FALSE ) Load the same object: # cps_df <- readRDS( cps_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) cps_design <- svrepdesign( weights = ~ marsupwt , repweights = "pwwgt[1-9]" , type = "Fay" , rho = ( 1 - 1 / sqrt( 4 ) ) , data = cps_df , combined.weights = TRUE , mse = TRUE ) Variable Recoding Add new columns to the data set: cps_design <- update( cps_design , one = 1 , a_maritl = factor( a_maritl , labels = c( "married - civilian spouse present" , "married - AF spouse present" , "married - spouse absent" , "widowed" , "divorced" , "separated" , "never married" ) ) , state_name = factor( gestfips , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming") ) , male = as.numeric( a_sex == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by 
groups: sum( weights( cps_design , "sampling" ) != 0 ) svyby( ~ one , ~ state_name , cps_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , cps_design ) svyby( ~ one , ~ state_name , cps_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ ptotval , cps_design ) svyby( ~ ptotval , ~ state_name , cps_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ a_maritl , cps_design ) svyby( ~ a_maritl , ~ state_name , cps_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ ptotval , cps_design ) svyby( ~ ptotval , ~ state_name , cps_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ a_maritl , cps_design ) svyby( ~ a_maritl , ~ state_name , cps_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ ptotval , cps_design , 0.5 ) svyby( ~ ptotval , ~ state_name , cps_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ moop , denominator = ~ ptotval , cps_design ) Subsetting Restrict the survey design to persons aged 18-64: sub_cps_design <- subset( cps_design , a_age %in% 18:64 ) Calculate the mean (average) of this subset: svymean( ~ ptotval , sub_cps_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ ptotval , cps_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ ptotval , ~ state_name , cps_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( cps_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ ptotval , cps_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ ptotval , cps_design , deff = TRUE ) # SRS with replacement svymean( ~ ptotval , cps_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ male , cps_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( ptotval ~ male , cps_design ) Perform a chi-squared test of association for survey data: svychisq( ~ male + a_maritl , cps_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( ptotval ~ male + a_maritl , cps_design ) summary( glm_result ) Replication Example This example matches the count and share of individuals with health insurance in Table H-01: count_covered <- svytotal( ~ as.numeric( cov == 1 ) , cps_design ) stopifnot( round( coef( count_covered ) , -5 ) == 305200000 ) stopifnot( round( coef( count_covered ) - confint( count_covered , level = 0.9 )[1] , -3 ) == 704000 ) share_covered <- svymean( ~ as.numeric( cov == 1 ) , subset( cps_design , cov > 0 ) ) stopifnot( round( coef( share_covered ) , 3 ) == 0.920 ) stopifnot( round( coef( share_covered ) - confint( share_covered , level = 0.9 )[1] , 3 ) == 0.002 ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for CPS users, this code calculates the gini coefficient on complex sample survey data: library(convey) cps_design <- convey_prep( cps_design ) cps_household_design <- subset( cps_design , a_exprrp %in% 1:2 ) svygini( ~ htotval , cps_household_design ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for CPS users, this code replicates previously-presented examples: library(srvyr) cps_srvyr_design <- as_survey( cps_design ) Calculate the mean (average) of a linear variable, overall and by groups: cps_srvyr_design %>% summarize( mean = survey_mean( ptotval ) ) cps_srvyr_design %>% group_by( state_name ) %>% summarize( mean = survey_mean( ptotval ) ) "],["exame-nacional-de-desempenho-de-estudantes-enade.html", "Exame Nacional de Desempenho de Estudantes (ENADE) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Exame Nacional de Desempenho de Estudantes (ENADE) The nationwide mandatory examination of college graduates. One table with one row per individual undergraduate student in Brazil. An enumeration of undergraduate students in Brazil. Released annually since 2004. Compiled by the Instituto Nacional de Estudos e Pesquisas Educacionais Anísio Teixeira (INEP). 
Recommended Reading Two Methodology Documents: Cálculo da nota final do Exame Nacional de Desempenho dos Estudantes Wikipedia Entry One Haiku: # undergraduates # sit for standardized testing # exit interview Download, Import, Preparation Download, import, and merge two of the 2021 files: library(httr) library(archive) tf <- tempfile() this_url <- "https://download.inep.gov.br/microdados/microdados_enade_2021.zip" GET( this_url , write_disk( tf ) , progress() ) archive_extract( tf , dir = tempdir() ) read_enade_archive <- function( this_regular_expression , this_directory ){ this_filename <- grep( this_regular_expression , list.files( this_directory , recursive = TRUE , full.names = TRUE ) , value = TRUE ) this_df <- read.table( this_filename , header = TRUE , sep = ";" , na.strings = "" ) names( this_df ) <- tolower( names( this_df ) ) this_df } arq1_df <- read_enade_archive( 'arq1\\\\.txt$' , tempdir() ) arq1_df <- unique( arq1_df[ c( 'co_curso' , 'co_uf_curso' , 'co_categad' , 'co_grupo' ) ] ) arq3_df <- read_enade_archive( 'arq3\\\\.txt$' , tempdir() ) enade_df <- merge( arq3_df , arq1_df ) stopifnot( nrow( enade_df ) == nrow( arq3_df ) ) Save Locally   Save the object at any point: # enade_fn <- file.path( path.expand( "~" ) , "ENADE" , "this_file.rds" ) # saveRDS( enade_df , file = enade_fn , compress = FALSE ) Load the same object: # enade_df <- readRDS( enade_fn ) Variable Recoding Add new columns to the data set: enade_df <- transform( enade_df , # qual foi o tempo gasto por voce para concluir a prova? (how much time did you spend completing the exam?) less_than_two_hours = as.numeric( co_rs_i9 %in% c( 'A' , 'B' ) ) , administrative_category = factor( co_categad , levels = c( 1:5 , 7 ) , labels = c( '1. Pública Federal' , '2. Pública Estadual' , '3. Pública Municipal' , '4. Privada com fins lucrativos' , '5. Privada sem fins lucrativos' , '7.
Especial' ) ) , state_name = factor( co_uf_curso , levels = c( 11:17 , 21:29 , 31:33 , 35 , 41:43 , 50:53 ) , labels = c( "Rondonia" , "Acre" , "Amazonas" , "Roraima" , "Para" , "Amapa" , "Tocantins" , "Maranhao" , "Piaui" , "Ceara" , "Rio Grande do Norte" , "Paraiba" , "Pernambuco" , "Alagoas" , "Sergipe" , "Bahia" , "Minas Gerais" , "Espirito Santo" , "Rio de Janeiro" , "Sao Paulo" , "Parana" , "Santa Catarina" , "Rio Grande do Sul" , "Mato Grosso do Sul" , "Mato Grosso" , "Goias" , "Distrito Federal" ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( enade_df ) table( enade_df[ , "administrative_category" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( enade_df[ , "state_name" ] ) ) prop.table( table( enade_df[ , c( "state_name" , "administrative_category" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( enade_df[ , "nt_obj_fg" ] , 0.5 , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to students reporting that the general training section was easy or very easy: sub_enade_df <- subset( enade_df , co_rs_i1 %in% c( "A" , "B" ) ) Calculate the mean (average) of this subset: mean( sub_enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( enade_df[ , "nt_obj_fg" ] , na.rm = TRUE ) tapply( enade_df[ , "nt_obj_fg" ] , enade_df[ , "administrative_category" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( nt_obj_fg ~ less_than_two_hours , enade_df ) Perform a chi-squared test of association: this_table <- table( enade_df[ , c( "less_than_two_hours" , "state_name" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( nt_obj_fg ~ less_than_two_hours + state_name , data = enade_df ) summary( glm_result ) Replication Example This example matches the tecnologia em gestão da tecnologia da informação test scores on PDF page 48 of the 2021 final results document: it_students <- subset( enade_df , co_grupo %in% 6409 ) results <- sapply( it_students[ c( 'nt_fg' , 'nt_ce' , 'nt_ger' ) ] , mean , na.rm = TRUE ) stopifnot( round( results[ 'nt_fg' ] , 1 ) == 30.4 ) stopifnot( round( results[ 'nt_ce' ] , 1 ) == 38.2 ) stopifnot( round( results[ 'nt_ger' ] , 1 ) == 36.3 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. 
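To see that grammar in isolation before applying it to ENADE, here is a minimal sketch on a toy table (all object names below are hypothetical, not ENADE variables):
library(dplyr)
# toy data: three records across two groups
toy_tbl <- tibble( grp = c( 'a' , 'a' , 'b' ) , score = c( 1 , 2 , 4 ) )
# pipe into group_by() and summarize() to aggregate within groups
toy_tbl %>% group_by( grp ) %>% summarize( mean_score = mean( score ) )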
As a starting point for ENADE users, this code replicates previously-presented examples: library(dplyr) enade_tbl <- as_tibble( enade_df ) Calculate the mean (average) of a linear variable, overall and by groups: enade_tbl %>% summarize( mean = mean( nt_obj_fg , na.rm = TRUE ) ) enade_tbl %>% group_by( administrative_category ) %>% summarize( mean = mean( nt_obj_fg , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for ENADE users, this code replicates previously-presented examples: library(data.table) enade_dt <- data.table( enade_df ) Calculate the mean (average) of a linear variable, overall and by groups: enade_dt[ , mean( nt_obj_fg , na.rm = TRUE ) ] enade_dt[ , mean( nt_obj_fg , na.rm = TRUE ) , by = administrative_category ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for ENADE users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'enade' , enade_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( nt_obj_fg ) FROM enade' ) dbGetQuery( con , 'SELECT administrative_category , AVG( nt_obj_fg ) FROM enade GROUP BY administrative_category' ) "],["exame-nacional-do-ensino-medio-enem.html", "Exame Nacional do Ensino Medio (ENEM) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Exame Nacional do Ensino Medio (ENEM) The national student aptitude test, used to assess high school completion and university admission. One table with one row per test-taking student, plus a second table with one row per study habit questionnaire respondent. Updated annually since 1998. Maintained by Brazil’s Instituto Nacional de Estudos e Pesquisas Educacionais Anisio Teixeira. Recommended Reading Two Methodology Documents: Leia_Me_Enem included in each annual zipped file Wikipedia Entry One Haiku: # graduation stage # shake hands, toss cap, unroll scroll, # mais um exame? 
Download, Import, Preparation Download and unzip the 2022 file: library(httr) library(archive) tf <- tempfile() this_url <- "https://download.inep.gov.br/microdados/microdados_enem_2022.zip" GET( this_url , write_disk( tf ) , progress() ) archive_extract( tf , dir = tempdir() ) Import the 2022 file: library(readr) enem_fns <- list.files( tempdir() , recursive = TRUE , full.names = TRUE ) enem_fn <- grep( "MICRODADOS_ENEM_([0-9][0-9][0-9][0-9])\\\\.csv$" , enem_fns , value = TRUE ) enem_tbl <- read_csv2( enem_fn , locale = locale( encoding = 'latin1' ) ) enem_df <- data.frame( enem_tbl ) names( enem_df ) <- tolower( names( enem_df ) ) Save Locally   Save the object at any point: # enem_fn <- file.path( path.expand( "~" ) , "ENEM" , "this_file.rds" ) # saveRDS( enem_df , file = enem_fn , compress = FALSE ) Load the same object: # enem_df <- readRDS( enem_fn ) Variable Recoding Add new columns to the data set: enem_df <- transform( enem_df , domestic_worker = as.numeric( q007 %in% c( 'B' , 'C' , 'D' ) ) , administrative_category = factor( tp_dependencia_adm_esc , levels = 1:4 , labels = c( 'Federal' , 'Estadual' , 'Municipal' , 'Privada' ) ) , state_name = factor( co_uf_esc , levels = c( 11:17 , 21:29 , 31:33 , 35 , 41:43 , 50:53 ) , labels = c( "Rondonia" , "Acre" , "Amazonas" , "Roraima" , "Para" , "Amapa" , "Tocantins" , "Maranhao" , "Piaui" , "Ceara" , "Rio Grande do Norte" , "Paraiba" , "Pernambuco" , "Alagoas" , "Sergipe" , "Bahia" , "Minas Gerais" , "Espirito Santo" , "Rio de Janeiro" , "Sao Paulo" , "Parana" , "Santa Catarina" , "Rio Grande do Sul" , "Mato Grosso do Sul" , "Mato Grosso" , "Goias" , "Distrito Federal" ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( enem_df ) table( enem_df[ , "administrative_category" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( enem_df[ , "state_name" ] ) ) prop.table( table( enem_df[ , c( "state_name" , "administrative_category" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( enem_df[ , "nu_nota_mt" ] , 0.5 , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to students whose mothers graduated from high school: sub_enem_df <- subset( enem_df , q002 %in% c( 'E' , 'F' , 'G' ) ) Calculate the mean (average) of this subset: mean( sub_enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( enem_df[ , "nu_nota_mt" ] , na.rm = TRUE ) tapply( enem_df[ , "nu_nota_mt" ] , enem_df[ , "administrative_category" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( nu_nota_mt ~ domestic_worker , enem_df ) Perform a chi-squared test of association: this_table <- table( enem_df[ , c( "domestic_worker" , "state_name" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm(
nu_nota_mt ~ domestic_worker + state_name , data = enem_df ) summary( glm_result ) Replication Example This example matches the registration counts in the Sinopse ENEM 2022 Excel table: stopifnot( nrow( enem_df ) == 3476105 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for ENEM users, this code replicates previously-presented examples: library(dplyr) enem_tbl <- as_tibble( enem_df ) Calculate the mean (average) of a linear variable, overall and by groups: enem_tbl %>% summarize( mean = mean( nu_nota_mt , na.rm = TRUE ) ) enem_tbl %>% group_by( administrative_category ) %>% summarize( mean = mean( nu_nota_mt , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for ENEM users, this code replicates previously-presented examples: library(data.table) enem_dt <- data.table( enem_df ) Calculate the mean (average) of a linear variable, overall and by groups: enem_dt[ , mean( nu_nota_mt , na.rm = TRUE ) ] enem_dt[ , mean( nu_nota_mt , na.rm = TRUE ) , by = administrative_category ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for ENEM users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'enem' , enem_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( nu_nota_mt ) FROM enem' ) dbGetQuery( con , 'SELECT administrative_category , AVG( nu_nota_mt ) FROM enem GROUP BY administrative_category' ) "],["european-social-survey-ess.html", "European Social Survey (ESS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " European Social Survey (ESS) The barometer of political opinion and behavior across the continent. One table per country with one row per sampled respondent. A complex sample designed to generalize to residents aged 15 and older in participating nations. Released biennially since 2002. Headquartered at City, University of London and governed by a scientific team across Europe. Recommended Reading Four Example Strengths & Limitations: ✔️ Rotating modules allow external researchers to propose new questions ✔️ Sub-national geographies available ❌ Country-specific differences in methodology ❌ Questionnaires only translated into languages spoken by at least 5% of each country’s population Three Example Findings: Childless adults aged 65 to 74 in 2002 were not more socially excluded than those in 2018. 
Between 2002-2003 and 2016-2017, there was little change overall in the extent to which Europeans felt that their countries were made a better or worse place to live as a result of migration. The 2022 Russian invasion of Ukraine reduced authoritarian attitudes across eight nations. Two Methodology Documents: Findings from the European Social Survey Wikipedia Entry One Haiku: # pent up belief gauge # open border monarchists # survey for your thoughts Download, Import, Preparation Register at the ESS Data Portal at https://ess-search.nsd.no/. Choose ESS round 8 - 2016. Welfare attitudes, Attitudes to climate change. Download the integrated file and also the sample design (SDDF) files as SAV (SPSS) files: library(foreign) ess_int_df <- read.spss( file.path( path.expand( "~" ) , "ESS8e02_2.sav" ) , to.data.frame = TRUE , use.value.labels = FALSE ) ess_sddf_df <- read.spss( file.path( path.expand( "~" ) , "ESS8SDDFe01_1.sav" ) , to.data.frame = TRUE , use.value.labels = FALSE ) ess_df <- merge( ess_int_df , ess_sddf_df , by = c( 'cntry' , 'idno' ) ) stopifnot( nrow( ess_df ) == nrow( ess_int_df ) ) Save Locally   Save the object at any point: # ess_fn <- file.path( path.expand( "~" ) , "ESS" , "this_file.rds" ) # saveRDS( ess_df , file = ess_fn , compress = FALSE ) Load the same object: # ess_df <- readRDS( ess_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) ess_df[ , 'anweight' ] <- ess_df[ , 'pspwght' ] * ess_df[ , 'pweight' ] * 10000 ess_design <- svydesign( ids = ~psu , strata = ~stratum , weights = ~anweight , data = ess_df , nest = TRUE ) Variable Recoding Add new columns to the data set: ess_design <- update( ess_design , one = 1 , gndr = factor( gndr , labels = c( 'male' , 'female' ) ) , netusoft = factor( netusoft , levels = 1:5 , labels = c( 'Never' , 'Only occasionally' , 'A few times a week' , 'Most days' , 'Every day' ) ) , belonging_to_particular_religion = as.numeric( rlgblg == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( ess_design , "sampling" ) != 0 ) svyby( ~ one , ~ cntry , ess_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , ess_design ) svyby( ~ one , ~ cntry , ess_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ ppltrst , ess_design , na.rm = TRUE ) svyby( ~ ppltrst , ~ cntry , ess_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ gndr , ess_design , na.rm = TRUE ) svyby( ~ gndr , ~ cntry , ess_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ ppltrst , ess_design , na.rm = TRUE ) svyby( ~ ppltrst , ~ cntry , ess_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ gndr , ess_design , na.rm = TRUE ) svyby( ~ gndr , ~ cntry , ess_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ ppltrst , ess_design , 0.5 , na.rm = TRUE ) svyby( ~ ppltrst , ~ cntry , ess_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ ppltrst , denominator = ~ pplfair , ess_design , na.rm = TRUE ) Subsetting 
Restrict the survey design to voters: sub_ess_design <- subset( ess_design , vote == 1 ) Calculate the mean (average) of this subset: svymean( ~ ppltrst , sub_ess_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ ppltrst , ess_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ ppltrst , ~ cntry , ess_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ess_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ ppltrst , ess_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ ppltrst , ess_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ ppltrst , ess_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ belonging_to_particular_religion , ess_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( ppltrst ~ belonging_to_particular_religion , ess_design ) Perform a chi-squared test of association for survey data: svychisq( ~ belonging_to_particular_religion + gndr , ess_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( ppltrst ~ belonging_to_particular_religion + gndr , ess_design ) summary( glm_result ) Replication Example This example matches statistics and confidence intervals within 0.1% from the Guide to Using Weights and Sample Design Indicators with ESS Data: published_proportions <- c( 0.166 , 0.055 , 0.085 , 0.115 , 0.578 ) published_lb <- c( 0.146 , 0.045 , 0.072 , 0.099 , 0.550 ) published_ub <- c( 0.188 , 0.068 , 0.100 , 0.134 , 0.605 ) austrians <- subset( ess_design , cntry == 'AT' ) ( results <- svymean( ~ netusoft , austrians , na.rm = TRUE ) ) stopifnot( all( round( coef( results ) , 3 ) == published_proportions ) ) ( ci_results <- confint( results ) ) stopifnot( all( abs( ci_results[ , 1 ] - published_lb ) < 0.0015 ) ) stopifnot( all( abs( ci_results[ , 2 ] - published_ub ) < 0.0015 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
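To see the syntax on its own before applying it to ESS, here is a minimal sketch on a toy self-weighting design (all object names below are hypothetical, not ESS variables):
library(survey)
library(srvyr)
# toy data with an arbitrary weight column
toy_df <- data.frame( score = c( 1 , 2 , 4 , 8 ) , wt = c( 1 , 1 , 2 , 2 ) )
# wrap a survey design object so dplyr-like verbs work on it
toy_design <- as_survey( svydesign( ids = ~ 1 , weights = ~ wt , data = toy_df ) )
# survey_mean() returns the weighted mean and its standard error
toy_design %>% summarize( mean_score = survey_mean( score ) )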
As a starting point for ESS users, this code replicates previously-presented examples: library(srvyr) ess_srvyr_design <- as_survey( ess_design ) Calculate the mean (average) of a linear variable, overall and by groups: ess_srvyr_design %>% summarize( mean = survey_mean( ppltrst , na.rm = TRUE ) ) ess_srvyr_design %>% group_by( cntry ) %>% summarize( mean = survey_mean( ppltrst , na.rm = TRUE ) ) "],["fda-adverse-event-reporting-system-faers.html", "FDA Adverse Event Reporting System (FAERS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " FDA Adverse Event Reporting System (FAERS) The post-marketing safety surveillance program for drug and therapeutic biological products. Multiple tables linked by primaryid including demographics, outcomes, drug start and end dates. Voluntary reports from practitioners and patients, not representative, no verification of causality. Published quarterly since 2004, file structure revisions at 2012Q4 and 2014Q3. Maintained by the United States Food and Drug Administration (FDA). Recommended Reading Two Methodology Documents: ASC_NTS.DOC included in each quarterly zipped file, especially the Entity Relationship Diagram Questions and Answers on FDA’s Adverse Event Reporting System (FAERS) One Haiku: # side effect guestbook # violet you're turning violet # vi'lent dose response Function Definitions Define a function to import each text file: read_faers <- function( this_fn ){ read.table( this_fn , sep = "$" , header = TRUE , comment.char = "" , quote = "" ) } Download, Import, Preparation Download the quarterly file: library(httr) tf <- tempfile() this_url <- "https://fis.fda.gov/content/Exports/faers_ascii_2023q1.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import multiple tables from the downloaded quarter of microdata: # one record per report faers_demo_df <- read_faers( grep( 'DEMO23Q1\\\\.txt$' , unzipped_files , value = TRUE ) ) # one or more record per report faers_drug_df <- read_faers( grep( 'DRUG23Q1\\\\.txt$' , unzipped_files , value = TRUE ) ) # zero or more records per report faers_outcome_df <- read_faers( grep( 'OUTC23Q1\\\\.txt$' , unzipped_files , value = TRUE ) ) Construct an analysis file limited to reported deaths: # limit the outcome file to deaths faers_deaths_df <- subset( faers_outcome_df , outc_cod == 'DE' ) # merge demographics with each reported death faers_df <- merge( faers_demo_df , faers_deaths_df ) # confirm that the analysis file matches the number of death outcomes stopifnot( nrow( faers_deaths_df ) == nrow( faers_df ) ) # confirm zero reports include multiple deaths from the same reported adverse event stopifnot( nrow( faers_df ) == length( unique( faers_df[ , 'primaryid' ] ) ) ) Save Locally   Save the object at any point: # faers_fn <- file.path( path.expand( "~" ) , "FAERS" , "this_file.rds" ) # saveRDS( faers_df , file = faers_fn , compress = FALSE ) Load the same object: # faers_df <- readRDS( faers_fn ) Variable Recoding Add new columns to the data set: faers_df <- transform( faers_df , physician_reported = as.numeric( occp_cod == "MD" ) , reporter_country_categories = ifelse( reporter_country == 'US' , 'USA' , ifelse( reporter_country == 'COUNTRY NOT SPECIFIED' , 'missing' , ifelse( reporter_country == 'JP' , 'Japan' , ifelse( reporter_country == 'UK' , 'UK' , ifelse( reporter_country == 
'CA' , 'Canada' , ifelse( reporter_country == 'FR' , 'France' , 'Other' ) ) ) ) ) ) , init_fda_year = as.numeric( substr( init_fda_dt , 1 , 4 ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( faers_df ) table( faers_df[ , "reporter_country_categories" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( faers_df[ , "init_fda_year" ] , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( faers_df[ , "sex" ] ) ) prop.table( table( faers_df[ , c( "sex" , "reporter_country_categories" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( faers_df[ , "init_fda_year" ] , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( faers_df[ , "init_fda_year" ] , 0.5 , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to elderly persons: sub_faers_df <- subset( faers_df , age_grp == "E" ) Calculate the mean (average) of this subset: mean( sub_faers_df[ , "init_fda_year" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( faers_df[ , "init_fda_year" ] , na.rm = TRUE ) tapply( faers_df[ , "init_fda_year" ] , faers_df[ , "reporter_country_categories" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( init_fda_year ~ physician_reported , faers_df ) Perform a chi-squared test of association: this_table <- table( faers_df[ , c( "physician_reported" , "sex" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( init_fda_year ~ physician_reported + sex , data = faers_df ) summary( glm_result ) Replication Example This example matches the death frequency counts in the OUTC23Q1.pdf file in the downloaded quarter: stopifnot( nrow( faers_df ) == 37704 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for FAERS users, this code replicates previously-presented examples: library(dplyr) faers_tbl <- as_tibble( faers_df ) Calculate the mean (average) of a linear variable, overall and by groups: faers_tbl %>% summarize( mean = mean( init_fda_year , na.rm = TRUE ) ) faers_tbl %>% group_by( reporter_country_categories ) %>% summarize( mean = mean( init_fda_year , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. 
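To see the bracket syntax on its own before applying it to FAERS, here is a minimal sketch on a toy table (all object names below are hypothetical, not FAERS variables):
library(data.table)
# toy data: three records across two groups
toy_dt <- data.table( grp = c( 'a' , 'a' , 'b' ) , score = c( 1 , 2 , 4 ) )
# within the brackets: j computes an aggregate, by sets the grouping column
toy_dt[ , .( mean_score = mean( score ) ) , by = grp ]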
As a starting point for FAERS users, this code replicates previously-presented examples: library(data.table) faers_dt <- data.table( faers_df ) Calculate the mean (average) of a linear variable, overall and by groups: faers_dt[ , mean( init_fda_year , na.rm = TRUE ) ] faers_dt[ , mean( init_fda_year , na.rm = TRUE ) , by = reporter_country_categories ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for FAERS users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'faers' , faers_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( init_fda_year ) FROM faers' ) dbGetQuery( con , 'SELECT reporter_country_categories , AVG( init_fda_year ) FROM faers GROUP BY reporter_country_categories' ) "],["general-social-survey-gss.html", "General Social Survey (GSS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " General Social Survey (GSS) A historical record of the concerns, experiences, attitudes, and practices of residents of the United States. Both cross-sectional and panel tables with one row per sampled respondent. A complex sample survey generalizing to non-institutionalized adults (18+) in the United States. Updated biennially since 1972. Funded by National Science Foundation, administered by the National Opinion Research Center. Recommended Reading Four Example Strengths & Limitations: ✔️ Fifty years of comparable measures for trend analyses ✔️ Fifteen minutes of questionnaire aligns with International Social Survey Programme ❌ One adult per household interviewed, living in larger households lowers probability of selection ❌ In 2022, 21% of mixed-mode interviews were aged 65+ versus 10% of the web-only oversample Three Example Findings: Between 2000 and 2021, confidence in the scientific community has remained steady. During 2018 to 2021, support for gun control was overwhelmingly positive among survey respondents, differing from sentiment analysis of social media data over the same period. In 2021, 24% of Americans reported they were “not too happy” in life, up from 13% in 2018. Two Methodology Documents: DOCUMENTATION AND PUBLIC USE FILE CODEBOOK (Release 1) Wikipedia Entry One Haiku: # chat about who will # be allowed marriage, children. 
# first date questionnaire Download, Import, Preparation Download and import the 1972-2022 cumulative data file: library(haven) zip_tf <- tempfile() zip_url <- "https://gss.norc.org/Documents/sas/GSS_sas.zip" download.file( zip_url , zip_tf , mode = 'wb' ) unzipped_files <- unzip( zip_tf , exdir = tempdir() ) gss_tbl <- read_sas( grep( '\\\\.sas7bdat$' , unzipped_files , value = TRUE ) ) gss_df <- data.frame( gss_tbl ) names( gss_df ) <- tolower( names( gss_df ) ) gss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # gss_fn <- file.path( path.expand( "~" ) , "GSS" , "this_file.rds" ) # saveRDS( gss_df , file = gss_fn , compress = FALSE ) Load the same object: # gss_df <- readRDS( gss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) gss_design <- svydesign( ~ vpsu , strata = ~ interaction( year , vstrat ) , data = subset( gss_df , year >= 1975 & !is.na( wtssnrps ) ) , weights = ~ wtssnrps , nest = TRUE ) Variable Recoding Add new columns to the data set: gss_design <- update( gss_design , polviews = factor( polviews , levels = 1:7 , labels = c( "Extremely liberal" , "Liberal" , "Slightly liberal" , "Moderate, middle of the road" , "Slightly conservative" , "Conservative" , "Extremely conservative" ) ) , born_in_usa = as.numeric( born == 1 ) , race = factor( race , levels = 1:3 , labels = c( "white" , "black" , "other" ) ) , region = factor( region , levels = 1:9 , labels = c( "New England" , "Middle Atlantic" , "East North Central" , "West North Central" , "South Atlantic" , "East South Central" , "West South Central" , "Mountain" , "Pacific" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( gss_design , "sampling" ) != 0 ) svyby( ~ one , ~ region , gss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , gss_design ) svyby( ~ one , ~ region , gss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ age , gss_design , na.rm = TRUE ) svyby( ~ age , ~ region , gss_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ race , gss_design , na.rm = TRUE ) svyby( ~ race , ~ region , gss_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ age , gss_design , na.rm = TRUE ) svyby( ~ age , ~ region , gss_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ race , gss_design , na.rm = TRUE ) svyby( ~ race , ~ region , gss_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ age , gss_design , 0.5 , na.rm = TRUE ) svyby( ~ age , ~ region , gss_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ adults , denominator = ~ hompop , gss_design , na.rm = TRUE ) Subsetting Restrict the survey design to females: sub_gss_design <- subset( gss_design , sex == 2 ) Calculate the mean (average) of this subset: svymean( ~ age , sub_gss_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by 
groups: this_result <- svymean( ~ age , gss_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ age , ~ region , gss_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( gss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ age , gss_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ age , gss_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ age , gss_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ born_in_usa , gss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( age ~ born_in_usa , gss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ born_in_usa + race , gss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( age ~ born_in_usa + race , gss_design ) summary( glm_result ) Replication Example Match the unweighted record count totals on PDF page 74 of the Public Use File codebook: stopifnot( nrow( subset( gss_design , year == 2021 ) ) == 4032 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for GSS users, this code replicates previously-presented examples: library(srvyr) gss_srvyr_design <- as_survey( gss_design ) Calculate the mean (average) of a linear variable, overall and by groups: gss_srvyr_design %>% summarize( mean = survey_mean( age , na.rm = TRUE ) ) gss_srvyr_design %>% group_by( region ) %>% summarize( mean = survey_mean( age , na.rm = TRUE ) ) "],["health-and-retirement-study-hrs.html", "Health and Retirement Study (HRS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Health and Retirement Study (HRS) This detailed longitudinal study of the elderly in the United States allows for findings such as, “Among community residents aged 55-64 years old in 1998, what share lived in nursing homes by 2020?” Many tables from different timepoints, most with one row per sampled respondent and linkable. A complex sample survey designed to generalize to Americans aged 50+ at each interview point. Released biennially since 1992. Administered by the University of Michigan’s Institute for Social Research with data management by the RAND Corporation and cross-national harmonization by the University of Southern California. Funded by the National Institute on Aging and the Social Security Administration. 
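Because most HRS tables share respondent-level identifiers, waves can be linked with an ordinary merge. Here is a minimal sketch of that linkage (both toy data.frames below are invented; in the RAND files the person identifier is hhidpn):
# hypothetical extracts from two interview waves
wave_a_df <- data.frame( hhidpn = c( 1 , 2 , 3 ) , working_wave_a = c( 1 , 0 , 1 ) )
wave_b_df <- data.frame( hhidpn = c( 2 , 3 , 4 ) , working_wave_b = c( 0 , 1 , 1 ) )
# an inner join keeps only respondents observed at both timepoints
merge( wave_a_df , wave_b_df , by = 'hhidpn' )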
Recommended Reading Four Example Strengths & Limitations: ✔️ Multiple cohorts allow extended tracking of older individuals across time ✔️ Linkable to Medicare, Medicaid, SSA records, also to genetic and biomarker data ❌ Sample size may prevent analysis of smaller populations or rare events ❌ Attritors may differ in many ways from the general population Three Example Findings: Among individuals aged 50+ between 2003 and 2016, 80% of those who experienced a long-term care hospital stay subsequently died or suffered a severe impairment within 30 months. Wealth loss of 75%+ was negatively associated with subsequent cognitive function over 2012-2016. The total monetary cost of dementia in 2010 was between $157 billion and $215 billion. Two Methodology Documents: Getting Started with the Health and Retirement Study RAND HRS Longitudinal File 2020 (V1) Documentation One Haiku: # sankey diagram # comes alive at fifty five # till death? you respond Download, Import, Preparation Register at the HRS Data Portal at https://hrsdata.isr.umich.edu/user/register. Choose RAND HRS Longitudinal File 2020 Latest release: Mar 2023 (V1). Download the STATA dataset randhrs1992_2020v1_STATA.zip dated 04/05/2023: library(haven) hrs_fn <- file.path( path.expand( "~" ) , "randhrs1992_2020v1.dta" ) hrs_tbl <- read_dta( hrs_fn ) hrs_df <- data.frame( hrs_tbl ) names( hrs_df ) <- tolower( names( hrs_df ) ) Save Locally   Save the object at any point: # hrs_fn <- file.path( path.expand( "~" ) , "HRS" , "this_file.rds" ) # saveRDS( hrs_df , file = hrs_fn , compress = FALSE ) Load the same object: # hrs_df <- readRDS( hrs_fn ) Survey Design Definition Construct a complex sample survey design: This design generalizes to residents of the United States that were living in the community in 1996 (wave 3) and also still alive (and participating in the survey) as of 2020 (wave 15): library(survey) hrs_design <- svydesign( id = ~ raehsamp , strata = ~ raestrat , weights = ~ r3wtresp , nest = TRUE , data = subset( hrs_df , r3wtresp > 0 & inw15 == 1 ) ) Variable Recoding Add new columns to the data set: hrs_design <- update( hrs_design , one = 1 , working_in_1996 = r3work , working_in_2020 = r15work , marital_stat_1996 = factor( r3mstat , levels = 1:8 , labels = c( "Married" , "Married, spouse absent" , "Partnered" , "Separated" , "Divorced" , "Separated/divorced" , "Widowed" , "Never married" ) ) , marital_stat_2020 = factor( r15mstat , levels = 1:8 , labels = c( "Married" , "Married, spouse absent" , "Partnered" , "Separated" , "Divorced" , "Separated/divorced" , "Widowed" , "Never married" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( hrs_design , "sampling" ) != 0 ) svyby( ~ one , ~ marital_stat_1996 , hrs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , hrs_design ) svyby( ~ one , ~ marital_stat_1996 , hrs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ h15ahous , hrs_design , na.rm = TRUE ) svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ marital_stat_2020 , hrs_design , na.rm = TRUE ) svyby( ~ marital_stat_2020 , ~ marital_stat_1996 , hrs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by 
groups: svytotal( ~ h15ahous , hrs_design , na.rm = TRUE ) svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ marital_stat_2020 , hrs_design , na.rm = TRUE ) svyby( ~ marital_stat_2020 , ~ marital_stat_1996 , hrs_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ h15ahous , hrs_design , 0.5 , na.rm = TRUE ) svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ h4ahous , denominator = ~ h15ahous , hrs_design , na.rm = TRUE ) Subsetting Restrict the survey design to respondents working in 1996: sub_hrs_design <- subset( hrs_design , working_in_1996 == 1 ) Calculate the mean (average) of this subset: svymean( ~ h15ahous , sub_hrs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ h15ahous , hrs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ h15ahous , ~ marital_stat_1996 , hrs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( hrs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ h15ahous , hrs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ h15ahous , hrs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ h15ahous , hrs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ working_in_2020 , hrs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( h15ahous ~ working_in_2020 , hrs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ working_in_2020 + marital_stat_2020 , hrs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( h15ahous ~ working_in_2020 + marital_stat_2020 , hrs_design ) summary( glm_result ) Replication Example This example matches statistics and confidence intervals to four digits from the Gateway to Global Aging’s An Introduction to HRS, RAND HRS Longitudinal File, and Harmonized HRS: Navigate to Contributed Projects at https://hrsdata.isr.umich.edu/data-products/contributed-projects. 
Choose Gateway Harmonized HRS Latest release: Aug 2023 Version D Download the STATA dataset H_HRS_d_stata.zip dated 09/12/2023 harmonized_hrs_fn <- file.path( path.expand( "~" ) , "H_HRS_d.dta" ) harmonized_hrs_tbl <- read_dta( harmonized_hrs_fn ) harmonized_hrs_df <- data.frame( harmonized_hrs_tbl ) names( harmonized_hrs_df ) <- tolower( names( harmonized_hrs_df ) ) Merge on cluster and strata variables from the RAND HRS Longitudinal file: harmonized_hrs_rand_df <- merge( harmonized_hrs_df , hrs_df[ c( 'hhid' , 'pn' , 'raestrat' , 'raehsamp' ) ] , by = c( 'hhid' , 'pn' ) ) stopifnot( nrow( harmonized_hrs_rand_df ) == nrow( hrs_df ) ) Limit the survey design to respondents answering at least two of the five different life satisfaction questions in the 2014 (wave 12) psychosocial leave-behind survey: h12sc_df <- subset( harmonized_hrs_rand_df , r12scwtresp > 0 & inw12sc == 1 ) r12sc_design <- svydesign( ~ raehsamp , strata = ~ raestrat , data = h12sc_df , weights = ~ r12scwtresp , nest = TRUE ) Reproduce the coefficient, standard error, and confidence intervals presented at 53:20 of the tutorial: result <- svymean( ~ r12lsatsc , r12sc_design , na.rm = TRUE ) stopifnot( round( coef( result ) , 4 ) == 4.9822 ) stopifnot( round( SE( result ) , 4 ) == 0.0226 ) stopifnot( round( confint( result , df = degf( r12sc_design ) ) , 4 ) == c( 4.9369 , 5.0276 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for HRS users, this code replicates previously-presented examples: library(srvyr) hrs_srvyr_design <- as_survey( hrs_design ) Calculate the mean (average) of a linear variable, overall and by groups: hrs_srvyr_design %>% summarize( mean = survey_mean( h15ahous , na.rm = TRUE ) ) hrs_srvyr_design %>% group_by( marital_stat_1996 ) %>% summarize( mean = survey_mean( h15ahous , na.rm = TRUE ) ) "],["medicare-current-beneficiary-survey-mcbs.html", "Medicare Current Beneficiary Survey (MCBS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Medicare Current Beneficiary Survey (MCBS) The monitoring system for Medicare enrollees in the United States on topics not available in the program’s administrative data, such as out of pocket expenditure and beneficiary satisfaction. Survey and supplemental tables with one row per sampled individual, although downloadable datasets not linkable. A complex sample survey designed to generalize to all elderly and disabled individuals with at least one month of program enrollment during the calendar year. Released annually as a public use file since 2015. Conducted by the Office of Enterprise Data and Analytics (OEDA) of the Centers for Medicare & Medicaid Services (CMS) through a contract with NORC at the University of Chicago. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Respondents interviewed up to three times annually across four years ✔️ More than 1.2 million interviews since inception ❌ Some respondents designate a proxy to interview on their behalf ❌ Facility residents included, but not in public use file Three Example Findings: Among all Medicare beneficiaries, 7 percent reported having problems paying a medical bill in 2021. Between 1999 and 2017, Medicare beneficiaries with diabetes faced higher out-of-pocket costs. In 2020, healthcare expenditure for non-fatal falls was $80 billion, the majority paid by Medicare. Two Methodology Documents: MCBS Methodology Report MCBS Advanced Tutorial on Weighting and Variance Estimation One Haiku: # old, or disabled # access to medical care, # utilization Download, Import, Preparation Download and import the 2021 cost supplement public use file: tf <- tempfile() this_url <- "https://www.cms.gov/files/zip/cspuf2021.zip" download.file( this_url , tf , mode = 'wb' ) unzipped_files <- unzip( tf , exdir = tempdir() ) mcbs_csv <- grep( '\\\\.csv$' , unzipped_files , value = TRUE ) mcbs_df <- read.csv( mcbs_csv ) names( mcbs_df ) <- tolower( names( mcbs_df ) ) Save Locally   Save the object at any point: # mcbs_fn <- file.path( path.expand( "~" ) , "MCBS" , "this_file.rds" ) # saveRDS( mcbs_df , file = mcbs_fn , compress = FALSE ) Load the same object: # mcbs_df <- readRDS( mcbs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) mcbs_design <- svrepdesign( weight = ~cspufwgt , repweights = 'cspuf[0-9]+' , mse = TRUE , type = 'Fay' , rho = 0.3 , data = mcbs_df ) Variable Recoding Add new columns to the data set: mcbs_design <- update( mcbs_design , one = 1 , csp_age = factor( csp_age , levels = 1:3 , labels = c( '01: younger than 65' , '02: 65 to 74' , '03: 75 or older' ) ) , two_or_more_chronic_conditions = as.numeric( csp_nchrncnd > 1 ) , csp_sex = factor( csp_sex , labels = c( 'male' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( mcbs_design , "sampling" ) != 0 ) svyby( ~ one , ~ csp_age , mcbs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , mcbs_design ) svyby( ~ one , ~ csp_age , mcbs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ pamtoop , mcbs_design ) svyby( ~ pamtoop , ~ csp_age , mcbs_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ csp_sex , mcbs_design ) svyby( ~ csp_sex , ~ csp_age , mcbs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ pamtoop , mcbs_design ) svyby( ~ pamtoop , ~ csp_age , mcbs_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ csp_sex , mcbs_design ) svyby( ~ csp_sex , ~ csp_age , mcbs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ pamtoop , mcbs_design , 0.5 ) svyby( ~ pamtoop , ~ csp_age , mcbs_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ pamtoop , denominator = ~ pamttot , mcbs_design ) Subsetting Restrict the survey design to household income below $25,000: sub_mcbs_design <- subset( mcbs_design , csp_income == 1 ) Calculate the mean (average) of this subset: svymean( ~ pamtoop , 
sub_mcbs_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ pamtoop , mcbs_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ pamtoop , ~ csp_age , mcbs_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( mcbs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ pamtoop , mcbs_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ pamtoop , mcbs_design , deff = TRUE ) # SRS with replacement svymean( ~ pamtoop , mcbs_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ two_or_more_chronic_conditions , mcbs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( pamtoop ~ two_or_more_chronic_conditions , mcbs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ two_or_more_chronic_conditions + csp_sex , mcbs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( pamtoop ~ two_or_more_chronic_conditions + csp_sex , mcbs_design ) summary( glm_result ) Replication Example This example matches the weighted total from the 2021 Data User’s Guide: Cost Supplement File Public Use File: stopifnot( round( coef( svytotal( ~ one , mcbs_design ) ) , 0 ) == 59040948 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for MCBS users, this code replicates previously-presented examples: library(srvyr) mcbs_srvyr_design <- as_survey( mcbs_design ) Calculate the mean (average) of a linear variable, overall and by groups: mcbs_srvyr_design %>% summarize( mean = survey_mean( pamtoop ) ) mcbs_srvyr_design %>% group_by( csp_age ) %>% summarize( mean = survey_mean( pamtoop ) ) "],["medical-expenditure-panel-survey-meps.html", "Medical Expenditure Panel Survey (MEPS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Medical Expenditure Panel Survey (MEPS) The Household Component captures person-level spending across service categories and coverage types. The consolidated file contains one row per individual within each sampled household; other tables contain one record per event (like prescription fills, hospitalizations), per job, per insurance policy. A complex sample survey designed to generalize to the U.S. civilian non-institutionalized population. Released annually since 1996. Administered by the Agency for Healthcare Research and Quality. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed information about individual medical events ✔️ Detailed sources of health expenditures ❌ Methodological changes could make a notable impact on trend analyses for conditions ❌ Household-reported medical events may be undercounted Three Example Findings: In 2021, the top 1 percent of the population ranked by their healthcare expenditures accounted for 24.0 percent of total healthcare expenditures among the U.S. civilian noninstitutionalized population. Compared to those losing a job during the three prior years, nonelderly adults who lost a job during 2014 thru 2016 had a 6% net reduction in loss of health insurance coverage. Americans lose ~$1,500 per year (in 2013 USD) over their life-course due to bad health realizations. Two Methodology Documents: MEPS HC-224 2020 Full Year Consolidated Data File Wikipedia Entry One Haiku: # king dumpty's horsemen # ahrq stitches payors, bills, claims # fractured health system Function Definitions Define a function to download, unzip, and import each sas file: library(haven) meps_sas_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) this_tbl <- read_sas( this_tf ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the 2022 consolidated file and the replicate weights file: meps_cons_df <- meps_sas_import( "https://meps.ahrq.gov/mepsweb/data_files/pufs/h243/h243v9.zip" ) meps_brr_df <- meps_sas_import( "https://meps.ahrq.gov/mepsweb/data_files/pufs/h036brr/h36brr22v9.zip" ) Merge the consolidated file with the replicate weights: meps_df <- merge( meps_cons_df , meps_brr_df ) stopifnot( nrow( meps_df ) == nrow( meps_cons_df ) ) meps_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # meps_fn <- file.path( path.expand( "~" ) , "MEPS" , "this_file.rds" ) # saveRDS( meps_df , file = meps_fn , compress = FALSE ) Load the same object: # meps_df <- readRDS( meps_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) meps_design <- svrepdesign( data = meps_df , weights = ~ perwt22f , type = "BRR" , combined.weights = FALSE , repweights = "brr[1-9]+" , mse = TRUE ) Variable Recoding Add new columns to the data set: meps_design <- update( meps_design , one = 1 , insured_december_31st = ifelse( ins22x %in% 1:2 , as.numeric( ins22x == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( meps_design , "sampling" ) != 0 ) svyby( ~ one , ~ region22 , meps_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , meps_design ) svyby( ~ one , ~ region22 , meps_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ totexp22 , meps_design ) svyby( ~ totexp22 , ~ region22 , meps_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ sex , meps_design ) svyby( ~ sex , ~ region22 , meps_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ totexp22 , meps_design ) svyby( ~ totexp22 , ~ region22 , meps_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ sex , meps_design ) svyby( ~ sex , ~ region22 , 
meps_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ totexp22 , meps_design , 0.5 ) svyby( ~ totexp22 , ~ region22 , meps_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ totmcd22 , denominator = ~ totexp22 , meps_design ) Subsetting Restrict the survey design to seniors: sub_meps_design <- subset( meps_design , agelast >= 65 ) Calculate the mean (average) of this subset: svymean( ~ totexp22 , sub_meps_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ totexp22 , meps_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ totexp22 , ~ region22 , meps_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( meps_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ totexp22 , meps_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ totexp22 , meps_design , deff = TRUE ) # SRS with replacement svymean( ~ totexp22 , meps_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ insured_december_31st , meps_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( totexp22 ~ insured_december_31st , meps_design ) Perform a chi-squared test of association for survey data: svychisq( ~ insured_december_31st + sex , meps_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( totexp22 ~ insured_december_31st + sex , meps_design ) summary( glm_result ) Replication Example This example matches the statistic and standard error shown under Analysis of the Total Population: library(foreign) xport_2002_tf <- tempfile() xport_2002_url <- "https://meps.ahrq.gov/data_files/pufs/h70ssp.zip" download.file( xport_2002_url , xport_2002_tf , mode = 'wb' ) unzipped_2002_xport <- unzip( xport_2002_tf , exdir = tempdir() ) meps_2002_df <- read.xport( unzipped_2002_xport ) names( meps_2002_df ) <- tolower( names( meps_2002_df ) ) meps_2002_design <- svydesign( ~ varpsu , strata = ~ varstr , weights = ~ perwt02f , data = meps_2002_df , nest = TRUE ) result <- svymean( ~ totexp02 , meps_2002_design ) stopifnot( round( coef( result ) , 2 ) == 2813.24 ) stopifnot( round( SE( result ) , 2 ) == 58.99 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for MEPS users, this code replicates previously-presented examples: library(srvyr) meps_srvyr_design <- as_survey( meps_design ) Calculate the mean (average) of a linear variable, overall and by groups: meps_srvyr_design %>% summarize( mean = survey_mean( totexp22 ) ) meps_srvyr_design %>% group_by( region22 ) %>% summarize( mean = survey_mean( totexp22 ) ) "],["medical-large-claims-experience-study-mlces.html", "Medical Large Claims Experience Study (MLCES) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Medical Large Claims Experience Study (MLCES) A high quality dataset of medical claims from seven private health insurance companies. One table with one row per individual with nonzero total paid charges. A convenience sample of group (employer-sponsored) health insurers in the United States. 1997 thru 1999 with no expected updates in the future. Provided by the Society of Actuaries (SOA). Recommended Reading Two Methodology Documents: Group Medical Insurance Claims Database Collection and Analysis Report Claim Severities, Claim Relativities, and Age: Evidence from SOA Group Health Data One Haiku: # skewed by black swan tails # means, medians sing adieu # claims distribution Download, Import, Preparation Download and import the 1999 medical claims file: tf <- tempfile() this_url <- "https://www.soa.org/Files/Research/1999.zip" download.file( this_url , tf , mode = 'wb' ) unzipped_file <- unzip( tf , exdir = tempdir() ) mlces_df <- read.csv( unzipped_file ) names( mlces_df ) <- tolower( names( mlces_df ) ) Save Locally   Save the object at any point: # mlces_fn <- file.path( path.expand( "~" ) , "MLCES" , "this_file.rds" ) # saveRDS( mlces_df , file = mlces_fn , compress = FALSE ) Load the same object: # mlces_df <- readRDS( mlces_fn ) Variable Recoding Add new columns to the data set: mlces_df <- transform( mlces_df , one = 1 , claimant_relationship_to_policyholder = ifelse( relation == "E" , "covered employee" , ifelse( relation == "S" , "spouse of covered employee" , ifelse( relation == "D" , "dependent of covered employee" , NA ) ) ) , ppo_plan = as.numeric( ppo == 'Y' ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( mlces_df ) table( mlces_df[ , "claimant_relationship_to_policyholder" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( mlces_df[ , "totpdchg" ] ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , mean ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( mlces_df[ , "patsex" ] ) ) prop.table( table( mlces_df[ , c( "patsex" , "claimant_relationship_to_policyholder" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( mlces_df[ , "totpdchg" ] ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , sum ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( mlces_df[ , "totpdchg" ] , 0.5 ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , quantile , 0.5 ) Subsetting Limit your data.frame to persons under 18: sub_mlces_df <- subset( mlces_df , ( ( claimyr - patbrtyr ) < 18 ) ) Calculate the mean (average) of 
this subset: mean( sub_mlces_df[ , "totpdchg" ] ) Measures of Uncertainty Calculate the variance, overall and by groups: var( mlces_df[ , "totpdchg" ] ) tapply( mlces_df[ , "totpdchg" ] , mlces_df[ , "claimant_relationship_to_policyholder" ] , var ) Regression Models and Tests of Association Perform a t-test: t.test( totpdchg ~ ppo_plan , mlces_df ) Perform a chi-squared test of association: this_table <- table( mlces_df[ , c( "ppo_plan" , "patsex" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( totpdchg ~ ppo_plan + patsex , data = mlces_df ) summary( glm_result ) Replication Example This example matches statistics in Table II-A’s 1999 row numbers 52 and 53 from the Database: Match Claimants Exceeding Deductible: # $0 deductible stopifnot( nrow( mlces_df ) == 1591738 ) # $1,000 deductible mlces_above_1000_df <- subset( mlces_df , totpdchg > 1000 ) stopifnot( nrow( mlces_above_1000_df ) == 402550 ) Match the Excess Charges Above Deductible: # $0 deductible stopifnot( round( sum( mlces_df[ , 'totpdchg' ] ) , 0 ) == 2599356658 ) # $1,000 deductible stopifnot( round( sum( mlces_above_1000_df[ , 'totpdchg' ] - 1000 ) , 0 ) == 1883768786 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for MLCES users, this code replicates previously-presented examples: library(dplyr) mlces_tbl <- as_tibble( mlces_df ) Calculate the mean (average) of a linear variable, overall and by groups: mlces_tbl %>% summarize( mean = mean( totpdchg ) ) mlces_tbl %>% group_by( claimant_relationship_to_policyholder ) %>% summarize( mean = mean( totpdchg ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for MLCES users, this code replicates previously-presented examples: library(data.table) mlces_dt <- data.table( mlces_df ) Calculate the mean (average) of a linear variable, overall and by groups: mlces_dt[ , mean( totpdchg ) ] mlces_dt[ , mean( totpdchg ) , by = claimant_relationship_to_policyholder ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. 
As a starting point for MLCES users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'mlces' , mlces_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( totpdchg ) FROM mlces' ) dbGetQuery( con , 'SELECT claimant_relationship_to_policyholder , AVG( totpdchg ) FROM mlces GROUP BY claimant_relationship_to_policyholder' ) "],["national-agricultural-workers-survey-naws.html", "National Agricultural Workers Survey (NAWS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Agricultural Workers Survey (NAWS) The primary face-to-face interview of currently-employed crop workers in the United States, with detailed questions on demographics, occupational injury, health surveillance, and seasonal and migrant labor. One cumulative table containing all interviews since 1989, with one row per sampled respondent. A complex sample designed to generalize to crop production workers employed by establishments engaged in Crop Production (NAICS 111) and Support Activities for Crop Production (NAICS 1151). Released biennially since 1989. Administered by the Employment and Training Administration, in partnership with JBS International. Recommended Reading Four Example Strengths & Limitations: ✔️ Employer-based sample increases the likelihood migrant workers will be interviewed ✔️ Seasonal sampling in order to avoid bias ❌ Respondents not followed over time ❌ Except for California, the data are not available at the state level Three Example Findings: Across 2019-2020, 49% of US crop workers said their most recent health care visit for preventive or routine care was to a community health center or migrant health clinic. Pesticide exposure increased between 2002 and 2016 among US crop workers. Hired crop workers who responded negatively to “employer provides clean drinking water and disposable cups every day” were at greater odds of injury between 2002 and 2015. Two Methodology Documents: Findings from the National Agricultural Workers Survey (NAWS) 2021–2022: A Demographic and Employment Profile of United States Crop Workers Statistical Methods of the National Agricultural Workers Survey One Haiku: # were i king, my court: # arcimboldo's vertumnus # jester juggling self Download, Import, Preparation The public access dataset does not currently include the variables needed to get design-adjusted estimates. Previous data releases contained replicate weights; however, those have been discontinued. Although the PUF allows external researchers to match weighted shares, the UCLA Statistical Consulting Group cautions that ignoring the clustering will likely lead to standard errors that are underestimated, possibly leading to results that seem to be statistically significant, when in fact they are not. 
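To see why, here is a minimal simulation sketch (not NAWS data; the cluster count, cluster size, and intra-cluster correlation are invented purely for illustration) contrasting a design that acknowledges the clustering with one that ignores it:

library(survey)

set.seed( 1234 )

# fifty employer clusters of twenty workers; each cluster shares a random shift,
# mimicking the intra-cluster correlation of an employer-based sample
sim_df <- data.frame( cluster = rep( 1:50 , each = 20 ) , one = 1 )

sim_df[ , 'y' ] <- rnorm( nrow( sim_df ) ) + rep( rnorm( 50 ) , each = 20 )

# design acknowledging the clustering
clustered_design <- svydesign( id = ~ cluster , weights = ~ one , data = sim_df )

# design incorrectly treating every worker as an independent draw
independent_design <- svydesign( id = ~ 1 , weights = ~ one , data = sim_df )

# the clustered standard error is typically noticeably larger
SE( svymean( ~ y , clustered_design ) )

SE( svymean( ~ y , independent_design ) )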
In order for the Employment and Training Administration (ETA) to consider a request for offsite use of the restricted NAWS data file, send these items to the contact listed here for inquiries about the survey: A brief description of the research aims and how NAWS data will support the research; A statement as to why the NAWS public data file is insufficient to meet the research aims; A description of how and when the resulting findings will be disseminated; and A brief description of the analysis plan, so that NAWS staff may assess the suitability of the NAWS given the research aims and analysis plan. Upon receipt of this microdata, begin by loading the SAS file: library(haven) naws_tbl <- read_sas( file.path( path.expand( "~" ) , "nawscrtdvars2db22.sas7bdat" ) ) naws_df <- data.frame( naws_tbl ) names( naws_df ) <- tolower( names( naws_df ) ) Save Locally   Save the object at any point: # naws_fn <- file.path( path.expand( "~" ) , "NAWS" , "this_file.rds" ) # saveRDS( naws_df , file = naws_fn , compress = FALSE ) Load the same object: # naws_df <- readRDS( naws_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) naws_design <- svydesign( id = ~ cluster , strata = ~ interaction( fpc_region , cycle ) , data = naws_df , weights = ~ pwtycrd, nest = TRUE ) Variable Recoding Add new columns to the data set: naws_design <- update( naws_design , one = 1 , country_of_birth = factor( findInterval( a07 , c( 3 , 4 , 5 , 100 ) ) , levels = 0:4 , labels = c( 'us or pr' , 'mexico' , 'central america' , 'south america, carribean, asia, or other' , 'missing' ) ) , gender = factor( gender , levels = 0:1 , labels = c( 'male' , 'female' ) ) , interview_cohort = factor( findInterval( fy , seq( 1989 , 2021 , 2 ) ) , levels = seq_along( seq( 1989 , 2021 , 2 ) ) , labels = paste( seq( 1989 , 2021 , 2 ) , seq( 1990 , 2022 , 2 ) , sep = '-' ) ) , authorized_to_work = ifelse( l01 < 9 , as.numeric( l01 < 5 ) , NA ) , hours_worked_last_week_at_farm_job = d04 ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( naws_design , "sampling" ) != 0 ) svyby( ~ one , ~ interview_cohort , naws_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , naws_design ) svyby( ~ one , ~ interview_cohort , naws_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ waget1 , naws_design , na.rm = TRUE ) svyby( ~ waget1 , ~ interview_cohort , naws_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ country_of_birth , naws_design , na.rm = TRUE ) svyby( ~ country_of_birth , ~ interview_cohort , naws_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ waget1 , naws_design , na.rm = TRUE ) svyby( ~ waget1 , ~ interview_cohort , naws_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ country_of_birth , naws_design , na.rm = TRUE ) svyby( ~ country_of_birth , ~ interview_cohort , naws_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ waget1 , naws_design , 0.5 , na.rm = TRUE ) svyby( ~ waget1 , ~ interview_cohort , naws_design , 
svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ hours_worked_last_week_at_farm_job , denominator = ~ numfempl , naws_design , na.rm = TRUE ) Subsetting Restrict the survey design to California, the only standalone state with adequate sample: sub_naws_design <- subset( naws_design , region12 == 'CA' ) Calculate the mean (average) of this subset: svymean( ~ waget1 , sub_naws_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ waget1 , naws_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ waget1 , ~ interview_cohort , naws_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( naws_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ waget1 , naws_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ waget1 , naws_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ waget1 , naws_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ authorized_to_work , naws_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( waget1 ~ authorized_to_work , naws_design ) Perform a chi-squared test of association for survey data: svychisq( ~ authorized_to_work + country_of_birth , naws_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( waget1 ~ authorized_to_work + country_of_birth , naws_design ) summary( glm_result ) Replication Example This example matches the unweighted counts and weighted percents of the gender rows shown on PDF page 90 of the most current research report; however, the restricted-use dataset does not include information to implement a finite population correction (FPC). Since an FPC always reduces the standard error, omitting it only makes results more conservative. JBS International shared standard errors and coefficients of variation omitting the FPC; this exercise precisely matches those numbers as well: # less conservative options( survey.lonely.psu = "remove" ) published_unweighted_counts <- c( 1823 , 775 ) published_percentages <- c( 0.68 , 0.32 ) unpublished_se <- c( 0.024 , 0.024 ) unpublished_cv <- c( 0.04 , 0.08 ) current_cohort <- subset( naws_design , interview_cohort == '2021-2022' ) ( unwtd_n <- svyby( ~ one , ~ gender , current_cohort , unwtd.count ) ) stopifnot( all( coef( unwtd_n ) == published_unweighted_counts ) ) ( results <- svymean( ~ gender , current_cohort ) ) stopifnot( all( round( coef( results ) , 2 ) == published_percentages ) ) stopifnot( all( round( SE( results ) , 3 ) == unpublished_se ) ) stopifnot( all( round( cv( results ) , 2 ) == unpublished_cv ) ) # more conservative options( survey.lonely.psu = "adjust" ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. 
srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NAWS users, this code replicates previously-presented examples: library(srvyr) naws_srvyr_design <- as_survey( naws_design ) Calculate the mean (average) of a linear variable, overall and by groups: naws_srvyr_design %>% summarize( mean = survey_mean( waget1 , na.rm = TRUE ) ) naws_srvyr_design %>% group_by( interview_cohort ) %>% summarize( mean = survey_mean( waget1 , na.rm = TRUE ) ) "],["national-beneficiary-survey-nbs.html", "National Beneficiary Survey (NBS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Beneficiary Survey (NBS) The principal microdata for U.S. disability researchers interested in Social Security program performance. One table with one row per respondent. A complex sample designed to generalize to Americans between age 18 and full retirement age, covered by either Social Security Disability Insurance (SSDI) or Supplemental Security Income (SSI). Released at irregular intervals, with 2004, 2005, 2006, 2010, 2015, 2017, and 2019 available. Administered by the Social Security Administration. Recommended Reading Four Example Strengths & Limitations: ✔️ Instrument designed to reduce challenges related to communication, stamina, cognitive barriers ✔️ Longitudinal 2019 sample includes beneficiaries working at prior round (2017) interview ❌ Not designed to produce regional or state-level estimates ❌ May overstate beneficiary poverty status and understate beneficiary income Three Example Findings: Large gaps in income and expenditure between Social Security Disability Insurance recipient households and working households generally increase with the number of dependents. The share of Social Security Disability Insurance beneficiaries who had work goals or work expectations rose from 34% in 2005 to 43% in 2015. In 2010, 9% of disabled-worker beneficiaries had a 4-year degree and 28% had less than a high school education. 
Two Methodology Documents: National Beneficiary Survey: Disability Statistics, 2015 National Beneficiary Survey - General Waves Round 7: User’s Guide One Haiku: # social safety net # poverty acrobatics # trap or trampoline Download, Import, Preparation Download and import the round 7 file: library(haven) zip_tf <- tempfile() zip_url <- "https://www.ssa.gov/disabilityresearch/documents/R7NBSPUF_STATA.zip" download.file( zip_url , zip_tf , mode = 'wb' ) nbs_tbl <- read_stata( zip_tf ) nbs_df <- data.frame( nbs_tbl ) names( nbs_df ) <- tolower( names( nbs_df ) ) nbs_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nbs_fn <- file.path( path.expand( "~" ) , "NBS" , "this_file.rds" ) # saveRDS( nbs_df , file = nbs_fn , compress = FALSE ) Load the same object: # nbs_df <- readRDS( nbs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) # representative beneficiary sample nbs_design <- svydesign( id = ~ r7_a_psu_pub , strata = ~ r7_a_strata , weights = ~ r7_wtr7_ben , data = subset( nbs_df , r7_wtr7_ben > 0 ) ) # cross-sectional successful worker sample cssws_design <- svydesign( id = ~ r7_a_psu_pub , strata = ~ r7_a_strata , weights = ~ r7_wtr7_cssws , data = subset( nbs_df , r7_wtr7_cssws > 0 ) ) # longitudinal successful worker sample lngsws_design <- svydesign( id = ~ r7_a_psu_pub , strata = ~ r7_a_strata , weights = ~ r7_wtr7_lngsws , data = subset( nbs_df , r7_wtr7_lngsws > 0 ) ) Variable Recoding Add new columns to the data set: nbs_design <- update( nbs_design , male = as.numeric( r7_orgsampinfo_sex == 1 ) , age_categories = factor( r7_c_intage_pub , labels = c( "18-25" , "26-40" , "41-55" , "56 and older" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nbs_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_categories , nbs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nbs_design ) svyby( ~ one , ~ age_categories , nbs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ r7_c_hhsize_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_c_hhsize_pub , ~ age_categories , nbs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ r7_c_hhsize_pub , nbs_design , na.rm = TRUE ) svyby( ~ r7_c_hhsize_pub , ~ age_categories , nbs_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ r7_n_totssbenlastmnth_pub , nbs_design , 0.5 , na.rm = TRUE ) svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ r7_n_ssilastmnth_pub , denominator = ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) Subsetting Restrict the survey design to beneficiaries 
currently covered by Medicare: sub_nbs_design <- subset( nbs_design , r7_c_curmedicare == 1 ) Calculate the mean (average) of this subset: svymean( ~ r7_n_totssbenlastmnth_pub , sub_nbs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ r7_n_totssbenlastmnth_pub , ~ age_categories , nbs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nbs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ r7_n_totssbenlastmnth_pub , nbs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ male , nbs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( r7_n_totssbenlastmnth_pub ~ male , nbs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ male + r7_c_hhsize_pub , nbs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( r7_n_totssbenlastmnth_pub ~ male + r7_c_hhsize_pub , nbs_design ) summary( glm_result ) Replication Example This example matches the percentages and t-tests from the final ten rows of Exhibit 4: ex_4 <- data.frame( variable_label = c( 'coping with stress' , 'concentrating' , 'getting around outside of the home' , 'shopping for personal items' , 'preparing meals' , 'getting into or out of bed' , 'bathing or dressing' , 'getting along with others' , 'getting around inside the house' , 'eating' ) , variable_name = c( "r3_i60_i" , "r3_i59_i" , "r3_i47_i" , "r3_i53_i" , "r3_i55_i" , "r3_i49_i" , "r3_i51_i" , "r3_i61_i" , "r3_i45_i" , "r3_i57_i" ) , overall = c( 61 , 58 , 47 , 39 , 37 , 34 , 30 , 27 , 23 , 14 ) , di_only = c( 60 , 54 , 47 , 36 , 35 , 36 , 30 , 23 , 24 , 13 ) , concurrent = c( 63 , 63 , 47 , 43 , 41 , 34 , 33 , 31 , 23 , 15 ) , concurrent_vs_di = c( F , T , F , F , F , F , F , T , F , F ) , ssi = c( 61 , 62 , 47 , 40 , 39 , 33 , 29 , 31 , 22 , 15 ) , ssi_vs_di = c( F , T , F , F , F , F , F , T , F , F ) ) Download, import, and recode the round 3 file: r3_tf <- tempfile() r3_url <- "https://www.ssa.gov/disabilityresearch/documents/nbsr3pufstata.zip" download.file( r3_url , r3_tf , mode = 'wb' ) r3_tbl <- read_stata( r3_tf ) r3_df <- data.frame( r3_tbl ) names( r3_df ) <- tolower( names( r3_df ) ) r3_design <- svydesign( id = ~ r3_a_psu_pub , strata = ~ r3_a_strata , weights = ~ r3_wtr3_ben , data = subset( r3_df , r3_wtr3_ben > 0 ) ) r3_design <- update( r3_design , benefit_type = factor( r3_orgsampinfo_bstatus , levels = c( 2 , 3 , 1 ) , labels = c( 'di_only' , 'concurrent' , 'ssi' ) ) ) Calculate the final ten rows of Exhibit 4 and confirm each statistic and t-test matches: for( i in seq( nrow( ex_4 ) ) ){ this_formula <- 
as.formula( paste( "~" , ex_4[ i , 'variable_name' ] ) ) overall_percent <- svymean( this_formula , r3_design ) stopifnot( 100 * round( coef( overall_percent ) , 2 ) == ex_4[ i , 'overall' ] ) benefit_percent <- svyby( this_formula , ~ benefit_type , r3_design , svymean ) stopifnot( all.equal( 100 * as.numeric( round( coef( benefit_percent ) , 2 ) ) , as.numeric( ex_4[ i , c( 'di_only' , 'concurrent' , 'ssi' ) ] ) ) ) ttest_formula <- as.formula( paste( ex_4[ i , 'variable_name' ] , "~ benefit_type" ) ) di_only_con_design <- subset( r3_design , benefit_type %in% c( 'di_only' , 'concurrent' ) ) con_ttest <- svyttest( ttest_formula , di_only_con_design ) stopifnot( all.equal( as.logical( con_ttest$p.value < 0.05 ) , as.logical( ex_4[ i , 'concurrent_vs_di' ] ) ) ) di_only_ssi_design <- subset( r3_design , benefit_type %in% c( 'di_only' , 'ssi' ) ) ssi_ttest <- svyttest( ttest_formula , di_only_ssi_design ) stopifnot( all.equal( as.logical( ssi_ttest$p.value < 0.05 ) , as.logical( ex_4[ i , 'ssi_vs_di' ] ) ) ) } Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NBS users, this code replicates previously-presented examples: library(srvyr) nbs_srvyr_design <- as_survey( nbs_design ) Calculate the mean (average) of a linear variable, overall and by groups: nbs_srvyr_design %>% summarize( mean = survey_mean( r7_n_totssbenlastmnth_pub , na.rm = TRUE ) ) nbs_srvyr_design %>% group_by( age_categories ) %>% summarize( mean = survey_mean( r7_n_totssbenlastmnth_pub , na.rm = TRUE ) ) "],["national-crime-victimization-survey-ncvs.html", "National Crime Victimization Survey (NCVS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Crime Victimization Survey (NCVS) The primary information source on victims of nonfatal personal crimes and household property crimes (especially those not reported to the police), and also victim experience within the justice system. Three tables, the first one row per household per interview, the second one per person-interview, the third one per incident reported across each sampled household’s seven-interview, three-year period. A complex survey designed to generalize to civilian, non-institutional Americans aged 12 and older. Released annually since its 1992 rename and redesign, with related surveys dating to the early 1970s. Sponsored by the Bureau of Justice Statistics and administered by the US Census Bureau. Recommended Reading Four Example Strengths & Limitations: ✔️ Detailed taxonomy of crime incidents ✔️ Estimates available for 22 largest states starting in 2017 ❌ May undercount rape and sexual assault ❌ Six month reference period despite respondent recall more accurate looking back only three months Three Example Findings: Nonfatal firearm violence for persons age 12 or older declined 72% from 1993 to 2023. In 2013, less than half of violent crime incidents victimizing individuals aged 12+ were reported to police. Offenders armed with firearms accounted for 38% of nonfatal carjackings between 2012 and 2021. 
Two Methodology Documents: National Crime Victimization Survey, 2016: Technical Documentation A New Measure of Prevalence for the National Crime Victimization Survey One Haiku: # saint peter's sports bar # evil deed instant replay # sinful thought jukebox Function Definitions Define a function to extract values stored in parentheses: ncvs_numeric_to_factor <- function( this_column ) as.numeric( gsub( "^\\\\(([0-9]+)\\\\) (.*)" , "\\\\1" , this_column ) ) Define a function to merge aggregated information onto main data.frame objects: left_join_zero_missings <- function( left_df , right_df ){ final_df <- merge( left_df , right_df , all.x = TRUE ) stopifnot( nrow( final_df ) == nrow( left_df ) ) for( this_column in setdiff( names( right_df ) , names( left_df ) ) ){ final_df[ is.na( final_df[ , this_column ] ) , this_column ] <- 0 } gc() final_df } Download, Import, Preparation Register for the National Archive of Criminal Justice Data at https://www.icpsr.umich.edu/web/NACJD/series/95 Choose National Crime Victimization Survey, Concatenated File, [United States], 1992-2022 (ICPSR 38604) Download the R version of the September 18, 2023 file. Import the three main files: ncvs_household_df_name <- load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0001/38604-0001-Data.rda" ) ) ncvs_person_df_name <- load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0002/38604-0002-Data.rda" ) ) ncvs_incident_df_name <- load( file.path( path.expand( "~" ) , "ICPSR_38604/DS0003/38604-0003-Data.rda" ) ) ncvs_household_df <- get( ncvs_household_df_name ) ncvs_person_df <- get( ncvs_person_df_name ) ncvs_incident_df <- get( ncvs_incident_df_name ) rm( list = ncvs_household_df_name ) ; gc() rm( list = ncvs_person_df_name ) ; gc() rm( list = ncvs_incident_df_name ) ; gc() names( ncvs_household_df ) <- tolower( names( ncvs_household_df ) ) names( ncvs_person_df ) <- tolower( names( ncvs_person_df ) ) names( ncvs_incident_df ) <- tolower( names( ncvs_incident_df ) ) Determine which variables from each table to retain: household_variables_to_keep <- c( 'year' , 'yearq' , 'idhh' , 'wgthhcy' , 'v2002' , 'sc214a' , 'v2026' , 'v2126a' , 'v2126b' , 'v2015' , 'v2017' , 'v2117' , 'v2118' , 'v2125' , 'v2071' , 'v2072' , 'v2127b' , 'v2129' ) person_variables_to_keep <- c( 'year' , 'yearq' , 'v3018' , 'v3014' , 'sc214a' , 'v3023' , 'v3023a' , 'v3024' , 'v3024a' , 'v2117' , 'v2118' , 'v3002' , 'idhh' , 'idper' , 'wgtpercy' , 'v3015' , 'v3033' , 'v2026' ) incident_variables_to_keep <- c( 'year' , 'yearq' , 'v2117' , 'v2118' , 'v4022' , paste0( 'v401' , 6:9 ) , 'v4399' , 'v4529' , 'v4049' , paste0( 'v405' , 0:8 ) , 'v4060' , 'v4062' , paste0( 'v41' , 11:22 ) , 'v4064' , paste0( 'v41' , 27:37 ) , 'v4467' , 'v4234' , 'v4245' , 'v4243' , 'v4241' , 'v4256' , 'v4258' , 'v4278' , 'v4262' , paste0( 'v42' , 59:61 ) , 'v4269' , 'v4270' , 'v4268' , 'v4267' , 'v4271' , 'v4266' , 'v4265' , 'wgtviccy' , 'idhh' , 'idper' , 'v4002' , 'v4288' , 'v4290' , 'v4400' , 'v4437' , 'v4422' , 'v4024' ) Limit columns in each data.frame to those specified above: ncvs_household_df <- ncvs_household_df[ , household_variables_to_keep ] ncvs_person_df <- ncvs_person_df[ , person_variables_to_keep ] ncvs_incident_df <- ncvs_incident_df[ , incident_variables_to_keep ] gc() In this example, limit the 1993-2022 data.frame to only the first & last years for quicker processing: ncvs_household_df <- ncvs_household_df[ ncvs_household_df[ , 'year' ] %in% c( 1994 , 2022 ) , ] ncvs_person_df <- ncvs_person_df[ ncvs_person_df[ , 'year' ] %in% c( 1994 , 2022 ) , ] 
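# optional sanity check, an added sketch that is not part of the original import script:
# confirm the household and person tables now contain only the two analysis years
stopifnot( all( ncvs_household_df[ , 'year' ] %in% c( 1994 , 2022 ) ) )

stopifnot( all( ncvs_person_df[ , 'year' ] %in% c( 1994 , 2022 ) ) )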
ncvs_incident_df <- ncvs_incident_df[ ncvs_incident_df[ , 'year' ] %in% c( 1994 , 2022 ) , ] gc() Recode identifiers to character class: ncvs_household_df[ , 'idhh' ] <- as.character( ncvs_household_df[ , 'idhh' ] ) ncvs_person_df[ c( 'idhh' , 'idper' ) ] <- sapply( ncvs_person_df[ c( 'idhh' , 'idper' ) ] , as.character ) ncvs_incident_df[ c( 'idhh' , 'idper' ) ] <- sapply( ncvs_incident_df[ c( 'idhh' , 'idper' ) ] , as.character ) Recode factor variables to numeric values: ncvs_household_df[ sapply( ncvs_household_df , class ) == 'factor' ] <- sapply( ncvs_household_df[ sapply( ncvs_household_df , class ) == 'factor' ] , ncvs_numeric_to_factor , simplify = FALSE ) ncvs_person_df[ sapply( ncvs_person_df , class ) == 'factor' ] <- sapply( ncvs_person_df[ sapply( ncvs_person_df , class ) == 'factor' ] , ncvs_numeric_to_factor , simplify = FALSE ) ncvs_incident_df[ sapply( ncvs_incident_df , class ) == 'factor' ] <- sapply( ncvs_incident_df[ sapply( ncvs_incident_df , class ) == 'factor' ] , ncvs_numeric_to_factor , simplify = FALSE ) Add a column of ones to each data.frame: ncvs_household_df[ , 'one' ] <- 1 ncvs_person_df[ , 'one' ] <- 1 ncvs_incident_df[ , 'one' ] <- 1 Add a year group variable to each data.frame: ncvs_household_df[ , 'yr_grp' ] <- findInterval( ncvs_household_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) ) ncvs_person_df[ , 'yr_grp' ] <- findInterval( ncvs_person_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) ) ncvs_incident_df[ , 'yr_grp' ] <- findInterval( ncvs_incident_df[ , 'year' ] , c( 1992 , 1997 , 2006 , 2016 ) ) Add a flag indicating whether each incident occurred inside the country: ncvs_incident_df[ , 'exclude_outus' ] <- ncvs_incident_df[ , 'v4022' ] %in% 1 Add a half-year indicator to the incident data.frame: ncvs_incident_df <- transform( ncvs_incident_df , half_year = ifelse( substr( yearq , 6 , 6 ) %in% c( '1' , '2' ) , 1 , ifelse( substr( yearq , 6 , 6 ) %in% c( '3' , '4' ) , 2 , NA ) ) ) stopifnot( all( ncvs_incident_df[ , 'half_year' ] %in% 1:2 ) ) Define violent crimes on the incident data.frame: # rape and sexual assault ncvs_incident_df[ , 'rsa' ] <- ncvs_incident_df[ , 'v4529' ] %in% c( 1:4 , 15 , 16 , 18 , 19 ) # robbery ncvs_incident_df[ , 'rob' ] <- ncvs_incident_df[ , 'v4529' ] %in% 5:10 # assault ncvs_incident_df[ , 'ast' ] <- ncvs_incident_df[ , 'v4529' ] %in% c( 11:14 , 17 , 20 ) # simple assault ncvs_incident_df[ , 'sast' ] <- ncvs_incident_df[ , 'v4529' ] %in% c( 14 , 17 , 20 ) # aggravated assault ncvs_incident_df[ , 'aast' ] <- ncvs_incident_df[ , 'v4529' ] %in% 11:13 # violent crime ncvs_incident_df[ , 'violent' ] <- apply( ncvs_incident_df[ c( 'rsa' , 'rob' , 'ast' ) ] , 1 , any ) # violent crime excluding simple assault ncvs_incident_df[ , 'sviolent' ] <- apply( ncvs_incident_df[ , c( 'rsa' , 'rob' , 'aast' ) ] , 1 , any ) Define personal theft and then person-crime on the incident data.frame: ncvs_incident_df[ , 'ptft' ] <- ncvs_incident_df[ , 'v4529' ] %in% 21:23 ncvs_incident_df[ , 'personcrime' ] <- apply( ncvs_incident_df[ , c( 'violent' , 'ptft' ) ] , 1 , any ) Define property crimes on the incident data.frame: ncvs_incident_df[ , 'hhburg' ] <- ncvs_incident_df[ , 'v4529' ] %in% 31:33 # completed theft with something taken ncvs_incident_df[ , 'burg_ct' ] <- ( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) & ( ncvs_incident_df[ , 'v4288' ] %in% 1 ) # attempted theft ncvs_incident_df[ , 'burg_at' ] <- ( ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) & ( ncvs_incident_df[ , 'v4290' ] %in% 1 ) ncvs_incident_df[ , 'burg_ncat' ] <- ( 
ncvs_incident_df[ , 'v4529' ] %in% 31:33 ) & ( ncvs_incident_df[ , 'v4288' ] %in% 2 ) & ( ncvs_incident_df[ , 'v4290' ] %in% 2 ) ncvs_incident_df[ , 'burgcats2' ] <- 0 ncvs_incident_df[ ncvs_incident_df[ , 'burg_ncat' ] , 'burgcats2' ] <- 2 ncvs_incident_df[ ncvs_incident_df[ , 'burg_ct' ] | ncvs_incident_df[ , 'burg_at' ] , 'burgcats2' ] <- 1 ncvs_incident_df[ , 'burg' ] <- ncvs_incident_df[ , 'burgcats2' ] %in% 1 # trespassing ncvs_incident_df[ , 'tres' ] <- ncvs_incident_df[ , 'burgcats2' ] %in% 2 # motor vehicle theft ncvs_incident_df[ , 'mvtft' ] <- ncvs_incident_df[ , 'v4529' ] %in% 40:41 # household theft ncvs_incident_df[ , 'hhtft' ] <- ncvs_incident_df[ , 'v4529' ] %in% 54:59 # property crime ncvs_incident_df[ , 'property' ] <- apply( ncvs_incident_df[ c( 'hhburg' , 'mvtft' , 'hhtft' ) ] , 1 , any ) Define a series weight on the incident data.frame: ncvs_incident_df[ , 'series' ] <- 2 ncvs_incident_df[ ncvs_incident_df[ , 'v4017' ] %in% c( 1 , 8 ) | ncvs_incident_df[ , 'v4018' ] %in% c( 2 , 8 ) | ncvs_incident_df[ , 'v4019' ] %in% c( 1 , 8 ) , 'series' ] <- 1 ncvs_incident_df[ , 'serieswgt' ] <- 1 ncvs_incident_df[ !( ncvs_incident_df[ , 'v4016' ] %in% 997:998 ) , 'n10v4016' ] <- pmin( ncvs_incident_df[ !( ncvs_incident_df[ , 'v4016' ] %in% 997:998 ) , 'v4016' ] , 10 ) ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 , 'serieswgt' ] <- ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 , 'n10v4016' ] ncvs_incident_df[ ncvs_incident_df[ , 'series' ] == 2 & is.na( ncvs_incident_df[ , 'n10v4016' ] ) , 'serieswgt' ] <- 6 Aggregate property-crimes to the household-interview level: summed_hh_crimes <- aggregate( cbind( property * serieswgt , hhburg * serieswgt , mvtft * serieswgt , burg * serieswgt , tres * serieswgt ) ~ yearq + idhh + v4002 + wgtviccy , data = subset( ncvs_incident_df , !exclude_outus & property ) , sum ) names( summed_hh_crimes ) <- c( 'yearq' , 'idhh' , 'v2002' , 'wgtviccy' , 'property' , 'hhburg' , 'mvtft' , 'burg' , 'tres' ) Merge aggregated property-crimes on to the household-interview data.frame: ncvs_household_df <- left_join_zero_missings( ncvs_household_df , summed_hh_crimes ) rm( summed_hh_crimes ) ; gc() Aggregate person-crimes to the person-interview level: summed_person_crimes <- aggregate( cbind( violent * serieswgt , sviolent * serieswgt , rsa * serieswgt , rob * serieswgt , aast * serieswgt , sast * serieswgt , ptft * serieswgt ) ~ yearq + idhh + v4002 + idper + wgtviccy , data = subset( ncvs_incident_df , !exclude_outus & personcrime ) , sum ) names( summed_person_crimes ) <- c( 'yearq' , 'idhh' , 'v3002' , 'idper' , 'wgtviccy' , 'violent' , 'sviolent' , 'rsa' , 'rob' , 'aast' , 'sast' , 'ptft' ) Merge aggregated person-crimes on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_crimes ) rm( summed_person_crimes ) ; gc() Starting here, the weight calculation prepares an adjustment for all violent crime combined, using the variables violent and violent_year. To calculate the prevalence rate of a subset of person-crimes, replace these two variables throughout the steps below with counterparts like rob and rob_year. 
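As a minimal sketch of that substitution, the person-year aggregation in the next step would swap the rob flag in for the violent flag, with a rob_year column mirroring the violent_year column built below: summed_person_year_robberies <- aggregate( rob * serieswgt ~ idhh + idper + year , data = subset( ncvs_incident_df , !exclude_outus & rob ) , sum ) names( summed_person_year_robberies )[ ncol( summed_person_year_robberies ) ] <- 'rob_year' 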
Aggregate violent crimes to the person-year level: summed_person_year_violent_crimes <- aggregate( violent * serieswgt ~ idhh + idper + year , data = subset( ncvs_incident_df , !exclude_outus & violent ) , sum ) names( summed_person_year_violent_crimes )[ ncol( summed_person_year_violent_crimes ) ] <- 'violent_year' Merge aggregated person-year violent crime series weights on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_year_violent_crimes ) rm( summed_person_year_violent_crimes ) ; gc() Aggregate violent crimes to the person-half-year level, then reshape into a wide data.frame: summed_person_half_year_violent_crimes <- aggregate( wgtviccy ~ idhh + idper + year + half_year , data = subset( ncvs_incident_df , !exclude_outus & violent ) , mean ) first_half_violent_crimes <- subset( summed_person_half_year_violent_crimes , half_year == 1 ) second_half_violent_crimes <- subset( summed_person_half_year_violent_crimes , half_year == 2 ) first_half_violent_crimes[ , 'half_year' ] <- second_half_violent_crimes[ , 'half_year' ] <- NULL names( first_half_violent_crimes )[ ncol( first_half_violent_crimes ) ] <- 'vwgt1' names( second_half_violent_crimes )[ ncol( second_half_violent_crimes ) ] <- 'vwgt2' wide_person_half_year_violent_crimes <- merge( first_half_violent_crimes , second_half_violent_crimes , all = TRUE ) Merge both violent crime weights on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , wide_person_half_year_violent_crimes ) rm( wide_person_half_year_violent_crimes ) ; gc() Find the maximum incident victim weight among three half-year periods: max_half_v_crimes <- aggregate( wgtviccy ~ idhh + idper + year + half_year + v4002 , data = subset( ncvs_incident_df , !exclude_outus & violent ) , max ) max_half_v_crimes <- max_half_v_crimes[ do.call( order , max_half_v_crimes[ c( 'idhh' , 'idper' , 'year' , 'half_year' ) ] ) , ] max_half_v_crimes[ , 'byvar' ] <- apply( max_half_v_crimes[ c( 'idhh' , 'idper' , 'year' , 'half_year' ) ] , 1 , paste , collapse = ' ' ) max_half_v_crimes[ 1 , 'id' ] <- 1 for( i in seq( 2 , nrow( max_half_v_crimes ) ) ){ if( max_half_v_crimes[ i , 'byvar' ] == max_half_v_crimes[ i - 1 , 'byvar' ] ){ max_half_v_crimes[ i , 'id' ] <- max_half_v_crimes[ i - 1 , 'id' ] + 1 } else { max_half_v_crimes[ i , 'id' ] <- 1 } } max_half_v_crimes[ , 'label' ] <- paste0( '_' , max_half_v_crimes[ , 'half_year' ] , '_' , max_half_v_crimes[ , 'id' ] ) max_half_v_crimes[ , 'byvar' ] <- NULL stopifnot( all( max_half_v_crimes[ , 'label' ] %in% c( '_1_1' , '_2_1' , '_1_2' ) ) ) h_1_1_df <- max_half_v_crimes[ max_half_v_crimes[ , 'label' ] == '_1_1' , c( 'idhh' , 'idper' , 'year' , 'wgtviccy' ) ] names( h_1_1_df )[ ncol( h_1_1_df ) ] <- 'wgtviccy_1_1' h_2_1_df <- max_half_v_crimes[ max_half_v_crimes[ , 'label' ] == '_2_1' , c( 'idhh' , 'idper' , 'year' , 'wgtviccy' ) ] names( h_2_1_df )[ ncol( h_2_1_df ) ] <- 'wgtviccy_2_1' h_1_2_df <- max_half_v_crimes[ max_half_v_crimes[ , 'label' ] == '_1_2' , c( 'idhh' , 'idper' , 'year' , 'wgtviccy' ) ] names( h_1_2_df )[ ncol( h_1_2_df ) ] <- 'wgtviccy_1_2' three_half_df <- Reduce( function( ... ) merge( ... 
, all = TRUE ) , list( h_1_1_df , h_2_1_df , h_1_2_df ) ) rm( h_1_1_df , h_2_1_df , h_1_2_df ) ; gc() Merge these three half-year period weights on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , three_half_df ) rm( three_half_df ) ; gc() Aggregate interview counts to the person-year level: summed_person_year_interviews <- aggregate( one ~ idhh + idper + year , data = subset( ncvs_person_df , wgtpercy > 0 ) , sum ) names( summed_person_year_interviews )[ ncol( summed_person_year_interviews ) ] <- 'interview_count' Merge interview_count on to the person-interview data.frame: ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_person_year_interviews ) rm( summed_person_year_interviews ) ; gc() Apply Interview/Incident Groups: ncvs_person_df <- transform( ncvs_person_df , interview_incident_groups = ifelse( violent_year == 0 , 1 , ifelse( interview_count == 1 & ( ( as.numeric( vwgt1 > 0 ) + as.numeric( vwgt2 > 0 ) ) == 1 ) & wgtviccy > 0 , 2 , ifelse( interview_count == 2 & ( ( as.numeric( vwgt1 > 0 ) + as.numeric( vwgt2 > 0 ) ) == 1 ) , 3 , ifelse( interview_count == 2 & ( vwgt1 > 0 ) & ( vwgt2 > 0 ) & ( wgtviccy > 0 ) , 4 , ifelse( interview_count == 3 & ( ( as.numeric( wgtviccy_1_1 > 0 ) + as.numeric( wgtviccy_2_1 > 0 ) + as.numeric( wgtviccy_1_2 > 0 ) ) == 1 ) , 5 , ifelse( interview_count == 3 & ( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & ( wgtviccy_1_2 > 0 ) , 6 , ifelse( interview_count == 3 & ( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & substr( yearq , 6 , 6 ) %in% 1:2 , 7 , ifelse( interview_count == 3 & ( wgtviccy_1_1 > 0 ) & ( wgtviccy_2_1 > 0 ) & substr( yearq , 6 , 6 ) %in% 3:4 , 8 , 9 ) ) ) ) ) ) ) ) ) # confirm all records in group 9 have both a wgtviccy == 0 & wgtpercy == 0 stopifnot( nrow( subset( ncvs_person_df , interview_incident_groups == 9 & wgtviccy > 0 ) ) == 0 ) stopifnot( nrow( subset( ncvs_person_df , interview_incident_groups == 9 & wgtpercy > 0 ) ) == 0 ) ncvs_person_df <- transform( ncvs_person_df , prev_wgt0 = ifelse( interview_incident_groups == 1 , wgtpercy , ifelse( interview_incident_groups == 2 , wgtviccy / 2 , ifelse( interview_incident_groups == 3 , pmax( vwgt1 , vwgt2 , na.rm = TRUE ) / 2 , ifelse( interview_incident_groups == 4 , wgtviccy / 2 , ifelse( interview_incident_groups == 5 , pmax( wgtviccy_1_1 , wgtviccy_1_2 , wgtviccy_2_1 , na.rm = TRUE ) / 2 , ifelse( interview_incident_groups == 6 , wgtviccy / 2 , ifelse( interview_incident_groups == 7 , wgtviccy_1_1 / 2 , ifelse( interview_incident_groups == 8 , wgtviccy_2_1 / 2 , ifelse( interview_incident_groups == 9 , 0 , NA ) ) ) ) ) ) ) ) ) ) # matches table 8 # https://www.ojp.gov/pdffiles1/bjs/grants/308745.pdf#page=44 Aggregate wgtviccy and prev_wgt0 sums to the year level, then merge: summed_year_weights <- aggregate( cbind( wgtviccy , prev_wgt0 ) ~ year , data = subset( ncvs_person_df , violent_year == 1 ) , sum ) names( summed_year_weights ) <- c( 'year' , 'vwgt_1v' , 'prev_1v' ) ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_year_weights ) rm( summed_year_weights ) ; gc() Calibrate so that the weight sums to wgtviccy for persons with exactly one victimization: ncvs_person_df <- transform( ncvs_person_df , prev_wgt1 = ifelse( violent_year == 0 , prev_wgt0 , ifelse( violent_year > 0 & wgtpercy > 0 , prev_wgt0 * ( vwgt_1v / prev_1v ) , 0 ) ) ) Aggregate wgtpercy and prev_wgt1 sums to the year level, then merge: summed_year_crimes <- aggregate( cbind( wgtpercy , ifelse( 
violent_year == 0 , prev_wgt1 , 0 ) ) ~ year , data = ncvs_person_df , sum ) names( summed_year_crimes ) <- c( 'year' , 'total_persons' , 'prev_with_crime' , 'prev_no_crime' ) ncvs_person_df <- left_join_zero_missings( ncvs_person_df , summed_year_crimes ) rm( summed_year_crimes ) ; gc() Calibrate so that the weight sums to wgtpercy for all persons: ncvs_person_df <- transform( ncvs_person_df , prev_wgt = ifelse( violent_year == 0 , prev_wgt1 * ( ( total_persons - prev_with_crime ) / prev_no_crime ) , prev_wgt1 ) ) Save Locally   Save the object at any point: # ncvs_fn <- file.path( path.expand( "~" ) , "NCVS" , "this_file.rds" ) # saveRDS( ncvs_df , file = ncvs_fn , compress = FALSE ) Load the same object: # ncvs_df <- readRDS( ncvs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options('survey.lonely.psu' = 'adjust') # replace missing clusters ncvs_person_df[ is.na( ncvs_person_df[ , 'v2118' ] ) , 'v2118' ] <- -1 ncvs_person_df[ is.na( ncvs_person_df[ , 'v2117' ] ) , 'v2117' ] <- -1 # subset this dataset to only 2022 ncvs_df <- subset( ncvs_person_df , year == max( year ) ) ncvs_design <- svydesign( ~ v2118 , strata = ~ interaction( yr_grp , v2117 ) , data = ncvs_df , weights = ~ prev_wgt , nest = TRUE ) Variable Recoding Add new columns to the data set: ncvs_design <- update( ncvs_design , one = 1 , victim = as.numeric( violent_year > 0 ) , sex = factor( v3018 , levels = 1:2 , labels = c( 'male' , 'female' ) ) , linear_age = ifelse( v3014 == 99 , NA , v3014 ) , times_moved_in_prior_five_years = ifelse( v3033 == 99 , NA , v3033 ) , current_marital_status = factor( v3015 , levels = c( 1:5 , 8 ) , labels = c( 'married' , 'widowed' , 'divorced' , 'separated' , 'single' , 'residue' ) ) , household_income_starting_2015q1 = factor( findInterval( sc214a , c( 1 , 9 , 13 , 16 , 18 ) ) , levels = 1:5 , labels = c( 'less than $25,000' , '$25,000 - $49,999' , '$50,000 - $99,999' , '$100,000 - $199,999' , '$200,000 or more' ) ) , household_income_75k = ifelse( v2026 == 98 , NA , as.numeric( v2026 %in% 14:18 ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( ncvs_design , "sampling" ) != 0 ) svyby( ~ one , ~ sex , ncvs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , ncvs_design ) svyby( ~ one , ~ sex , ncvs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ victim , ncvs_design ) svyby( ~ victim , ~ sex , ncvs_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ current_marital_status , ncvs_design ) svyby( ~ current_marital_status , ~ sex , ncvs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ victim , ncvs_design ) svyby( ~ victim , ~ sex , ncvs_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ current_marital_status , ncvs_design ) svyby( ~ current_marital_status , ~ sex , ncvs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ victim , ncvs_design , 0.5 ) svyby( ~ victim , ~ sex , ncvs_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ times_moved_in_prior_five_years , denominator = ~ linear_age , ncvs_design , na.rm = TRUE ) 
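Before subsetting, one optional sanity check of the prevalence weight construction above, a sketch assuming the full ncvs_person_df built earlier. By design, the calibrated prev_wgt should reproduce the wgtpercy population total within each year: prev_wgt_sums <- aggregate( cbind( prev_wgt , wgtpercy ) ~ year , data = ncvs_person_df , sum ) stopifnot( isTRUE( all.equal( prev_wgt_sums[ , 'prev_wgt' ] , prev_wgt_sums[ , 'wgtpercy' ] ) ) ) 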
Subsetting Restrict the survey design to elderly americans: sub_ncvs_design <- subset( ncvs_design , linear_age >= 65 ) Calculate the mean (average) of this subset: svymean( ~ victim , sub_ncvs_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ victim , ncvs_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ victim , ~ sex , ncvs_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( ncvs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ victim , ncvs_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ victim , ncvs_design , deff = TRUE ) # SRS with replacement svymean( ~ victim , ncvs_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ household_income_75k , ncvs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( victim ~ household_income_75k , ncvs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ household_income_75k + current_marital_status , ncvs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( victim ~ household_income_75k + current_marital_status , ncvs_design ) summary( glm_result ) Replication Example This example matches the 1994 and 2022 victimization rates and SEs in Appendix Table 1: new_prevalence_design <- svydesign( ~ v2118 , strata = ~ interaction( yr_grp , v2117 ) , data = ncvs_person_df , weights = ~ prev_wgt , nest = TRUE ) new_prevalence_results <- svyby( ~ as.numeric( violent_year > 0 ) , ~ year , new_prevalence_design , svymean ) # match new method (wgt_ovam) 1994 and 2022 estimates stopifnot( round( coef( new_prevalence_results )[ c( 1 , nrow( new_prevalence_results ) ) ] , 4 ) == c( 0.0442 , 0.0151 ) ) # match new method (wgt_ovam) 1994 and 2022 standard errors stopifnot( round( SE( new_prevalence_results )[ c( 1 , nrow( new_prevalence_results ) ) ] , 5 ) == c( 0.0010 , 0.00054 ) ) old_prevalence_design <- svydesign( ~ v2118 , strata = ~ interaction( yr_grp , v2117 ) , data = ncvs_person_df , weights = ~ wgtpercy , nest = TRUE ) old_prevalence_results <- svyby( ~ as.numeric( violent_year > 0 ) , ~ year , old_prevalence_design , svymean ) # match old method (wgtpercy) 1994 and 2022 estimates stopifnot( round( coef( old_prevalence_results )[ c( 1 , nrow( old_prevalence_results ) ) ] , 4 ) == c( 0.0328 , 0.0124 ) ) # match old method (wgtpercy) 1994 and 2022 standard errors stopifnot( round( SE( old_prevalence_results )[ c( 1 , nrow( old_prevalence_results ) ) ] , 5 ) == c( 0.00075 , 0.00042 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for NCVS users, this code replicates previously-presented examples: library(srvyr) ncvs_srvyr_design <- as_survey( ncvs_design ) Calculate the mean (average) of a linear variable, overall and by groups: ncvs_srvyr_design %>% summarize( mean = survey_mean( victim ) ) ncvs_srvyr_design %>% group_by( sex ) %>% summarize( mean = survey_mean( victim ) ) "],["national-financial-capability-study-nfcs.html", "National Financial Capability Study (NFCS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Financial Capability Study (NFCS) A study of financial knowledge and behavior, like making ends meet, planning ahead, managing assets. One state-by-state survey table with one row per sampled respondent, a separate investor survey. An online non-probability sample of U.S. adults (18+) calibrated to the American Community Survey. Released triennially since 2009. Funded by the FINRA Investor Education Foundation and conducted by FGS Global. Recommended Reading Four Example Strengths & Limitations: ✔️ Comprehensive assessment of financial literacy ✔️ Questionnaire replicated by other studies ❌ Non-probability quota sampling from online panels ❌ Limited income and asset detail compared to CPS or SCF Three Example Findings: In 2018, 33% of Americans aged 51-61 were satisfied with their personal financial situations. The gender gap in financial literacy widened with age in 2021. Average scores on a test of five financial literacy questions declined between 2009 and 2021. Two Methodology Documents: 2021 National Financial Capability Study: State-by-State Survey Methodology Financial Capability Insights: What the NFCS Reveals One Haiku: # lady madonna # laid bank balance goose egg, loves # gold unrequited Download, Import, Preparation Download and import the latest state-by-state microdata: library(haven) zip_tf <- tempfile() zip_url <- 'https://finrafoundation.org/sites/finrafoundation/files/2021-SxS-Data-and-Data-Info.zip' download.file( zip_url , zip_tf , mode = 'wb' ) unzipped_files <- unzip( zip_tf , exdir = tempdir() ) stata_fn <- grep( "\\\\.dta$" , unzipped_files , value = TRUE ) nfcs_tbl <- read_dta( stata_fn ) nfcs_df <- data.frame( nfcs_tbl ) names( nfcs_df ) <- tolower( names( nfcs_df ) ) Add a column of all ones, add labels to state names, add labels to the rainy day fund question: nfcs_df[ , 'one' ] <- 1 nfcs_df[ , 'state_name' ] <- factor( nfcs_df[ , 'stateq' ] , levels = 1:51 , labels = sort( c( 'District of Columbia' , state.name ) ) ) nfcs_df[ , 'rainy_day_fund' ] <- factor( nfcs_df[ , 'j5' ] , levels = c( 1 , 2 , 98 , 99 ) , labels = c( 'Yes' , 'No' , "Don't Know" , "Prefer not to say" ) ) Save Locally   Save the object at any point: # nfcs_fn <- file.path( path.expand( "~" ) , "NFCS" , "this_file.rds" ) # saveRDS( nfcs_df , file = nfcs_fn , compress = FALSE ) Load the same object: # nfcs_df <- readRDS( nfcs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nfcs_design <- svydesign( ~ 1 , data = nfcs_df , weights = ~ wgt_n2 ) division_design <- svydesign( ~ 1 , data = nfcs_df , weights = ~ wgt_d2 ) state_design <- svydesign( ~ 1 , data = nfcs_df , weights = ~ wgt_s3 ) Variable Recoding Add new columns to the data set: nfcs_design <- update( nfcs_design , satisfaction_w_finances = ifelse( j1 > 10 , NA , j1 ) , risk_taking = ifelse( j2 > 10 , NA , j2 ) , difficult_to_pay_bills = factor( j4 , levels = c( 1 , 2 , 3 , 98 , 99 ) , 
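# keep codes 98 and 99 as labeled levels so item nonresponse stays visible in tabulations 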
labels = c( 'Very difficult' , 'Somewhat difficult' , 'Not at all difficult' , "Don't know" , 'Prefer not to say' ) ) , spending_vs_income = factor( j3 , levels = c( 1 , 2 , 3 , 98 , 99 ) , labels = c( 'Spending less than income' , 'Spending more than income' , 'Spending about equal to income' , "Don't know" , 'Prefer not to say' ) ) , unpaid_medical_bills = ifelse( g20 > 2 , NA , as.numeric( g20 == 1 ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nfcs_design , "sampling" ) != 0 ) svyby( ~ one , ~ spending_vs_income , nfcs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nfcs_design ) svyby( ~ one , ~ spending_vs_income , nfcs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ difficult_to_pay_bills , nfcs_design ) svyby( ~ difficult_to_pay_bills , ~ spending_vs_income , nfcs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ difficult_to_pay_bills , nfcs_design ) svyby( ~ difficult_to_pay_bills , ~ spending_vs_income , nfcs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ satisfaction_w_finances , nfcs_design , 0.5 , na.rm = TRUE ) svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ satisfaction_w_finances , denominator = ~ risk_taking , nfcs_design , na.rm = TRUE ) Subsetting Restrict the survey design to persons receiving pandemic-related stimulus payment: sub_nfcs_design <- subset( nfcs_design , j50 == 1 ) Calculate the mean (average) of this subset: svymean( ~ satisfaction_w_finances , sub_nfcs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ satisfaction_w_finances , ~ spending_vs_income , nfcs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nfcs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ satisfaction_w_finances , nfcs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using 
methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ unpaid_medical_bills , nfcs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( satisfaction_w_finances ~ unpaid_medical_bills , nfcs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ unpaid_medical_bills + difficult_to_pay_bills , nfcs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( satisfaction_w_finances ~ unpaid_medical_bills + difficult_to_pay_bills , nfcs_design ) summary( glm_result ) Replication Example This example matches the unweighted count shown on PDF page 4: stopifnot( nrow( nfcs_df ) == 27118 ) This example matches the PDF page 7 estimate that 53% have three months of rainy day funds: national_rainy_day <- svymean( ~ rainy_day_fund , nfcs_design ) stopifnot( round( coef( national_rainy_day )[ 'rainy_day_fundYes' ] , 2 ) == 0.53 ) This example matches counts and rainy day estimates from The Geography of Financial Capability: state_counts <- svyby( ~ one , ~ state_name , state_design , unwtd.count ) stopifnot( state_counts[ 'California' , 'counts' ] == 1252 ) stopifnot( state_counts[ 'Missouri' , 'counts' ] == 501 ) stopifnot( state_counts[ 'Oregon' , 'counts' ] == 1261 ) state_rainy_day <- svyby( ~ rainy_day_fund , ~ state_name , state_design , svymean ) stopifnot( round( state_rainy_day[ 'California' , 'rainy_day_fundYes' ] , 2 ) == 0.57 ) stopifnot( round( state_rainy_day[ 'Missouri' , 'rainy_day_fundYes' ] , 2 ) == 0.51 ) stopifnot( round( state_rainy_day[ 'Oregon' , 'rainy_day_fundYes' ] , 2 ) == 0.52 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NFCS users, this code replicates previously-presented examples: library(srvyr) nfcs_srvyr_design <- as_survey( nfcs_design ) Calculate the mean (average) of a linear variable, overall and by groups: nfcs_srvyr_design %>% summarize( mean = survey_mean( satisfaction_w_finances , na.rm = TRUE ) ) nfcs_srvyr_design %>% group_by( spending_vs_income ) %>% summarize( mean = survey_mean( satisfaction_w_finances , na.rm = TRUE ) ) "],["national-health-and-nutrition-examination-survey-nhanes.html", "National Health and Nutrition Examination Survey (NHANES) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Direct Method of Age-Adjustment Replication Example Analysis Examples with srvyr  ", " National Health and Nutrition Examination Survey (NHANES) Doctors and dentists accompany survey interviewers in a mobile medical center that travels the country. While survey researchers read the questionnaires, medical professionals administer laboratory tests and conduct a full medical examination. The blood work and in-person check-up allow epidemiologists to answer questions like, “how many people have diabetes but don’t know they have diabetes?” Many tables containing information from the various examinations, generally one row per respondent. A complex sample survey designed to generalize to the civilian non-institutionalized U.S. population. 
Released biennially since 1999-2000. Administered by the Centers for Disease Control and Prevention. Recommended Reading Four Example Strengths & Limitations: ✔️ Biospecimen, dietary, and laboratory data ✔️ Basis for growth charts found on the walls of pediatricians’ offices and clinics worldwide ❌ Mobile Examination Centers require 150 minutes per interview ❌ Narrow set of demographic and family relationship questions Three Example Findings: Among US adults with diabetes across 2007-2010 and 2015-2018, the share achieving glycemic control (glycated hemoglobin level, <7%) declined from 57.4% to 50.5%. Ten million Americans alive in 2015 had childhood blood lead levels 5x above the level of concern. Among US children aged 2-5 years interviewed between 2017 and March of 2020 with at least one primary tooth, 11% had at least one untreated decayed primary tooth. Two Methodology Documents: About the National Health and Nutrition Examination Survey NHANES Tutorials One Haiku: # doctor, dentist, labs # mobile examination #vanlife interviews Download, Import, Preparation Download and import the demographics (demo) and total cholesterol laboratory (tchol) data: library(haven) nhanes_2015_2016_demo_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT" nhanes_2017_2018_demo_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT" nhanes_2015_2016_tchol_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/TCHOL_I.XPT" nhanes_2017_2018_tchol_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TCHOL_J.XPT" nhanes_2015_2016_demo_tbl <- read_xpt( nhanes_2015_2016_demo_url ) nhanes_2017_2018_demo_tbl <- read_xpt( nhanes_2017_2018_demo_url ) nhanes_2015_2016_tchol_tbl <- read_xpt( nhanes_2015_2016_tchol_url ) nhanes_2017_2018_tchol_tbl <- read_xpt( nhanes_2017_2018_tchol_url ) nhanes_2015_2016_demo_df <- data.frame( nhanes_2015_2016_demo_tbl ) nhanes_2017_2018_demo_df <- data.frame( nhanes_2017_2018_demo_tbl ) nhanes_2015_2016_tchol_df <- data.frame( nhanes_2015_2016_tchol_tbl ) nhanes_2017_2018_tchol_df <- data.frame( nhanes_2017_2018_tchol_tbl ) Specify which variables to keep from both the demo and tchol data files, then stack the four years: demo_vars <- c( # unique person identifier (merge variable) "SEQN" , # the two-year interviewed + MEC examined weight "WTMEC2YR" , # note that this is a special weight for only # individuals who took the mobile examination center (MEC) exam # there is one other weight available - WTINT2YR - # that should be used when MEC variables are not part of the analysis # interviewed only or interviewed + MEC "RIDSTATR" , # primary sampling unit variable, used in complex design "SDMVPSU" , # strata variable, used in complex design "SDMVSTRA" , # race / ethnicity "RIDRETH3" , # age "RIDAGEYR" , # gender "RIAGENDR" , # pregnant at interview "RIDEXPRG" ) nhanes_2015_2018_demo_df <- rbind( nhanes_2015_2016_demo_df[ , demo_vars ] , nhanes_2017_2018_demo_df[ , demo_vars ] ) tchol_vars <- c( # unique person identifier (merge variable) "SEQN" , # laboratory total cholesterol variable # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/TCHOL_J.htm "LBXTC" ) nhanes_2015_2018_tchol_df <- rbind( nhanes_2015_2016_tchol_df[ , tchol_vars ] , nhanes_2017_2018_tchol_df[ , tchol_vars ] ) Merge the two pooled datasets, limit the data.frame to mobile examination component respondents: nhanes_full_df <- merge( nhanes_2015_2018_demo_df , nhanes_2015_2018_tchol_df , all = TRUE ) names( nhanes_full_df ) <- tolower( names( nhanes_full_df ) ) nhanes_df <- subset( nhanes_full_df , ridstatr %in% 2 ) 
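One optional sanity check on that merge, a sketch. Because each pooled input table contains at most one record per seqn, the full outer join should still hold one row per person: stopifnot( anyDuplicated( nhanes_full_df[ , 'seqn' ] ) == 0 ) 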
Scale the mobile examination component two-year weight to generalize to the pooled, four year period: nhanes_df[ , 'wtmec4yr' ] <- nhanes_df[ , 'wtmec2yr' ] / 2 Save Locally   Save the object at any point: # nhanes_fn <- file.path( path.expand( "~" ) , "NHANES" , "this_file.rds" ) # saveRDS( nhanes_df , file = nhanes_fn , compress = FALSE ) Load the same object: # nhanes_df <- readRDS( nhanes_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nhanes_design <- svydesign( id = ~ sdmvpsu , strata = ~ sdmvstra , nest = TRUE , weights = ~ wtmec4yr , data = nhanes_df ) Variable Recoding Add new columns to the data set: nhanes_design <- update( nhanes_design , one = 1 , # define high total cholesterol as 1 if mg/dL is at or above 240 and zero otherwise. hi_tchol = ifelse( lbxtc >= 240 , 1 , 0 ) , gender = factor( riagendr , levels = 1:2 , labels = c( 'male' , 'female' ) ) , age_categories = factor( 1 + findInterval( ridageyr , c( 20 , 40 , 60 ) ) , levels = 1:4 , labels = c( "0-19" , "20-39" , "40-59" , "60+" ) ) , # recode the ridreth3 variable as: # mexican american and other hispanic -> 4 # non-hispanic white -> 1 # non-hispanic black -> 2 # non-hispanic asian -> 3 # other race including multi-racial -> 5 race_ethnicity = factor( c( 4 , 4 , 1 , 2 , NA , 3 , 5 )[ ridreth3 ] , levels = 1:5 , labels = c( 'nh white' , 'nh black' , 'nh asian' , 'hispanic' , 'other' ) ) , pregnant_at_interview = ifelse( ridexprg %in% 1:2 , as.numeric( ridexprg == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nhanes_design , "sampling" ) != 0 ) svyby( ~ one , ~ race_ethnicity , nhanes_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nhanes_design ) svyby( ~ one , ~ race_ethnicity , nhanes_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ lbxtc , nhanes_design , na.rm = TRUE ) svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ riagendr , nhanes_design ) svyby( ~ riagendr , ~ race_ethnicity , nhanes_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ lbxtc , nhanes_design , na.rm = TRUE ) svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ riagendr , nhanes_design ) svyby( ~ riagendr , ~ race_ethnicity , nhanes_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ lbxtc , nhanes_design , 0.5 , na.rm = TRUE ) svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ lbxtc , denominator = ~ ridageyr , nhanes_design , na.rm = TRUE ) Subsetting Restrict the survey design to respondents aged 60 or older: sub_nhanes_design <- subset( nhanes_design , age_categories == "60+" ) Calculate the mean (average) of this subset: svymean( ~ lbxtc , sub_nhanes_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ lbxtc , 
nhanes_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ lbxtc , ~ race_ethnicity , nhanes_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nhanes_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ lbxtc , nhanes_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ lbxtc , nhanes_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ lbxtc , nhanes_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ pregnant_at_interview , nhanes_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( lbxtc ~ pregnant_at_interview , nhanes_design ) Perform a chi-squared test of association for survey data: svychisq( ~ pregnant_at_interview + riagendr , nhanes_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( lbxtc ~ pregnant_at_interview + riagendr , nhanes_design ) summary( glm_result ) Direct Method of Age-Adjustment Replication Example This example matches the total cholesterol statistics and standard errors in Table 1 from Data Brief 363: Match the crude estimates in the footnote and also in the unadjusted age categories: crude_overall <- svymean( ~ hi_tchol , subset( nhanes_design , ridageyr >= 20 ) , na.rm = TRUE ) stopifnot( round( coef( crude_overall ) , 3 ) == 0.115 ) crude_by_gender <- svyby( ~ hi_tchol , ~ gender , subset( nhanes_design , ridageyr >= 20 ) , svymean , na.rm = TRUE ) stopifnot( round( coef( crude_by_gender )[ 1 ] , 3 ) == 0.103 ) stopifnot( round( coef( crude_by_gender )[ 2 ] , 3 ) == 0.126 ) crude_by_age <- svyby( ~ hi_tchol , ~ age_categories , subset( nhanes_design , ridageyr >= 20 ) , svymean , na.rm = TRUE ) stopifnot( round( coef( crude_by_age )[ 1 ] , 3 ) == 0.075 ) stopifnot( round( coef( crude_by_age )[ 2 ] , 3 ) == 0.157 ) stopifnot( round( coef( crude_by_age )[ 3 ] , 3 ) == 0.114 ) stopifnot( round( SE( crude_by_age )[ 1 ] , 3 ) == 0.005 ) stopifnot( round( SE( crude_by_age )[ 2 ] , 3 ) == 0.011 ) stopifnot( round( SE( crude_by_age )[ 3 ] , 3 ) == 0.008 ) Sum up 2000 Census totals based on the age groupings specified in footnote: pop_by_age <- data.frame( age_categories = c( "0-19" , "20-39" , "40-59" , "60+" ) , Freq = c( 78782657 , 77670618 , 72816615 , 45363752 ) ) Create a design with the nationwide population stratified to the above census counts: nhanes_age_adjusted <- postStratify( subset( nhanes_design , !is.na( hi_tchol ) ) , ~ age_categories , pop_by_age ) Match the overall adjusted estimates: results_overall <- svymean( ~ hi_tchol , subset( nhanes_age_adjusted , ridageyr >= 20 ) , na.rm = TRUE ) stopifnot( round( coef( results_overall ) , 3 ) == 0.114 ) stopifnot( round( SE( results_overall ) , 3 ) == 0.006 ) Create a design stratified to census counts broken out by gender, then match those estimates: nhanes_by_gender <- svystandardize( nhanes_design , by = ~ age_categories , # stratification variable over = ~ gender , # break out variable population = pop_by_age , # data.frame containing census populations excluding.missing = ~ hi_tchol # analysis 
variable of interest ) results_by_gender <- svyby( ~ hi_tchol , ~ gender , subset( nhanes_by_gender , ridageyr >= 20 ) , svymean , na.rm=TRUE ) stopifnot( round( coef( results_by_gender )[ 1 ] , 3 ) == 0.105 ) stopifnot( round( coef( results_by_gender )[ 2 ] , 3 ) == 0.121 ) stopifnot( round( SE( results_by_gender )[ 1 ] , 3 ) == 0.007 ) stopifnot( round( SE( results_by_gender )[ 2 ] , 3 ) == 0.008 ) Create a design stratified to census counts broken out by race/ethnicity, then match those estimates: nhanes_by_race <- svystandardize( nhanes_design , by = ~ age_categories , # stratification variable over = ~ race_ethnicity , # break out variable population = pop_by_age , # data.frame containing census populations excluding.missing = ~ hi_tchol # analysis variable of interest ) results_by_race_ethnicity <- svyby( ~ hi_tchol , ~ race_ethnicity , design = subset( nhanes_by_race , ridageyr >= 20 ) , svymean , na.rm=TRUE ) stopifnot( round( coef( results_by_race_ethnicity )[ 1 ] , 3 ) == 0.117 ) stopifnot( round( coef( results_by_race_ethnicity )[ 2 ] , 3 ) == 0.100 ) stopifnot( round( coef( results_by_race_ethnicity )[ 3 ] , 3 ) == 0.116 ) stopifnot( round( coef( results_by_race_ethnicity )[ 4 ] , 3 ) == 0.109 ) stopifnot( round( SE( results_by_race_ethnicity )[ 1 ] , 3 ) == 0.007 ) stopifnot( round( SE( results_by_race_ethnicity )[ 2 ] , 3 ) == 0.009 ) stopifnot( round( SE( results_by_race_ethnicity )[ 3 ] , 3 ) == 0.011 ) stopifnot( round( SE( results_by_race_ethnicity )[ 4 ] , 3 ) == 0.009 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NHANES users, this code replicates previously-presented examples: library(srvyr) nhanes_srvyr_design <- as_survey( nhanes_design ) Calculate the mean (average) of a linear variable, overall and by groups: nhanes_srvyr_design %>% summarize( mean = survey_mean( lbxtc , na.rm = TRUE ) ) nhanes_srvyr_design %>% group_by( race_ethnicity ) %>% summarize( mean = survey_mean( lbxtc , na.rm = TRUE ) ) "],["national-health-interview-survey-nhis.html", "National Health Interview Survey (NHIS) Recommended Reading Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " National Health Interview Survey (NHIS) America’s most detailed household survey of health status and medical experience. One table with one row per sampled adult (18+) within each sampled household, one table with one row per sample child (when available, same family not required), multiply-imputed income tables. A complex sample survey designed to generalize to the U.S. civilian non-institutionalized population. Released annually since 1963, the most recent major re-design in 2019. Conducted by the National Center for Health Statistics at the Centers for Disease Control. 
Recommended Reading Four Example Strengths & Limitations: ✔️ PRICSSA ✔️ Annual and rotating core questionnaires allow trend analysis ❌ High rate of missingness for family income questions ❌ 2019 redesign interviews only one adult and one child (if present) per household Three Example Findings: In 2022, 14% of US seniors met both aerobic and muscle-strengthening physical activity guidelines. Adults living alone in 2021 reported feelings of depression more often than those living with others. Among US adults aged 18+ in 2022, 3% were categorized as having severe anxiety symptoms. Two Methodology Documents: 2022 Survey Description Wikipedia Entry One Haiku: # excellent health poor # wealth. "sup, doc?" bugs, daft bills, free # laughs best medicine Function Definitions Define a function to download, unzip, and import each comma-separated value file: nhis_csv_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) this_csv <- grep( '\\\\.csv$' , unzipped_files , value = TRUE ) this_df <- read.csv( this_csv ) file.remove( c( this_tf , unzipped_files ) ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the sample adult interview and imputed income files: nhis_df <- nhis_csv_import( "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adult21csv.zip" ) imputed_income_df <- nhis_csv_import( "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2021/adultinc21csv.zip" ) Save Locally   Save the object at any point: # nhis_fn <- file.path( path.expand( "~" ) , "NHIS" , "this_file.rds" ) # saveRDS( nhis_df , file = nhis_fn , compress = FALSE ) Load the same object: # nhis_df <- readRDS( nhis_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Reshape the imputed income data.frame into a list based on the implicate number: imputed_income_list <- split( imputed_income_df , imputed_income_df[ , 'impnum_a' ] ) Remove overlapping columns except the merge variable: variables_to_remove <- setdiff( intersect( names( nhis_df ) , names( imputed_income_df ) ) , 'hhx' ) nhis_df <- nhis_df[ , !( names( nhis_df ) %in% variables_to_remove ) ] Merge each implicate onto the sample adult table: nhis_list <- lapply( imputed_income_list , function( w ){ this_df <- merge( nhis_df , w ) stopifnot( nrow( this_df ) == nrow( nhis_df ) ) this_df } ) Define the design: library(survey) library(mitools) nhis_design <- svydesign( id = ~ ppsu , strata = ~ pstrat , nest = TRUE , weights = ~ wtfa_a , data = imputationList( nhis_list ) ) Variable Recoding Add new columns to the data set: nhis_design <- update( nhis_design , one = 1 , poverty_category = factor( findInterval( povrattc_a , c( 1 , 2 , 4 ) ) , labels = c( "below poverty" , "100-199%" , "200-399%" , "400%+" ) ) , fair_or_poor_reported_health = ifelse( phstat_a %in% 1:5 , as.numeric( phstat_a >= 4 ) , NA ) , sex_a = factor( sex_a , levels = 1:2 , labels = c( "male" , "female" ) ) , annual_premium_first_plan = ifelse( hicostr1_a > 40000 , NA , hicostr1_a ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( nhis_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( nhis_design , svyby( ~ one , ~ poverty_category , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: 
MIcombine( with( nhis_design , svytotal( ~ one ) ) ) MIcombine( with( nhis_design , svyby( ~ one , ~ poverty_category , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( nhis_design , svymean( ~ agep_a ) ) ) MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( nhis_design , svymean( ~ sex_a ) ) ) MIcombine( with( nhis_design , svyby( ~ sex_a , ~ poverty_category , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( nhis_design , svytotal( ~ agep_a ) ) ) MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( nhis_design , svytotal( ~ sex_a ) ) ) MIcombine( with( nhis_design , svyby( ~ sex_a , ~ poverty_category , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: MIcombine( with( nhis_design , svyquantile( ~ agep_a , 0.5 , se = TRUE ) ) ) MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: MIcombine( with( nhis_design , svyratio( numerator = ~ annual_premium_first_plan , denominator = ~ agep_a , na.rm = TRUE ) ) ) Subsetting Restrict the survey design to uninsured: sub_nhis_design <- subset( nhis_design , notcov_a == 1 ) Calculate the mean (average) of this subset: MIcombine( with( sub_nhis_design , svymean( ~ agep_a ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( nhis_design , svymean( ~ agep_a ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( nhis_design , svyby( ~ agep_a , ~ poverty_category , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nhis_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: MIcombine( with( nhis_design , svyvar( ~ agep_a ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( nhis_design , svymean( ~ agep_a , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( nhis_design , svymean( ~ agep_a , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: # MIsvyciprop( ~ fair_or_poor_reported_health , nhis_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( agep_a ~ fair_or_poor_reported_health , nhis_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ fair_or_poor_reported_health + sex_a , nhis_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( nhis_design , svyglm( agep_a ~ fair_or_poor_reported_health + sex_a ) ) ) summary( glm_result ) Replication Example This example matches statistics and standard errors within 0.01% from Figure 3 of this Characteristics of Adults Aged 18–64 Who Did Not Take Medication as Prescribed to Reduce Costs Data Brief: results <- MIcombine( with( subset( nhis_design , agep_a < 65 ) , svyby( ~ as.numeric( rxsk12m_a == 1 | rxls12m_a == 1 | rxdl12m_a == 1 ) , ~ poverty_category , svymean , na.rm = TRUE ) ) ) stopifnot( all( as.numeric( round( coef( results ) , 3 ) ) == c( 0.145 , 0.138 , 0.099 , 0.039 ) ) ) stopifnot( all( as.numeric( round( SE( results ) , 5 ) ) - c( 0.0126 , 0.0098 , 0.0062 , 0.0031 ) < 0.0001 ) ) "],["national-household-travel-survey-nhts.html", "National Household Travel Survey (NHTS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Household Travel Survey (NHTS) The authoritative source on travel behavior, recording characteristics of people and vehicles of all modes. Four core linkable tables, with one record per household, person, trip, and vehicle, respectively. A complex sample survey designed to generalize to the civilian non-institutional U.S. population. Released every five to eight years since 1969. Funded by the Federal Highway Administration, with data collected by Ipsos Public Affairs. Recommended Reading Four Example Strengths & Limitations: ✔️ Origin-Destination passively collected data complement traditional household survey ✔️ Sample supports analysis of metro areas within census divisions ❌ 2022 redesign uses retrospective recorded travel day (1 day prior) rather than travel log ❌ Long-distance trip questions do not estimate respondent’s annual behavior or volume Three Example Findings: Online-purchased home deliveries grew over 2017-2022, from 2.5 to 5.4 per person per month. In 2022, 53% of K-12 students were dropped off at school in a private vehicle or drove themselves. Nearly 9 in 10 US households had a vehicle available to drive in 2022. Two Methodology Documents: 2022 NHTS Data User Guide 2022 NHTS Weighting Memo One Haiku: # commuter patterns, # truckin'. 
what a long strange trip # who went when where why Download, Import, Preparation Download and unzip the 2022 files: library(haven) tf <- tempfile() download.file( "https://nhts.ornl.gov/assets/2022/download/sas.zip" , tf , mode = 'wb' ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import the tables containing one record per household, person, trip, and vehicle: nhts_import <- function( this_prefix , this_unzip ){ this_sas7bdat <- grep( paste0( this_prefix , "\\\\.sas7bdat$" ) , this_unzip , value = TRUE ) this_tbl <- read_sas( this_sas7bdat ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) this_df } hhpub_df <- nhts_import( "hhv2pub" , unzipped_files ) perpub_df <- nhts_import( "perv2pub" , unzipped_files ) trippub_df <- nhts_import( "tripv2pub" , unzipped_files ) vehpub_df <- nhts_import( "vehv2pub" , unzipped_files ) Add a column of ones to three of those tables, then a column of non-missing mileage to the trips table: hhpub_df[ , 'one' ] <- 1 perpub_df[ , 'one' ] <- 1 trippub_df[ , 'one' ] <- 1 trippub_df[ !( trippub_df[ , 'trpmiles' ] %in% -9 ) , 'wtd_tripmiles_no_nines' ] <- trippub_df[ !( trippub_df[ , 'trpmiles' ] %in% -9 ) , 'trpmiles' ] * trippub_df[ !( trippub_df[ , 'trpmiles' ] %in% -9 ) , 'wttrdfin' ] Sum the total trip count and mileage to the person-level, both overall and restricted to walking only: trips_per_person <- with( trippub_df , aggregate( cbind( wttrdfin , wtd_tripmiles_no_nines ) , list( houseid , personid ) , sum , na.rm = TRUE ) ) names( trips_per_person ) <- c( 'houseid' , 'personid' , 'wtd_trips' , 'wtd_miles' ) walks_per_person <- with( subset( trippub_df , trptrans == '20' ) , aggregate( cbind( wttrdfin , wtd_tripmiles_no_nines ) , list( houseid , personid ) , sum , na.rm = TRUE ) ) names( walks_per_person ) <- c( 'houseid' , 'personid' , 'wtd_walks' , 'wtd_walk_miles' ) Merge these trip count and mileage values on to the person-level file, replacing non-matches with zero: nhts_df <- merge( perpub_df , trips_per_person , all.x = TRUE ) nhts_df <- merge( nhts_df , walks_per_person , all.x = TRUE ) for( this_variable in c( 'wtd_trips' , 'wtd_miles' , 'wtd_walks' , 'wtd_walk_miles' ) ){ nhts_df[ is.na( nhts_df[ , this_variable ] ) , this_variable ] <- 0 } stopifnot( nrow( nhts_df ) == nrow( perpub_df ) ) Save Locally   Save the object at any point: # nhts_fn <- file.path( path.expand( "~" ) , "NHTS" , "this_file.rds" ) # saveRDS( nhts_df , file = nhts_fn , compress = FALSE ) Load the same object: # nhts_df <- readRDS( nhts_fn ) Survey Design Definition Construct a complex sample survey design: Define household-level, person-level, and trip-level designs: library(survey) hh_design <- svydesign( id = ~ houseid , strata = ~ stratumid , data = hhpub_df , weights = ~ wthhfin ) nhts_design <- svydesign( id = ~ houseid , strata = ~ stratumid , data = nhts_df , weights = ~ wtperfin ) trip_design <- svydesign( id = ~ houseid , strata = ~ stratumid , data = trippub_df , weights = ~ wttrdfin ) Variable Recoding Add new columns to the data set: hh_design <- update( hh_design , hhsize_categories = factor( findInterval( hhsize , 1:4 ) , levels = 1:4 , labels = c( 1:3 , '4 or more' ) ) ) nhts_design <- update( nhts_design , urban_area = as.numeric( urbrur == '01' ) , daily_person_trips = ( wtd_trips / ( 365 * wtperfin ) ) , daily_person_miles_of_travel = ( wtd_miles / ( 365 * wtperfin ) ) , daily_person_walks = ( wtd_walks / ( 365 * wtperfin ) ) , daily_person_walk_miles_of_travel = ( wtd_walk_miles / ( 365 * wtperfin ) ) , 
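# each daily_person_* rate above divides the annual weighted total by the person weight times 365 days 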
work_status = factor( as.numeric( worker ) , levels = 2:1 , labels = c( 'non-worker' , 'worker' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nhts_design , "sampling" ) != 0 ) svyby( ~ one , ~ r_sex_imp , nhts_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nhts_design ) svyby( ~ one , ~ r_sex_imp , nhts_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ daily_person_walks , nhts_design ) svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ work_status , nhts_design , na.rm = TRUE ) svyby( ~ work_status , ~ r_sex_imp , nhts_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ daily_person_walks , nhts_design ) svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ work_status , nhts_design , na.rm = TRUE ) svyby( ~ work_status , ~ r_sex_imp , nhts_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ daily_person_walks , nhts_design , 0.5 ) svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ daily_person_walk_miles_of_travel , denominator = ~ daily_person_miles_of_travel , nhts_design ) Subsetting Restrict the survey design to individuals who have used a bicycle in last 30 days: sub_nhts_design <- subset( nhts_design , last30_bike == '01' ) Calculate the mean (average) of this subset: svymean( ~ daily_person_walks , sub_nhts_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ daily_person_walks , nhts_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ daily_person_walks , ~ r_sex_imp , nhts_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nhts_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ daily_person_walks , nhts_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ daily_person_walks , nhts_design , deff = TRUE ) # SRS with replacement svymean( ~ daily_person_walks , nhts_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ urban_area , nhts_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( daily_person_walks ~ urban_area , nhts_design ) Perform a chi-squared test of association for survey data: svychisq( ~ urban_area + work_status , nhts_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( daily_person_walks ~ urban_area + work_status , nhts_design ) summary( glm_result ) Replication Example This example matches the 2022 Household Size counts from Table 2-1: hhsize_counts <- svytotal( ~ hhsize_categories , hh_design ) stopifnot( all( round( coef( hhsize_counts ) / 1000 , 0 ) == c( 36409 , 44751 , 19001 , 27384 ) ) ) hhsize_ci <- confint( hhsize_counts ) hhsize_moe <- hhsize_ci[ , 2 ] - coef( hhsize_counts ) stopifnot( all( round( hhsize_moe / 1000 , 0 ) == c( 1807 , 1760 , 1448 , 1742 ) ) ) This example matches the 2022 Average Daily Person Trips per Person from Table 2-9: this_mean <- svymean( ~ daily_person_trips , nhts_design ) stopifnot( round( coef( this_mean ) , 2 ) == 2.28 ) this_ci <- confint( this_mean ) this_moe <- this_ci[ , 2 ] - coef( this_mean ) stopifnot( round( this_moe , 2 ) == 0.06 ) This example matches the 2022 Average Daily PMT per Person from Table 2-9: this_mean <- svymean( ~ daily_person_miles_of_travel , nhts_design ) stopifnot( round( coef( this_mean ) , 2 ) == 28.55 ) this_ci <- confint( this_mean ) this_moe <- this_ci[ , 2 ] - coef( this_mean ) stopifnot( round( this_moe , 2 ) == 2.39 ) This example matches the 2022 Average Person Trip Length (Miles) from Table 2-9: this_mean <- svymean( ~ trpmiles , subset( trip_design , trpmiles >= 0 ) ) stopifnot( round( coef( this_mean ) , 2 ) == 12.56 ) this_ci <- confint( this_mean ) this_moe <- this_ci[ , 2 ] - coef( this_mean ) stopifnot( round( this_moe , 2 ) == 1.04 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NHTS users, this code replicates previously-presented examples: library(srvyr) nhts_srvyr_design <- as_survey( nhts_design ) Calculate the mean (average) of a linear variable, overall and by groups: nhts_srvyr_design %>% summarize( mean = survey_mean( daily_person_walks ) ) nhts_srvyr_design %>% group_by( r_sex_imp ) %>% summarize( mean = survey_mean( daily_person_walks ) ) "],["national-immunization-survey-nis.html", "National Immunization Survey (NIS) Recommended Reading Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Immunization Survey (NIS) The vaccination coverage rate tracker for national, state, and selected local areas. One table with one row per sampled toddler. A complex sample survey designed to generalize to children aged 19-35 months in the United States. Released annually since 1995, plus an adolescent (13-17 years) sample since 2008. Administered by the Centers for Disease Control and Prevention. 
Recommended Reading Four Example Strengths & Limitations: ✔️ Both parents and medical providers interviewed ✔️ Detailed health insurance questions ❌ Low household response rates and provider data completed for only about half of children during 2019–2023 ❌ Although national estimates are precise, estimates for state and local areas should be interpreted with caution because their sample sizes are smaller and their confidence intervals wider than national estimates Three Example Findings: In 2014 in the general population in Ohio, vaccination coverage with at least one dose or at least two doses of MMR among young children and adolescents was 96% and 88%, respectively. Completion of a 7-vaccine series by 19 months of age increased from 52% in 2011 to 59% in 2021. HPV vaccination initiation by age 13 rose from 27% to 70% among those born in 1999 versus 2009. Two Methodology Documents: About NIS National Immunization Survey-Child: A User’s Guide for the 2023 Public-Use Data File One Haiku: # i hear babies cry # protesting lungs of iron # a wonderful world Download, Import, Preparation Download the 2023 fixed-width file: dat_tf <- tempfile() dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.DAT" download.file( dat_url , dat_tf , mode = 'wb' ) Edit then execute the import script provided by the CDC: library(Hmisc) r_tf <- tempfile() r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.R" r_input_lines <- readLines( r_script_url ) # do not let the script do the save() r_input_lines <- gsub( "^save\\\\(" , "# save(" , r_input_lines ) # redirect the path to the flat file to the local save location of `dat_tf` r_input_lines <- gsub( '\\\\"path\\\\-to\\\\-file\\\\/(.*)\\\\.DAT\\\\"' , "dat_tf" , r_input_lines ) # save the edited script locally writeLines( r_input_lines , r_tf ) # run the edited script source( r_tf , echo = TRUE ) # rename the resultant data.frame object nis_df <- NISPUF23 names( nis_df ) <- tolower( names( nis_df ) ) nis_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nis_fn <- file.path( path.expand( "~" ) , "NIS" , "this_file.rds" ) # saveRDS( nis_df , file = nis_fn , compress = FALSE ) Load the same object: # nis_df <- readRDS( nis_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) nis_design <- svydesign( id = ~ seqnumhh , strata = ~ stratum , weights = ~ provwt_c , data = subset( nis_df , provwt_c > 0 ) ) Variable Recoding Add new columns to the data set: nis_design <- update( nis_design , first_fed_formula = ifelse( bf_formr20 %in% 888 , NA , bf_formr20 ) , dtap_3p = as.numeric( ( p_numdah >= 3 ) | ( p_numdhi >= 3 ) | ( p_numdih >= 3 ) | ( p_numdta >= 3 ) | ( p_numdtp >= 3 ) ) , dtap_4p = as.numeric( ( p_numdah >= 4 ) | ( p_numdhi >= 4 ) | ( p_numdih >= 4 ) | ( p_numdta >= 4 ) | ( p_numdtp >= 4 ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nis_design , "sampling" ) != 0 ) svyby( ~ one , ~ state , nis_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nis_design ) svyby( ~ one , ~ state , nis_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ first_fed_formula , nis_design , na.rm = TRUE ) svyby( ~ first_fed_formula , ~ state , nis_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable,
overall and by groups: svymean( ~ sex , nis_design , na.rm = TRUE ) svyby( ~ sex , ~ state , nis_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ first_fed_formula , nis_design , na.rm = TRUE ) svyby( ~ first_fed_formula , ~ state , nis_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ sex , nis_design , na.rm = TRUE ) svyby( ~ sex , ~ state , nis_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ first_fed_formula , nis_design , 0.5 , na.rm = TRUE ) svyby( ~ first_fed_formula , ~ state , nis_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ bf_exclr06 , denominator = ~ bf_endr06 , nis_design , na.rm = TRUE ) Subsetting Restrict the survey design to toddlers up to date on polio shots: sub_nis_design <- subset( nis_design , p_utdpol == 1 ) Calculate the mean (average) of this subset: svymean( ~ first_fed_formula , sub_nis_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ first_fed_formula , nis_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ first_fed_formula , ~ state , nis_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nis_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ first_fed_formula , nis_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ first_fed_formula , nis_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ first_fed_formula , nis_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ dtap_3p , nis_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( first_fed_formula ~ dtap_3p , nis_design ) Perform a chi-squared test of association for survey data: svychisq( ~ dtap_3p + sex , nis_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( first_fed_formula ~ dtap_3p + sex , nis_design ) summary( glm_result ) Replication Example This example matches the statistics and standard errors from Data User’s Guide Table 4: results <- svyby( ~ p_utd431h314_rout_s , ~ raceethk , nis_design , svymean ) coefficients <- results[ , "p_utd431h314_rout_sUTD" , drop = FALSE ] standard_errors <- results[ , "se.p_utd431h314_rout_sUTD" , drop = FALSE ] stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .674 ) stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .716 ) stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .666 ) stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .017 ) stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .008 ) stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .023 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NIS users, this code replicates previously-presented examples: library(srvyr) nis_srvyr_design <- as_survey( nis_design ) Calculate the mean (average) of a linear variable, overall and by groups: nis_srvyr_design %>% summarize( mean = survey_mean( first_fed_formula , na.rm = TRUE ) ) nis_srvyr_design %>% group_by( state ) %>% summarize( mean = survey_mean( first_fed_formula , na.rm = TRUE ) ) "],["national-plan-and-provider-enumeration-system-nppes.html", "National Plan and Provider Enumeration System (NPPES) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " National Plan and Provider Enumeration System (NPPES) The registry of every medical practitioner actively operating in the United States healthcare industry. A single large table with one row per enumerated health care provider. A census of individuals and organizations that bill for medical services in the United States. Updated weekly with new providers. Maintained by the United States Centers for Medicare & Medicaid Services (CMS) Recommended Reading Two Methodology Documents: NPI: What You Need To Know Wikipedia Entry One Haiku: # how many doctors # ranked sergeant, last name pepper # practice in the states? 
Download, Import, Preparation Download and import the national file: library(readr) tf <- tempfile() npi_datapage <- readLines( "http://download.cms.gov/nppes/NPI_Files.html" ) latest_files <- grep( 'NPPES_Data_Dissemination_' , npi_datapage , value = TRUE ) latest_files <- latest_files[ !grepl( 'Weekly Update' , latest_files ) ] this_url <- paste0( "http://download.cms.gov/nppes/", gsub( "(.*)(NPPES_Data_Dissemination_.*\\\\.zip)(.*)$", "\\\\2", latest_files ) ) download.file( this_url , tf , mode = 'wb' ) npi_files <- unzip( tf , exdir = tempdir() ) npi_filepath <- grep( "npidata_pfile_20050523-([0-9]+)\\\\.csv" , npi_files , value = TRUE ) column_names <- names( read.csv( npi_filepath , nrow = 1 )[ FALSE , , ] ) column_names <- gsub( "\\\\." , "_" , tolower( column_names ) ) column_types <- ifelse( grepl( "code" , column_names ) & !grepl( "country|state|gender|taxonomy|postal" , column_names ) , 'n' , 'c' ) columns_to_import <- c( "entity_type_code" , "provider_gender_code" , "provider_enumeration_date" , "is_sole_proprietor" , "provider_business_practice_location_address_state_name" ) stopifnot( all( columns_to_import %in% column_names ) ) # readr::read_csv() columns must match their order in the csv file columns_to_import <- columns_to_import[ order( match( columns_to_import , column_names ) ) ] nppes_tbl <- readr::read_csv( npi_filepath , col_names = columns_to_import , col_types = paste0( ifelse( column_names %in% columns_to_import , column_types , '_' ) , collapse = "" ) , skip = 1 ) nppes_df <- data.frame( nppes_tbl ) Save Locally   Save the object at any point: # nppes_fn <- file.path( path.expand( "~" ) , "NPPES" , "this_file.rds" ) # saveRDS( nppes_df , file = nppes_fn , compress = FALSE ) Load the same object: # nppes_df <- readRDS( nppes_fn ) Variable Recoding Add new columns to the data set: nppes_df <- transform( nppes_df , individual = as.numeric( entity_type_code ) , provider_enumeration_year = as.numeric( substr( provider_enumeration_date , 7 , 10 ) ) , state_name = provider_business_practice_location_address_state_name ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( nppes_df ) table( nppes_df[ , "provider_gender_code" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( nppes_df[ , "is_sole_proprietor" ] ) ) prop.table( table( nppes_df[ , c( "is_sole_proprietor" , "provider_gender_code" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( nppes_df[ , "provider_enumeration_year" ] , 0.5 , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to California: sub_nppes_df <- subset( nppes_df , state_name == 'CA' ) Calculate the mean (average) of this subset: mean( sub_nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) Measures of
Uncertainty Calculate the variance, overall and by groups: var( nppes_df[ , "provider_enumeration_year" ] , na.rm = TRUE ) tapply( nppes_df[ , "provider_enumeration_year" ] , nppes_df[ , "provider_gender_code" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( provider_enumeration_year ~ individual , nppes_df ) Perform a chi-squared test of association: this_table <- table( nppes_df[ , c( "individual" , "is_sole_proprietor" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( provider_enumeration_year ~ individual + is_sole_proprietor , data = nppes_df ) summary( glm_result ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for NPPES users, this code replicates previously-presented examples: library(dplyr) nppes_tbl <- as_tibble( nppes_df ) Calculate the mean (average) of a linear variable, overall and by groups: nppes_tbl %>% summarize( mean = mean( provider_enumeration_year , na.rm = TRUE ) ) nppes_tbl %>% group_by( provider_gender_code ) %>% summarize( mean = mean( provider_enumeration_year , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for NPPES users, this code replicates previously-presented examples: library(data.table) nppes_dt <- data.table( nppes_df ) Calculate the mean (average) of a linear variable, overall and by groups: nppes_dt[ , mean( provider_enumeration_year , na.rm = TRUE ) ] nppes_dt[ , mean( provider_enumeration_year , na.rm = TRUE ) , by = provider_gender_code ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for NPPES users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'nppes' , nppes_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( provider_enumeration_year ) FROM nppes' ) dbGetQuery( con , 'SELECT provider_gender_code , AVG( provider_enumeration_year ) FROM nppes GROUP BY provider_gender_code' ) "],["national-survey-of-childrens-health-nsch.html", "National Survey of Children’s Health (NSCH) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " National Survey of Children’s Health (NSCH) Estimates of children’s health care and family environments to inform reports like Healthy People 2030. One screener table with one row per eligible child (1+ rows per household), one topical table with the sampled child (only one row per household) from three stacked age-specific questionnaires. 
A complex sample survey designed to generalize to non-institutionalized U.S. children under 18. Released every four or five years since 2003, annually since 2016. Sponsored by the Maternal and Child Health Bureau, Health Resources and Services Administration. Please skim before you begin: 2021 National Survey of Children’s Health Methodology Report 2021 National Survey of Children’s Health Data Users Frequently Asked Questions (FAQs) A haiku regarding this microdata: # "age but a number" # lied babe from crib. "your nose grows" # cried gramps changing bib Function Definitions Define a function to download, unzip, and import each Stata file: library(haven) nsch_stata_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) this_stata <- grep( '\\\\.dta$' , unzipped_files , value = TRUE ) this_tbl <- read_stata( this_stata ) this_df <- data.frame( this_tbl ) file.remove( c( this_tf , unzipped_files ) ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the screener and topical files: nsch_screener_url <- "https://www2.census.gov/programs-surveys/nsch/datasets/2021/nsch_2021_screener_Stata.zip" nsch_topical_url <- "https://www2.census.gov/programs-surveys/nsch/datasets/2021/nsch_2021_topical_Stata.zip" nsch_screener_df <- nsch_stata_import( nsch_screener_url ) nsch_df <- nsch_stata_import( nsch_topical_url ) Save Locally   Save the object at any point: # nsch_fn <- file.path( path.expand( "~" ) , "NSCH" , "this_file.rds" ) # saveRDS( nsch_df , file = nsch_fn , compress = FALSE ) Load the same object: # nsch_df <- readRDS( nsch_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Remove the fpl columns from the main data.frame: fpl_columns <- grep( '^fpl_i[0-9]' , names( nsch_df ) , value = TRUE ) fpl_wide_df <- nsch_df[ c( 'hhid' , fpl_columns ) ] nsch_df[ fpl_columns ] <- NULL Reshape the fpl columns from wide to long: fpl_long_df <- reshape( fpl_wide_df , varying = list( fpl_columns ) , direction = 'long' , timevar = 'implicate' , idvar = 'hhid' ) names( fpl_long_df )[ ncol( fpl_long_df ) ] <- 'fpl' Merge the fpl table with multiple records per child onto the main table: nsch_long_df <- merge( nsch_df , fpl_long_df ) stopifnot( nrow( nsch_long_df ) == nrow( fpl_long_df ) ) stopifnot( nrow( nsch_long_df ) / length( fpl_columns ) == nrow( nsch_df ) ) Split the merged data.frame into a list based on the implicate number: nsch_list <- split( nsch_long_df , nsch_long_df[ , 'implicate' ] ) Define the design: library(survey) library(mitools) nsch_design <- svydesign( id = ~ 1 , strata = ~ fipsst + stratum , weights = ~ fwc , data = imputationList( nsch_list ) , nest = TRUE ) Variable Recoding Add new columns to the data set: nsch_design <- update( nsch_design , one = 1 , state_name = factor( fipsst , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
"Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming") ) , overall_health = factor( c( 1 , 1 , 2 , 3 , 3 )[ k2q01 ] , levels = 1:3 , labels = c( 'excellent or very good' , 'good' , 'fair or poor' ) ) , poverty_categories = factor( 1 + findInterval( fpl , c( 100 , 200 , 400 ) ) , labels = c( "below poverty" , "100-199% fpl" , "200-399% fpl" , "400%+ fpl" ) ) , under_six_ever_breastfed = as.numeric( k6q40 == 1 ) , sc_sex = factor( ifelse( sc_sex %in% 1:2 , sc_sex , NA ) , labels = c( "male" , "female" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( nsch_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( nsch_design , svyby( ~ one , ~ state_name , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: MIcombine( with( nsch_design , svytotal( ~ one ) ) ) MIcombine( with( nsch_design , svyby( ~ one , ~ state_name , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( nsch_design , svymean( ~ sc_age_years ) ) ) MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( nsch_design , svymean( ~ poverty_categories ) ) ) MIcombine( with( nsch_design , svyby( ~ poverty_categories , ~ state_name , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( nsch_design , svytotal( ~ sc_age_years ) ) ) MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( nsch_design , svytotal( ~ poverty_categories ) ) ) MIcombine( with( nsch_design , svyby( ~ poverty_categories , ~ state_name , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: MIcombine( with( nsch_design , svyquantile( ~ sc_age_years , 0.5 , se = TRUE ) ) ) MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: MIcombine( with( nsch_design , svyratio( numerator = ~ liveusa_yr , denominator = ~ sc_age_years , na.rm = TRUE ) ) ) Subsetting Restrict the survey design to only children: sub_nsch_design <- subset( nsch_design , agepos4 == 1 ) Calculate the mean (average) of this subset: MIcombine( with( sub_nsch_design , svymean( ~ sc_age_years ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( nsch_design , svymean( ~ sc_age_years ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( nsch_design , svyby( ~ sc_age_years , ~ state_name , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nsch_design$designs[[1]] ) Calculate the complex 
sample survey-adjusted variance of any statistic: MIcombine( with( nsch_design , svyvar( ~ sc_age_years ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( nsch_design , svymean( ~ sc_age_years , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( nsch_design , svymean( ~ sc_age_years , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ under_six_ever_breastfed , nsch_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( sc_age_years ~ under_six_ever_breastfed , nsch_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ under_six_ever_breastfed + poverty_categories , nsch_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( nsch_design , svyglm( sc_age_years ~ under_six_ever_breastfed + poverty_categories ) ) ) summary( glm_result ) Replication Example As noted in the bold red footnotes of their published table, this technique is not correct and should not be used. The technical documents recommend a method matching the MIcombine syntax shown above. Nonetheless, this code matches statistics and confidence intervals within 0.5% from the Excellent or very good column of Indicator 1.1: In general, how would you describe this child’s health?: results <- svyby( ~ as.numeric( overall_health == 'excellent or very good' ) , ~ poverty_categories , nsch_design$designs[[1]] , svymean , na.rm = TRUE ) published_proportions <- c( 0.833 , 0.859 , 0.907 , 0.955 ) published_lb <- c( 0.810 , 0.838 , 0.894 , 0.949 ) published_ub <- c( 0.854 , 0.878 , 0.919 , 0.961 ) stopifnot( all( abs( round( coef( results ) , 3 ) - published_proportions ) < 0.005 ) ) ( ci_results <- confint( results ) ) stopifnot( all( abs( ci_results[ , 1 ] - published_lb ) < 0.005 ) ) stopifnot( all( abs( ci_results[ , 2 ] - published_ub ) < 0.005 ) ) "],["national-survey-on-drug-use-and-health-nsduh.html", "National Survey on Drug Use and Health (NSDUH) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Survey on Drug Use and Health (NSDUH) The primary survey to measure the prevalence of substance use and its correlates in the United States. One table with one row per sampled respondent. A complex survey designed to generalize to civilian, non-institutionalized Americans aged 12 and older. Released periodically since 1979 and annually since 1990. Administered by the Substance Abuse and Mental Health Services Administration. Please skim before you begin: 2021 National Survey on Drug Use and Health (NSDUH): Public Use File Codebook 2021 National Survey on Drug Use and Health (NSDUH): Methodological Summary and Definitions A haiku regarding this microdata: # drinking and thinking # about your first time, were you # smoking and joking?
Download, Import, Preparation Download and import the national file: zip_tf <- tempfile() zip_url <- paste0( "https://www.datafiles.samhsa.gov/sites/default/files/field-uploads-protected/" , "studies/NSDUH-2021/NSDUH-2021-datasets/NSDUH-2021-DS0001/" , "NSDUH-2021-DS0001-bundles-with-study-info/NSDUH-2021-DS0001-bndl-data-r_v3.zip" ) download.file( zip_url , zip_tf , mode = 'wb' ) nsduh_rdata <- unzip( zip_tf , exdir = tempdir() ) nsduh_rdata_contents <- load( nsduh_rdata ) nsduh_df_name <- grep( 'PUF' , nsduh_rdata_contents , value = TRUE ) nsduh_df <- get( nsduh_df_name ) names( nsduh_df ) <- tolower( names( nsduh_df ) ) nsduh_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nsduh_fn <- file.path( path.expand( "~" ) , "NSDUH" , "this_file.rds" ) # saveRDS( nsduh_df , file = nsduh_fn , compress = FALSE ) Load the same object: # nsduh_df <- readRDS( nsduh_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nsduh_design <- svydesign( id = ~ verep , strata = ~ vestr_c , data = nsduh_df , weights = ~ analwt_c , nest = TRUE ) Variable Recoding Add new columns to the data set: nsduh_design <- update( nsduh_design , one = 1 , health = factor( health , levels = 1:5 , labels = c( "excellent" , "very good" , "good" , "fair" , "poor" ) ) , age_first_cigarette = ifelse( cigtry > 99 , NA , cigtry ) , age_tried_cocaine = ifelse( cocage > 99 , NA , cocage ) , ever_used_marijuana = as.numeric( ifelse( mjever < 4 , mjever == 1 , NA ) ) , county_type = factor( coutyp4 , levels = 1:3 , labels = c( "large metro" , "small metro" , "nonmetro" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nsduh_design , "sampling" ) != 0 ) svyby( ~ one , ~ county_type , nsduh_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nsduh_design ) svyby( ~ one , ~ county_type , nsduh_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ health , nsduh_design , na.rm = TRUE ) svyby( ~ health , ~ county_type , nsduh_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ health , nsduh_design , na.rm = TRUE ) svyby( ~ health , ~ county_type , nsduh_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ age_first_cigarette , nsduh_design , 0.5 , na.rm = TRUE ) svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ age_first_cigarette , denominator = ~ age_tried_cocaine , nsduh_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals who are pregnant: sub_nsduh_design <- subset( nsduh_design , preg == 1 ) Calculate the mean (average) of this subset: svymean( ~ age_first_cigarette , sub_nsduh_design , na.rm = TRUE ) 
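Because subsetting a survey design object retains the full design information, the statistic above is a proper domain estimate. As a cross-check, a minimal sketch (assuming only the preg == 1 coding already used above): the preg == 1 row of this grouped estimate should agree with the subsetted mean and its standard error. svyby( ~ age_first_cigarette , ~ preg , nsduh_design , svymean , na.rm = TRUE )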
Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ age_first_cigarette , ~ county_type , nsduh_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nsduh_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ age_first_cigarette , nsduh_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ age_first_cigarette , nsduh_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ ever_used_marijuana , nsduh_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( age_first_cigarette ~ ever_used_marijuana , nsduh_design ) Perform a chi-squared test of association for survey data: svychisq( ~ ever_used_marijuana + health , nsduh_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( age_first_cigarette ~ ever_used_marijuana + health , nsduh_design ) summary( glm_result ) Replication Example This matches the prevalence and SE of alcohol use in the past month from Codebook Table G.2: result <- svymean( ~ alcmon , nsduh_design ) stopifnot( round( coef( result ) , 3 ) == 0.474 ) stopifnot( round( SE( result ) , 4 ) == 0.0043 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NSDUH users, this code replicates previously-presented examples: library(srvyr) nsduh_srvyr_design <- as_survey( nsduh_design ) Calculate the mean (average) of a linear variable, overall and by groups: nsduh_srvyr_design %>% summarize( mean = survey_mean( age_first_cigarette , na.rm = TRUE ) ) nsduh_srvyr_design %>% group_by( county_type ) %>% summarize( mean = survey_mean( age_first_cigarette , na.rm = TRUE ) ) "],["national-survey-of-family-growth-nsfg.html", "National Survey of Family Growth (NSFG) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Survey of Family Growth (NSFG) The principal survey to measure reproductive behavior in the United States population. Multiple tables with one row per respondent for the female and male tables, then a separate table with one row per pregnancy. A complex sample survey designed to generalize to the 15-49 year old population of the United States, by gender. Released every couple of years since 1973. Administered by the Centers for Disease Control and Prevention. 
Please skim before you begin: Sample Design Documentation Wikipedia Entry A haiku regarding this microdata: # family structure # questions cuz radar fails at # storks with bassinets Download, Import, Preparation library(SAScii) library(readr) dat_url <- "https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NSFG/2017_2019_FemRespData.dat" sas_url <- file.path( dirname( dat_url ) , "sas/2017_2019_FemRespSetup.sas" ) sas_positions <- parse.SAScii( sas_url ) sas_positions[ , 'varname' ] <- tolower( sas_positions[ , 'varname' ] ) sas_positions[ , 'column_types' ] <- ifelse( sas_positions[ , 'char' ] , "c" , "d" ) nsfg_tbl <- read_fwf( dat_url , fwf_widths( abs( sas_positions[ , 'width' ] ) , col_names = sas_positions[ , 'varname' ] ) , col_types = paste0( sas_positions[ , 'column_types' ] , collapse = "" ) , na = c( "" , "." ) ) nsfg_df <- data.frame( nsfg_tbl ) Save Locally   Save the object at any point: # nsfg_fn <- file.path( path.expand( "~" ) , "NSFG" , "this_file.rds" ) # saveRDS( nsfg_df , file = nsfg_fn , compress = FALSE ) Load the same object: # nsfg_df <- readRDS( nsfg_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) nsfg_design <- svydesign( id = ~ secu , strata = ~ sest , data = nsfg_df , weights = ~ wgt2017_2019 , nest = TRUE ) Variable Recoding Add new columns to the data set: nsfg_design <- update( nsfg_design , one = 1 , birth_control_pill = as.numeric( constat1 == 6 ) , age_categories = factor( findInterval( ager , c( 15 , 20 , 25 , 30 , 35 , 40 ) ) , labels = c( '15-19' , '20-24' , '25-29' , '30-34' , '35-39' , '40-49' ) ) , marstat = factor( marstat , levels = c( 1:6 , 8:9 ) , labels = c( "Married to a person of the opposite sex" , "Not married but living together with a partner of the opposite sex" , "Widowed" , "Divorced or annulled" , "Separated, because you and your spouse are not getting along" , "Never been married" , "Refused" , "Don't know" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nsfg_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_categories , nsfg_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nsfg_design ) svyby( ~ one , ~ age_categories , nsfg_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ pregnum , nsfg_design , na.rm = TRUE ) svyby( ~ pregnum , ~ age_categories , nsfg_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ marstat , nsfg_design ) svyby( ~ marstat , ~ age_categories , nsfg_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ pregnum , nsfg_design , na.rm = TRUE ) svyby( ~ pregnum , ~ age_categories , nsfg_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ marstat , nsfg_design ) svyby( ~ marstat , ~ age_categories , nsfg_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ pregnum , nsfg_design , 0.5 , na.rm = TRUE ) svyby( ~ pregnum , ~ age_categories , nsfg_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ pregnum , denominator = ~ lbpregs , nsfg_design , na.rm = TRUE ) Subsetting 
Restrict the survey design to ever cohabited: sub_nsfg_design <- subset( nsfg_design , timescoh > 0 ) Calculate the mean (average) of this subset: svymean( ~ pregnum , sub_nsfg_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ pregnum , nsfg_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ pregnum , ~ age_categories , nsfg_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nsfg_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ pregnum , nsfg_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ pregnum , nsfg_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ pregnum , nsfg_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ birth_control_pill , nsfg_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( pregnum ~ birth_control_pill , nsfg_design ) Perform a chi-squared test of association for survey data: svychisq( ~ birth_control_pill + marstat , nsfg_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( pregnum ~ birth_control_pill + marstat , nsfg_design ) summary( glm_result ) Replication Example This example matches the Variance Estimates for Percentages using SAS (9.4) and STATA (14): Match the sum of the weights: result <- svytotal( ~ one , nsfg_design ) stopifnot( round( coef( result ) , 0 ) == 72671926 ) stopifnot( round( SE( result ) , 0 ) == 3521465 ) Match row percentages of women currently using the pill by age: row_percents <- c( 19.5112 , 23.7833 , 19.6916 , 15.2800 , 6.4965 , 6.5215 ) std_err_row_percents <- c( 1.8670 , 2.1713 , 2.2773 , 1.7551 , 0.9895 , 1.0029 ) results <- svyby( ~ birth_control_pill , ~ age_categories , nsfg_design , svymean ) stopifnot( all( round( coef( results ) * 100 , 4 ) == row_percents ) ) stopifnot( all( round( SE( results ) * 100 , 4 ) == std_err_row_percents ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for NSFG users, this code replicates previously-presented examples: library(srvyr) nsfg_srvyr_design <- as_survey( nsfg_design ) Calculate the mean (average) of a linear variable, overall and by groups: nsfg_srvyr_design %>% summarize( mean = survey_mean( pregnum , na.rm = TRUE ) ) nsfg_srvyr_design %>% group_by( age_categories ) %>% summarize( mean = survey_mean( pregnum , na.rm = TRUE ) ) "],["national-sample-survey-of-registered-nurses-nssrn.html", "National Sample Survey of Registered Nurses (NSSRN) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " National Sample Survey of Registered Nurses (NSSRN) The employment, education, and demographics of the RN, NP, and APRN workforce in the United States. One table with one row per surveyed registered nurse (RN) or nurse practitioner (NP). A complex sample designed to generalize to RNs and NPs at both state and national levels. Released every four to ten years since 1977. Administered by the Health Resources and Services Administration, in partnership with the Census Bureau. Please skim before you begin: 2022 NSSRN Methodology Report Frequently Asked Questions A haiku regarding this microdata: # florence nightingale # founder of modern nursing # a statistician Download, Import, Preparation Download and import the state file: library(haven) nssrn_tf <- tempfile() nssrn_url <- "https://data.hrsa.gov/DataDownload/NSSRN/GeneralPUF22/2022_NSSRN_PUF_Stata_Package.zip" download.file( nssrn_url , nssrn_tf , mode = 'wb' ) nssrn_files <- unzip( nssrn_tf , exdir = tempdir() ) nssrn_dta <- grep( "\\\\.dta$" , nssrn_files , ignore.case = TRUE , value = TRUE ) nssrn_tbl <- read_dta( nssrn_dta ) nssrn_df <- data.frame( nssrn_tbl ) names( nssrn_df ) <- tolower( names( nssrn_df ) ) nssrn_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nssrn_fn <- file.path( path.expand( "~" ) , "NSSRN" , "this_file.rds" ) # saveRDS( nssrn_df , file = nssrn_fn , compress = FALSE ) Load the same object: # nssrn_df <- readRDS( nssrn_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) nssrn_design <- svrepdesign( weight = ~rkrnwgta , repweights = 'rkrnwgta[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = nssrn_df ) Variable Recoding Add new columns to the data set: nssrn_design <- update( nssrn_design , # all advanced practice registered nurses # (including nurse practitioners) all_aprn = as.numeric( ed_lcrn == 2 ) , age_group = factor( findInterval( age_gp_puf , c( 0 , 3 , 5 , 7 , 9 ) ) , levels = 1:5 , labels = c( '34 or younger' , '35 to 44' , '45 to 54' , '55 to 64' , '65 or older' ) ) , primary_position_state = factor( as.numeric( pn_loc_code_puf ) , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L, 72L, # note collapsed geographies from codebook 500L, 800L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina",
"North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico", # note collapsed geographies from codebook "District of Columbia & Delaware", "Montana & Wyoming") ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nssrn_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_group , nssrn_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nssrn_design ) svyby( ~ one , ~ age_group , nssrn_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ primary_position_state , nssrn_design , na.rm = TRUE ) svyby( ~ primary_position_state , ~ age_group , nssrn_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ primary_position_state , nssrn_design , na.rm = TRUE ) svyby( ~ primary_position_state , ~ age_group , nssrn_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ pn_earn_puf , nssrn_design , 0.5 , na.rm = TRUE ) svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ pn_earn_puf , denominator = ~ hrs_yr_puf , nssrn_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals working as RNs or APRNs (excluding RNs working as LPNs): sub_nssrn_design <- subset( nssrn_design , pn_lcreq_none == 2 ) Calculate the mean (average) of this subset: svymean( ~ pn_earn_puf , sub_nssrn_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ pn_earn_puf , ~ age_group , nssrn_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nssrn_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ pn_earn_puf , nssrn_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ pn_earn_puf , nssrn_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ all_aprn , nssrn_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( pn_earn_puf ~ all_aprn , nssrn_design ) Perform a chi-squared test of association for survey data: svychisq( ~ all_aprn + primary_position_state , nssrn_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( pn_earn_puf ~ all_aprn + primary_position_state , nssrn_design ) summary( glm_result ) Replication Example This example matches statistics and relative standard errors from the “Demographics” tab of Nursing Workforce 2022 NSSRN Dashboard Data: unwtd_count_result <- svyby( ~ one , ~ age_group , nssrn_design , unwtd.count ) # cells L398 thru L402 stopifnot( coef( unwtd_count_result ) == c( 6693 , 12268 , 10804 , 10538 , 8811 ) ) wtd_n_result <- svytotal( ~ age_group , nssrn_design ) # cells J398 thru J402 stopifnot( round( coef( wtd_n_result ) , 0 ) == c( 861060 , 1078187 , 935778 , 834939 , 639412 ) ) share_result <- svymean( ~ age_group , nssrn_design ) # cells K398 thru K402 stopifnot( round( coef( share_result ) , 3 ) == c( 0.198 , 0.248 , 0.215 , 0.192 , 0.147 ) ) # cells M398 thru M402 stopifnot( round( SE( share_result ) / coef( share_result ) , 4 ) == c( 0.0206 , 0.0155 , 0.0192 , 0.0187 , 0.0125 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NSSRN users, this code replicates previously-presented examples: library(srvyr) nssrn_srvyr_design <- as_survey( nssrn_design ) Calculate the mean (average) of a linear variable, overall and by groups: nssrn_srvyr_design %>% summarize( mean = survey_mean( pn_earn_puf , na.rm = TRUE ) ) nssrn_srvyr_design %>% group_by( age_group ) %>% summarize( mean = survey_mean( pn_earn_puf , na.rm = TRUE ) ) "],["new-york-city-housing-and-vacancy-survey-nychvs.html", "New York City Housing and Vacancy Survey (NYCHVS) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " New York City Housing and Vacancy Survey (NYCHVS) A city-wide assessment of the rental vacancy rate and other characteristics related to housing stock. One table with one record per occupied housing unit, a second table with one record per person inside each occupied housing unit, and a third table with one record per unoccupied housing unit. A complex sample designed to generalize to occupied & unoccupied housing in the five boroughs. Released more or less triennially since 1991. Funded by the NYC Dept. of Housing Preservation & Development, run by the Census Bureau. 
Please skim before you begin: Public Use File User Guide and Codebook Sample Design, Weighting, and Error Estimation A haiku regarding this microdata: # all i want is a # room somewhere / with clawfoot tub # and a frigidaire Function Definitions Define a function to download and import each comma-separated value file: nychvs_csv_import <- function( this_url ){ tf <- tempfile() download.file( this_url , tf , mode = 'wb' ) this_df <- read.csv( tf ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the all units, occupied units, person, and vacant units tables: datasets_url <- "https://www2.census.gov/programs-surveys/nychvs/datasets/2021/microdata/" all_units_df <- nychvs_csv_import( paste0( datasets_url , "allunits_puf_21.csv" ) ) occupied_units_df <- nychvs_csv_import( paste0( datasets_url , "occupied_puf_21.csv" ) ) person_df <- nychvs_csv_import( paste0( datasets_url , "person_puf_21.csv" ) ) vacant_units_df <- nychvs_csv_import( paste0( datasets_url , "vacant_puf_21.csv" ) ) stopifnot( nrow( all_units_df ) == nrow( occupied_units_df ) + nrow( vacant_units_df ) ) Merge the information stored in the all units table onto both the occupied and vacant unit tables, then merge the information (not related to weighting) from the occupied unit table onto the person table: before_nrow <- nrow( occupied_units_df ) occupied_units_df <- merge( all_units_df , occupied_units_df ) stopifnot( nrow( occupied_units_df ) == before_nrow ) before_nrow <- nrow( vacant_units_df ) vacant_units_df <- merge( all_units_df , vacant_units_df ) stopifnot( nrow( vacant_units_df ) == before_nrow ) before_nrow <- nrow( person_df ) weighting_variables <- grep( "^fw([0-9]+)?$" , names( occupied_units_df ) , value = TRUE ) person_df <- merge( occupied_units_df[ setdiff( names( occupied_units_df ) , weighting_variables ) ] , person_df ) stopifnot( nrow( person_df ) == before_nrow ) all_units_df[ , 'one' ] <- occupied_units_df[ , 'one' ] <- vacant_units_df[ , 'one' ] <- person_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # nychvs_fn <- file.path( path.expand( "~" ) , "NYCHVS" , "this_file.rds" ) # saveRDS( nychvs_df , file = nychvs_fn , compress = FALSE ) Load the same object: # nychvs_df <- readRDS( nychvs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) all_units_design <- svrepdesign( weight = ~fw , repweights = 'fw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = all_units_df ) occupied_units_design <- svrepdesign( weight = ~fw , repweights = 'fw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = occupied_units_df ) vacant_units_design <- svrepdesign( weight = ~fw , repweights = 'fw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = vacant_units_df ) person_design <- svrepdesign( weight = ~pw , repweights = 'pw[0-9]+' , scale = 4 / 80 , rscales = rep( 1 , 80 ) , mse = TRUE , type = 'JK1' , data = person_df ) nychvs_design <- occupied_units_design Variable Recoding Add new columns to the data set: nychvs_design <- update( nychvs_design , one = 1 , home_owners = as.numeric( tenure == 2 ) , yearly_household_income = hhinc_rec1 , rent_amount = ifelse( rent_amount == -2 , NA , rent_amount ) , borough = factor( boro , levels = 1:5 , labels = c( 'Bronx' , 'Brooklyn' , 'Manhattan' , 'Queens' , 'Staten Island' ) ) , food_insecurity = factor( foodinsecure , levels = 1:3 , labels = c( 'not insecure' , 
'insecure' , 'very insecure' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( nychvs_design , "sampling" ) != 0 ) svyby( ~ one , ~ borough , nychvs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , nychvs_design ) svyby( ~ one , ~ borough , nychvs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ food_insecurity , nychvs_design , na.rm = TRUE ) svyby( ~ food_insecurity , ~ borough , nychvs_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ food_insecurity , nychvs_design , na.rm = TRUE ) svyby( ~ food_insecurity , ~ borough , nychvs_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ hhinc_rec1 , nychvs_design , 0.5 , na.rm = TRUE ) svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ rent_amount , denominator = ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) Subsetting Restrict the survey design to rent burdened units (more than 30% of income paid toward rent alone): sub_nychvs_design <- subset( nychvs_design , rentburden_cat %in% 1:2 ) Calculate the mean (average) of this subset: svymean( ~ hhinc_rec1 , sub_nychvs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ hhinc_rec1 , ~ borough , nychvs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( nychvs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ hhinc_rec1 , nychvs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: svyciprop( ~ home_owners , nychvs_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( hhinc_rec1 ~ home_owners , nychvs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ home_owners + food_insecurity , nychvs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( hhinc_rec1 ~ home_owners + food_insecurity , nychvs_design ) summary( glm_result ) Replication Example This example matches the estimate and standard error of the number of occupied housing units across the five boroughs shown at minute 6:05: result <- svytotal( ~ one , nychvs_design ) stopifnot( round( coef( result ) , 0 ) == 3157105 ) stopifnot( round( SE( result ) , 0 ) == 13439 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for NYCHVS users, this code replicates previously-presented examples: library(srvyr) nychvs_srvyr_design <- as_survey( nychvs_design ) Calculate the mean (average) of a linear variable, overall and by groups: nychvs_srvyr_design %>% summarize( mean = survey_mean( hhinc_rec1 , na.rm = TRUE ) ) nychvs_srvyr_design %>% group_by( borough ) %>% summarize( mean = survey_mean( hhinc_rec1 , na.rm = TRUE ) ) "],["pew-research-center-pew.html", "Pew Research Center (PEW) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Pew Research Center (PEW) Public opinion polling on U.S. Politics & Policy, Journalism & Media, Internet, Science & Tech, Religion & Public Life, Hispanic Trends, Global Attitudes & Trends, and Social & Demographic Trends. Generally one table per survey, with one row per sampled respondent. Complex samples generalizing to the noninstitutionalized adults in the nation(s) surveyed. Varying publication dates for both American Trends Panel surveys of the United States and also for International Surveys. National Public Opinion Reference Survey released annually since 2020. Administered by the Pew Research Center. Please skim before you begin: U.S. Surveys Country Specific Methodology, for example the 2022 Global Attitudes Survey A haiku regarding this microdata: # sock puppet pundit # throws 'ssue, cites pew-laced news, sighs # "unbutton your eyes!" Download, Import, Preparation Register for a Pew Research Center account at https://www.pewresearch.org/profile/registration/. DOWNLOAD THIS DATASET at https://www.pewresearch.org/global/dataset/spring-2022-survey-data/. 
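The import step below reads an already-extracted .sav file from your home directory. A minimal sketch of that extraction, assuming the zipped download was saved to your home directory under its advertised file name (both the location and file name are assumptions): # hypothetical location of the manually-downloaded archive pew_zip_fn <- file.path( path.expand( "~" ) , "Pew-Research-Center-Global-Attitudes-Spring-2022-Survey-Data.zip" ) unzip( pew_zip_fn , exdir = path.expand( "~" ) )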
Download the SPSS dataset Pew-Research-Center-Global-Attitudes-Spring-2022-Survey-Data.zip: library(haven) pew_fn <- file.path( path.expand( "~" ) , "Pew Research Center Global Attitudes Spring 2022 Dataset.sav" ) pew_tbl <- read_sav( pew_fn ) pew_label <- lapply( pew_tbl , function( w ) attributes( w )[['label']] ) pew_labels <- lapply( pew_tbl , function( w ) attributes( w )[['labels']] ) pew_tbl <- zap_labels( pew_tbl ) pew_df <- data.frame( pew_tbl ) names( pew_df ) <- tolower( names( pew_df ) ) Collapse country-specific cluster and strata variables into two all-country cluster and strata variables: # create the constructed psu and strata variables from among the # non-missing country-specific columns pew_df[ , 'psu_constructed' ] <- apply( pew_df[ , grep( "^psu_" , names( pew_df ) ) ] , 1 , function( w ) w[ which.min( is.na( w ) ) ] ) pew_df[ , 'stratum_constructed' ] <- apply( pew_df[ , grep( "^stratum_" , names( pew_df ) ) ] , 1 , function( w ) w[ which.min( is.na( w ) ) ] ) # for countries without clustering variables, give every record a unique identifier for the psu.. pew_df[ is.na( pew_df[ , 'psu_constructed' ] ) , 'psu_constructed' ] <- rownames( pew_df[ is.na( pew_df[ , 'psu_constructed' ] ) , ] ) # ..and zeroes for the stratum pew_df[ is.na( pew_df[ , 'stratum_constructed' ] ) , 'stratum_constructed' ] <- 0 Save Locally   Save the object at any point: # pew_fn <- file.path( path.expand( "~" ) , "PEW" , "this_file.rds" ) # saveRDS( pew_df , file = pew_fn , compress = FALSE ) Load the same object: # pew_df <- readRDS( pew_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) pew_design <- svydesign( id = ~ psu_constructed , strata = ~ interaction( country , stratum_constructed ) , data = pew_df , weights = ~ weight , nest = TRUE ) Variable Recoding Add new columns to the data set: pew_design <- update( pew_design , one = 1 , topcoded_respondent_age = ifelse( age >= 99 , NA , ifelse( age >= 97 , 97 , age ) ) , human_rights_priority_with_china = ifelse( china_humanrights_priority > 2 , NA , as.numeric( china_humanrights_priority == 1 ) ) , favorable_unfavorable_one_to_four_us = ifelse( fav_us > 4 , NA , fav_us ) , favorable_unfavorable_one_to_four_un = ifelse( fav_un > 4 , NA , fav_un ) , country_name = factor( country , levels = as.integer( pew_labels[[ 'country' ]] ) , labels = names( pew_labels[['country']] ) ) , econ_sit = factor( econ_sit , levels = 1:4 , labels = c( 'Very good' , 'Somewhat good' , 'Somewhat bad' , 'Very bad' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pew_design , "sampling" ) != 0 ) svyby( ~ one , ~ country_name , pew_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pew_design ) svyby( ~ one , ~ country_name , pew_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE ) svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ econ_sit , pew_design , na.rm = TRUE ) svyby( ~ econ_sit , ~ country_name , pew_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ topcoded_respondent_age , 
pew_design , na.rm = TRUE ) svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ econ_sit , pew_design , na.rm = TRUE ) svyby( ~ econ_sit , ~ country_name , pew_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ topcoded_respondent_age , pew_design , 0.5 , na.rm = TRUE ) svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE , na.rm.all = TRUE ) Estimate a ratio: svyratio( numerator = ~ favorable_unfavorable_one_to_four_un , denominator = ~ favorable_unfavorable_one_to_four_us , pew_design , na.rm = TRUE ) Subsetting Restrict the survey design to South Korea: sub_pew_design <- subset( pew_design , country_name == 'South Korea' ) Calculate the mean (average) of this subset: svymean( ~ topcoded_respondent_age , sub_pew_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ topcoded_respondent_age , ~ country_name , pew_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pew_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ topcoded_respondent_age , pew_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ topcoded_respondent_age , pew_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ human_rights_priority_with_china , pew_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( topcoded_respondent_age ~ human_rights_priority_with_china , pew_design ) Perform a chi-squared test of association for survey data: svychisq( ~ human_rights_priority_with_china + econ_sit , pew_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( topcoded_respondent_age ~ human_rights_priority_with_china + econ_sit , pew_design ) summary( glm_result ) Replication Example This example matches the statistics and standard errors from How to analyze Pew Research Center survey data in R: DOWNLOAD THIS DATASET at https://www.pewresearch.org/politics/dataset/april-2017-political-survey/.
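As with the global attitudes file above, the import step below reads an already-extracted .sav file from your home directory; a minimal sketch of that extraction, assuming the zipped download was saved there under its advertised file name (location and file name are assumptions): # hypothetical location of the manually-downloaded archive political_survey_2017_zip_fn <- file.path( path.expand( "~" ) , "Apr17-public-4.3-update.zip" ) unzip( political_survey_2017_zip_fn , exdir = path.expand( "~" ) )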
Download the SPSS dataset Apr17-public-4.3-update.zip dated 12/29/2017: political_survey_2017_fn <- file.path( path.expand( "~" ) , "Apr17 public.sav" ) political_survey_2017_tbl <- read_sav( political_survey_2017_fn ) political_survey_2017_df <- data.frame( political_survey_2017_tbl ) names( political_survey_2017_df ) <- tolower( names( political_survey_2017_df ) ) Construct a complex sample survey design: political_survey_2017_design <- svydesign( ~ 0 , data = political_survey_2017_df , weights = ~ weight ) Add new columns to the data set: political_survey_2017_design <- update( political_survey_2017_design , q1 = factor( q1 , levels = c( 1 , 2 , 9 ) , labels = c( 'Approve' , 'Disapprove' , 'DK/RF' ) ) ) Reproduce statistics and standard errors shown under Estimating frequencies with survey weights: result <- svymean( ~ q1 , political_survey_2017_design , na.rm = TRUE ) stopifnot( round( coef( result ) , 4 ) == c( 0.3940 , 0.5424 , 0.0636 ) ) stopifnot( round( SE( result ) , 4 ) == c( 0.0144 , 0.0147 , 0.0078 ) ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for PEW users, this code replicates previously-presented examples: library(srvyr) pew_srvyr_design <- as_survey( pew_design ) Calculate the mean (average) of a linear variable, overall and by groups: pew_srvyr_design %>% summarize( mean = survey_mean( topcoded_respondent_age , na.rm = TRUE ) ) pew_srvyr_design %>% group_by( country_name ) %>% summarize( mean = survey_mean( topcoded_respondent_age , na.rm = TRUE ) ) "],["programme-for-the-international-assessment-of-adult-competencies-piaac.html", "Programme for the International Assessment of Adult Competencies (PIAAC) Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Programme for the International Assessment of Adult Competencies (PIAAC) A cross-national study designed to understand the skills of workers in advanced-nation labor markets. One row per sampled adult. A multiply-imputed, complex sample survey designed to generalize to the population aged 16 to 65 across thirty-three OECD nations. No expected release timeline. Administered by the Organisation for Economic Co-operation and Development.
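That multiply-imputed design shapes every estimate below: each statistic is computed once per set of plausible values, point estimates are averaged across implicates, and the variance combines the average within-implicate variance with the between-implicate spread. A toy sketch of that pooling arithmetic, with made-up numbers rather than PIAAC results (the MIcombine calls in this chapter perform the same combination automatically): # hypothetical estimates and within-implicate variances from five implicates est <- c( 252.9 , 253.1 , 253.0 , 252.8 , 253.2 ) var_within <- c( 1.40 , 1.42 , 1.38 , 1.41 , 1.39 ) m <- length( est ) pooled_estimate <- mean( est ) pooled_variance <- mean( var_within ) + ( 1 + 1 / m ) * var( est ) pooled_se <- sqrt( pooled_variance )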
Please skim before you begin: Technical Report of the Survey of Adult Skills Wikipedia Entry A haiku regarding this microdata: # what color collar # workforce poets, potters, or # pythagoreans Download, Import, Preparation library(haven) library(httr) tf <- tempfile() this_url <- "https://webfs.oecd.org/piaac/puf-data/SAS/SAS7BDAT/prgusap1_2012.sas7bdat" GET( this_url , write_disk( tf ) , progress() ) piaac_tbl <- read_sas( tf ) piaac_df <- data.frame( piaac_tbl ) names( piaac_df ) <- tolower( names( piaac_df ) ) Save Locally   Save the object at any point: # piaac_fn <- file.path( path.expand( "~" ) , "PIAAC" , "this_file.rds" ) # saveRDS( piaac_df , file = piaac_fn , compress = FALSE ) Load the same object: # piaac_df <- readRDS( piaac_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: library(survey) library(mitools) pvals <- c( "pvlit" , "pvnum" , "pvpsl" ) pvars <- outer( pvals , 1:10 , paste0 ) non.pvals <- names(piaac_df)[ !( names(piaac_df) %in% pvars ) ] for(k in 1:10){ piaac_imp <- piaac_df[ , c( non.pvals , paste0( pvals , k ) ) ] for( j in pvals ){ piaac_imp[ , j ] <- piaac_imp[ , paste0( j , k ) ] piaac_imp[ , paste0( j , k ) ] <- NULL } if( k == 1 ){ piaac_mi <- list( piaac_imp ) } else { piaac_mi <- c( piaac_mi , list( piaac_imp ) ) } } jk.method <- unique( piaac_df[ , 'vemethod' ] ) stopifnot(length(jk.method) == 1) stopifnot(jk.method %in% c("JK1", "JK2")) if (jk.method == "JK2") jk.method <- "JKn" piaac_design <- svrepdesign( weights = ~spfwt0 , repweights = "spfwt[1-9]" , rscales = rep( 1 , 80 ) , scale = ifelse( jk.method == "JKn" , 1 , 79/80 ) , type = jk.method , data = imputationList( piaac_mi ) , mse = TRUE ) Variable Recoding Add new columns to the data set: piaac_design <- update( piaac_design , one = 1 , sex = factor( gender_r , labels = c( "male" , "female" ) ) , age_categories = factor( ageg10lfs , levels = 1:5 , labels = c( "24 or less" , "25-34" , "35-44" , "45-54" , "55 plus" ) ) , working_at_paid_job_last_week = as.numeric( c_q01a == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: MIcombine( with( piaac_design , svyby( ~ one , ~ one , unwtd.count ) ) ) MIcombine( with( piaac_design , svyby( ~ one , ~ age_categories , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: MIcombine( with( piaac_design , svytotal( ~ one ) ) ) MIcombine( with( piaac_design , svyby( ~ one , ~ age_categories , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svymean , na.rm = TRUE ) ) ) Calculate the distribution of a categorical variable, overall and by groups: MIcombine( with( piaac_design , svymean( ~ sex ) ) ) MIcombine( with( piaac_design , svyby( ~ sex , ~ age_categories , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: MIcombine( with( piaac_design , svytotal( ~ pvnum , na.rm = TRUE ) ) ) MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svytotal , na.rm = TRUE ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: MIcombine( with( piaac_design , svytotal( ~ sex ) ) ) MIcombine( with( piaac_design , svyby( ~ sex , ~ age_categories , svytotal ) ) ) Calculate the median (50th percentile) of a linear 
variable, overall and by groups: MIcombine( with( piaac_design , svyquantile( ~ pvnum , 0.5 , se = TRUE , na.rm = TRUE ) ) ) MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svyquantile , 0.5 , se = TRUE , ci = TRUE , na.rm = TRUE ) ) ) Estimate a ratio: MIcombine( with( piaac_design , svyratio( numerator = ~ pvnum , denominator = ~ pvlit , na.rm = TRUE ) ) ) Subsetting Restrict the survey design to self-reported fair or poor health: sub_piaac_design <- subset( piaac_design , i_q08 %in% 4:5 ) Calculate the mean (average) of this subset: MIcombine( with( sub_piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- MIcombine( with( piaac_design , svyby( ~ pvnum , ~ age_categories , svymean , na.rm = TRUE ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( piaac_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: MIcombine( with( piaac_design , svyvar( ~ pvnum , na.rm = TRUE ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE , deff = TRUE ) ) ) # SRS with replacement MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ working_at_paid_job_last_week , piaac_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( pvnum ~ working_at_paid_job_last_week , piaac_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ working_at_paid_job_last_week + sex , piaac_design ) Perform a survey-weighted generalized linear model: glm_result <- MIcombine( with( piaac_design , svyglm( pvnum ~ working_at_paid_job_last_week + sex ) ) ) summary( glm_result ) Replication Example This example matches the statistics and standard errors from OECD’s Technical Report Table 18.9: usa_pvlit <- MIcombine( with( piaac_design , svymean( ~ pvlit , na.rm = TRUE ) ) ) usa_pvnum <- MIcombine( with( piaac_design , svymean( ~ pvnum , na.rm = TRUE ) ) ) usa_pvpsl <- MIcombine( with( piaac_design , svymean( ~ pvpsl , na.rm = TRUE ) ) ) stopifnot( round( coef( usa_pvlit ) ) == 270 ) stopifnot( round( SE( usa_pvlit ) , 1 ) == 1.0 ) stopifnot( round( coef( usa_pvnum ) ) == 253 ) stopifnot( round( SE( usa_pvnum ) , 1 ) == 1.2 ) stopifnot( round( coef( usa_pvpsl ) ) == 277 ) stopifnot( round( SE( usa_pvpsl ) , 1 ) == 1.1 ) "],["progress-in-international-reading-literacy-study-pirls.html", "Progress in International Reading Literacy Study (PIRLS) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Progress in International Reading Literacy Study (PIRLS) A comparative study of student achievement in reading and literacy across more than 50 nations. 
Grade-specific tables with one record per school, student, or teacher, plus files containing student achievement, home background, student-teacher linkage, and within-country scoring reliability. A complex survey generalizing to fourth-grade populations of participating countries. Released quinquennially since 2001. Funded by the International Association for the Evaluation of Educational Achievement, run at Boston College. Please skim before you begin: PIRLS 2021 User Guide for the International Database Methods and Procedures: PIRLS 2021 Technical Report A haiku regarding this microdata: # lascaux canary # glyph jump reveal caged bard notes # cryogenesis Function Definitions This survey uses a multiply-imputed variance estimation technique described in Methods Chapter 13. Most users do not need to study this function carefully. Define a function specific to only this dataset: pirls_MIcombine <- function (results, variances, call = sys.call(), df.complete = Inf, ...) { m <- length(results) oldcall <- attr(results, "call") if (missing(variances)) { variances <- suppressWarnings(lapply(results, vcov)) results <- lapply(results, coef) } vbar <- variances[[1]] cbar <- results[[1]] for (i in 2:m) { cbar <- cbar + results[[i]] vbar <- vbar + variances[[i]] } cbar <- cbar/m vbar <- vbar/m # MODIFICATION # evar <- var(do.call("rbind", results)) evar <- sum( ( unlist( results ) - cbar )^2 / 4 ) r <- (1 + 1/m) * evar/vbar df <- (m - 1) * (1 + 1/r)^2 if (is.matrix(df)) df <- diag(df) if (is.finite(df.complete)) { dfobs <- ((df.complete + 1)/(df.complete + 3)) * df.complete * vbar/(vbar + evar) if (is.matrix(dfobs)) dfobs <- diag(dfobs) df <- 1/(1/dfobs + 1/df) } if (is.matrix(r)) r <- diag(r) rval <- list(coefficients = cbar, variance = vbar + evar * (m + 1)/m, call = c(oldcall, call), nimp = m, df = df, missinfo = (r + 2/(df + 3))/(r + 1)) class(rval) <- "MIresult" rval } Download, Import, Preparation Download and unzip the 2021 fourth grade international database: library(httr) tf <- tempfile() this_url <- "https://pirls2021.org/data/downloads/P21_Data_R.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import and stack each of the student context data files for Abu Dhabi through Bulgaria: library(haven) # limit unzipped files to those starting with `asg` followed by three letters followed by `r5` asg_fns <- unzipped_files[ grepl( '^asg[a-z][a-z][a-z]r5' , basename( unzipped_files ) , ignore.case = TRUE ) ] # further limit asg files to the first ten countries countries_thru_bulgaria <- c("aad", "adu", "alb", "are", "aus", "aut", "aze", "bfl", "bfr", "bgr") fns_thru_bulgaria <- paste0( paste0( '^asg' , countries_thru_bulgaria , 'r5' ) , collapse = "|" ) asg_aad_bgr_fns <- asg_fns[ grepl( fns_thru_bulgaria , basename( asg_fns ) , ignore.case = TRUE ) ] pirls_df <- NULL for( rdata_fn in asg_aad_bgr_fns ){ this_tbl_name <- load( rdata_fn ) this_tbl <- get( this_tbl_name ) ; rm( this_tbl_name ) this_tbl <- zap_labels( this_tbl ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) pirls_df <- rbind( pirls_df , this_df ) } # order the data.frame by unique student id pirls_df <- pirls_df[ with( pirls_df , order( idcntry , idstud ) ) , ] Save Locally   Save the object at any point: # pirls_fn <- file.path( path.expand( "~" ) , "PIRLS" , "this_file.rds" ) # saveRDS( pirls_df , file = pirls_fn , compress = FALSE ) Load the same object: # pirls_df <- readRDS( pirls_fn ) Survey Design Definition Construct a multiply-imputed, complex sample
survey design: From among possibly plausible values, determine all columns that are multiply-imputed plausible values: # identify all columns ending with `01` thru `05` ppv <- grep( "(.*)0[1-5]$" , names( pirls_df ) , value = TRUE ) # remove those ending digits ppv_prefix <- gsub( "0[1-5]$" , "" , ppv ) # identify each of the possibilities with exactly five matches (five implicates) pv <- names( table( ppv_prefix )[ table( ppv_prefix ) == 5 ] ) # identify each of the `01` thru `05` plausible value columns pv_columns <- grep( paste0( "^" , pv , "0[1-5]$" , collapse = "|" ) , names( pirls_df ) , value = TRUE ) Extract those multiply-imputed columns into a separate data.frame, then remove them from the source: pv_wide_df <- pirls_df[ c( 'idcntry' , 'idstud' , pv_columns ) ] pirls_df[ pv_columns ] <- NULL Reshape these columns from one record per student to one record per student per implicate: pv_long_df <- reshape( pv_wide_df , varying = lapply( paste0( pv , '0' ) , paste0 , 1:5 ) , direction = 'long' , timevar = 'implicate' , idvar = c( 'idcntry' , 'idstud' ) ) names( pv_long_df ) <- gsub( "01$" , "" , names( pv_long_df ) ) Merge the columns from the source data.frame onto the one record per student per implicate data.frame: pirls_long_df <- merge( pirls_df , pv_long_df ) pirls_long_df <- pirls_long_df[ with( pirls_long_df , order( idcntry , idstud ) ) , ] stopifnot( nrow( pirls_long_df ) == nrow( pv_long_df ) ) stopifnot( nrow( pirls_long_df ) / 5 == nrow( pirls_df ) ) Divide the five plausible value implicates into a list with five data.frames based on the implicate number: pirls_list <- split( pirls_long_df , pirls_long_df[ , 'implicate' ] ) Construct a replicate weights table following the estimation technique described in Methods Chapter 13: weights_df <- pirls_df[ c( 'jkrep' , 'jkzone' ) ] for( j in 1:75 ){ for( i in 0:1 ){ weights_df[ weights_df[ , 'jkzone' ] != j , paste0( 'rw' , i , j ) ] <- 1 weights_df[ weights_df[ , 'jkzone' ] == j , paste0( 'rw' , i , j ) ] <- 2 * ( weights_df[ weights_df[ , 'jkzone' ] == j , 'jkrep' ] == i ) } } weights_df[ c( 'jkrep' , 'jkzone' ) ] <- NULL Define the design: library(survey) library(mitools) pirls_design <- svrepdesign( weights = ~totwgt , repweights = weights_df , data = imputationList( pirls_list ) , type = "other" , scale = 0.5 , rscales = rep( 1 , 150 ) , combined.weights = FALSE , mse = TRUE ) Variable Recoding Add new columns to the data set: pirls_design <- update( pirls_design , one = 1 , countries_thru_bulgaria = factor( as.numeric( idcntry ) , levels = c(7842L, 7841L, 8L, 784L, 36L, 40L, 31L, 956L, 957L, 100L) , labels = c("Abu Dhabi, UAE", "Dubai, UAE", "Albania", "UAE", "Australia", "Austria", "Azerbaijan", "Belgium (Flemish)", "Belgium (French)","Bulgaria") ) , sex = factor( itsex , levels = 1:2 , labels = c( "female" , "male" ) ) , always_speak_language_of_test_at_home = ifelse( asbg03 %in% 1:4 , as.numeric( asbg03 == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ one , unwtd.count ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ sex , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: pirls_MIcombine( with( pirls_design , svytotal( ~ one ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ one , ~ sex , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, 
overall and by groups: pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svymean , na.rm = TRUE ) ) ) Calculate the distribution of a categorical variable, overall and by groups: pirls_MIcombine( with( pirls_design , svymean( ~ countries_thru_bulgaria ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ countries_thru_bulgaria , ~ sex , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: pirls_MIcombine( with( pirls_design , svytotal( ~ asrrea , na.rm = TRUE ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svytotal , na.rm = TRUE ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: pirls_MIcombine( with( pirls_design , svytotal( ~ countries_thru_bulgaria ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ countries_thru_bulgaria , ~ sex , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: pirls_MIcombine( with( pirls_design , svyquantile( ~ asrrea , 0.5 , se = TRUE , na.rm = TRUE ) ) ) pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svyquantile , 0.5 , se = TRUE , ci = TRUE , na.rm = TRUE ) ) ) Estimate a ratio: pirls_MIcombine( with( pirls_design , svyratio( numerator = ~ asrlit , denominator = ~ asrrea ) ) ) Subsetting Restrict the survey design to Australia, Austria, Azerbaijan, Belgium (French): sub_pirls_design <- subset( pirls_design , idcntry %in% c( 36 , 40 , 31 , 956 ) ) Calculate the mean (average) of this subset: pirls_MIcombine( with( sub_pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- pirls_MIcombine( with( pirls_design , svyby( ~ asrrea , ~ sex , svymean , na.rm = TRUE ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pirls_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: pirls_MIcombine( with( pirls_design , svyvar( ~ asrrea , na.rm = TRUE ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE , deff = TRUE ) ) ) # SRS with replacement pirls_MIcombine( with( pirls_design , svymean( ~ asrrea , na.rm = TRUE , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: # MIsvyciprop( ~ always_speak_language_of_test_at_home , pirls_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( asrrea ~ always_speak_language_of_test_at_home , pirls_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ always_speak_language_of_test_at_home + countries_thru_bulgaria , pirls_design ) Perform a survey-weighted generalized linear model: glm_result <- pirls_MIcombine( with( pirls_design , svyglm( asrrea ~ always_speak_language_of_test_at_home + countries_thru_bulgaria ) ) ) summary( glm_result ) Replication Example This example matches the mean proficiency and standard error of the Australia row of the Summary Statistics and Standard Errors for Proficiency in Overall Reading table from the Appendix 13A: Summary Statistics and Standard Errors for Proficiency in Reading: australia_design <- subset( pirls_design , countries_thru_bulgaria %in% "Australia" ) stopifnot( nrow( australia_design ) == 5487 ) result <- pirls_MIcombine( with( australia_design , svymean( ~ asrrea ) ) ) stopifnot( round( coef( result ) , 3 ) == 540.134 ) stopifnot( round( SE( result ) , 3 ) == 1.728 ) This example matches the jackknife sampling, imputation, and total variances of the same row: australia_fn <- unzipped_files[ grepl( 'ASGAUS' , basename( unzipped_files ) ) ] australia_tbl_name <- load( australia_fn ) australia_tbl <- get( australia_tbl_name ) ; rm( australia_tbl_name ) australia_tbl <- zap_labels( australia_tbl ) australia_df <- data.frame( australia_tbl ) names( australia_df ) <- tolower( names( australia_df ) ) estimate <- mean( c( with( australia_df , weighted.mean( asrrea01 , totwgt ) ) , with( australia_df , weighted.mean( asrrea02 , totwgt ) ) , with( australia_df , weighted.mean( asrrea03 , totwgt ) ) , with( australia_df , weighted.mean( asrrea04 , totwgt ) ) , with( australia_df , weighted.mean( asrrea05 , totwgt ) ) ) ) stopifnot( round( estimate , 3 ) == 540.134 ) for( k in 1:5 ){ this_variance <- 0 for( j in 1:75 ){ for( i in 0:1 ){ this_variance <- this_variance + ( weighted.mean( australia_df[ , paste0( 'asrrea0' , k ) ] , ifelse( j == australia_df[ , 'jkzone' ] , australia_df[ , 'totwgt' ] * 2 * ( australia_df[ , 'jkrep' ] == i ) , australia_df[ , 'totwgt' ] ) ) - weighted.mean( australia_df[ , paste0( 'asrrea0' , k ) ] , australia_df[ , 'totwgt' ] ) )^2 } } assign( paste0( 'v' , k ) , this_variance * 0.5 ) } sampling_variance <- mean( c( v1 , v2 , v3 , v4 , v5 ) ) stopifnot( round( sampling_variance , 3 ) == 2.653 ) imputation_variance <- ( 6 / 5 ) * ( ( ( with( australia_df , weighted.mean( asrrea01 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea02 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea03 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea04 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asrrea05 , totwgt ) ) - estimate )^2 / 4 ) ) stopifnot( round( imputation_variance , 3 ) == 0.333 ) stopifnot( round( sampling_variance + imputation_variance , 3 ) == 2.987 ) "],["public-libraries-survey-pls.html", "Public Libraries Survey (PLS) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Public Libraries Survey (PLS) A comprehensive compilation of 
administrative information on all public libraries in the United States. Two tables, with one record per library system and one record per library building or bookmobile. Released annually since 1992. Conducted by the Institute of Museum and Library Services (IMLS), collected by the Census Bureau. Recommended Reading Two Methodology Documents: Data File Documentation and User’s Guide README FY #### PLS PUD.txt included in each zipped file One Haiku: # census, not survey. # dewey decimal index # finger to lips shush Download, Import, Preparation Download and import the most recent administrative entity csv file: this_tf <- tempfile() csv_url <- "https://www.imls.gov/sites/default/files/2023-06/pls_fy2021_csv.zip" download.file( csv_url , this_tf, mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) administrative_entity_csv_fn <- unzipped_files[ grepl( 'AE(.*)csv$' , basename( unzipped_files ) ) ] pls_df <- read.csv( administrative_entity_csv_fn ) names( pls_df ) <- tolower( names( pls_df ) ) pls_df[ , 'one' ] <- 1 Recode missing values as described in the readme included with each zipped file: for( this_col in names( pls_df ) ){ if( class( pls_df[ , this_col ] ) == 'character' ){ pls_df[ pls_df[ , this_col ] %in% 'M' , this_col ] <- NA } if( ( class( pls_df[ , this_col ] ) == 'numeric' ) | ( this_col %in% c( 'phone' , 'startdat' , 'enddate' ) ) ){ pls_df[ pls_df[ , this_col ] %in% c( -1 , -3 , -4 , -9 ) , this_col ] <- NA } } Save Locally   Save the object at any point: # pls_fn <- file.path( path.expand( "~" ) , "PLS" , "this_file.rds" ) # saveRDS( pls_df , file = pls_fn , compress = FALSE ) Load the same object: # pls_df <- readRDS( pls_fn ) Variable Recoding Add new columns to the data set: pls_df <- transform( pls_df , c_relatn = factor( c_relatn , levels = c( "HQ" , "ME" , "NO" ) , c( "HQ-Headquarters of a federation or cooperative" , "ME-Member of a federation or cooperative" , "NO-Not a member of a federation or cooperative" ) ) , more_than_one_librarian = as.numeric( libraria > 1 ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( pls_df ) table( pls_df[ , "stabr" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( pls_df[ , "popu_lsa" ] , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , mean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( pls_df[ , "c_relatn" ] ) ) prop.table( table( pls_df[ , c( "c_relatn" , "stabr" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( pls_df[ , "popu_lsa" ] , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , sum , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( pls_df[ , "popu_lsa" ] , 0.5 , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , quantile , 0.5 , na.rm = TRUE ) Subsetting Limit your data.frame to more than one million annual visits: sub_pls_df <- subset( pls_df , visits > 1000000 ) Calculate the mean (average) of this subset: mean( sub_pls_df[ , "popu_lsa" ] , na.rm = TRUE ) Measures of Uncertainty Calculate the variance, overall and by groups: var( pls_df[ , "popu_lsa" ] , na.rm = TRUE ) tapply( pls_df[ , "popu_lsa" ] , pls_df[ , "stabr" ] , var , na.rm = TRUE ) Regression Models and Tests of Association Perform a t-test: t.test( popu_lsa ~ 
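# does the mean population of the legal service area differ between systems with more than one librarian and all other systems?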
more_than_one_librarian , pls_df ) Perform a chi-squared test of association: this_table <- table( pls_df[ , c( "more_than_one_librarian" , "c_relatn" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( popu_lsa ~ more_than_one_librarian + c_relatn , data = pls_df ) summary( glm_result ) Replication Example This example matches Interlibrary Relationship Frequencies on PDF page 169 of the User’s Guide: # remove closed and temporarily closed libraries results <- table( pls_df[ !( pls_df[ , 'statstru' ] %in% c( 3 , 23 ) ) , 'c_relatn' ] ) stopifnot( results[ "HQ-Headquarters of a federation or cooperative" ] == 112 ) stopifnot( results[ "ME-Member of a federation or cooperative" ] == 6859 ) stopifnot( results[ "NO-Not a member of a federation or cooperative" ] == 2236 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for PLS users, this code replicates previously-presented examples: library(dplyr) pls_tbl <- as_tibble( pls_df ) Calculate the mean (average) of a linear variable, overall and by groups: pls_tbl %>% summarize( mean = mean( popu_lsa , na.rm = TRUE ) ) pls_tbl %>% group_by( stabr ) %>% summarize( mean = mean( popu_lsa , na.rm = TRUE ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for PLS users, this code replicates previously-presented examples: library(data.table) pls_dt <- data.table( pls_df ) Calculate the mean (average) of a linear variable, overall and by groups: pls_dt[ , mean( popu_lsa , na.rm = TRUE ) ] pls_dt[ , mean( popu_lsa , na.rm = TRUE ) , by = stabr ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. As a starting point for PLS users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'pls' , pls_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( popu_lsa ) FROM pls' ) dbGetQuery( con , 'SELECT stabr , AVG( popu_lsa ) FROM pls GROUP BY stabr' ) "],["pesquisa-nacional-por-amostra-de-domicilios-pnad.html", "Pesquisa Nacional por Amostra de Domicilios (PNAD) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Pesquisa Nacional por Amostra de Domicilios (PNAD) Brazil’s principal labor force survey, measuring education, employment, income, and housing characteristics. One consolidated table with one row per individual within each sampled household.
A complex sample survey designed to generalize to the civilian non-institutional population of Brazil. Released quarterly since 2012, with microdata available both quarterly and annually. Administered by the Instituto Brasileiro de Geografia e Estatistica. Please skim before you begin: Conceitos e métodos Wikipedia Entry A haiku regarding this microdata: # mineiro data # love verdade gave to me # twelve karaoke.. Download, Import, Preparation Download and import the dictionary file: dictionary_tf <- tempfile() dictionary_url <- paste0( "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/" , "Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/" , "Trimestral/Microdados/Documentacao/Dicionario_e_input_20221031.zip" ) download.file( dictionary_url , dictionary_tf , mode = 'wb' ) dictionary_files <- unzip( dictionary_tf , exdir = tempdir() ) sas_fn <- grep( '\\\\.sas$' , dictionary_files , value = TRUE ) sas_lines <- readLines( sas_fn , encoding = 'latin1' ) Determine fixed-width file positions from the SAS import script: sas_start <- grep( '@0001' , sas_lines ) sas_end <- grep( ';' , sas_lines ) sas_end <- sas_end[ sas_end > sas_start ][ 1 ] sas_lines <- sas_lines[ seq( sas_start , sas_end - 1 ) ] # remove SAS comments sas_lines <- gsub( "\\\\/\\\\*(.*)" , "" , sas_lines ) # remove multiple spaces and spaces at the end of each string sas_lines <- gsub( "( +)" , " " , sas_lines ) sas_lines <- gsub( " $" , "" , sas_lines ) sas_df <- read.table( textConnection( sas_lines ) , sep = ' ' , col.names = c( 'position' , 'column_name' , 'length' ) , header = FALSE ) sas_df[ , 'character' ] <- grepl( '\\\\$' , sas_df[ , 'length' ] ) sas_df[ , 'position' ] <- as.integer( gsub( "\\\\@" , "" , sas_df[ , 'position' ] ) ) sas_df[ , 'length' ] <- as.integer( gsub( "\\\\$" , "" , sas_df[ , 'length' ] ) ) stopifnot( sum( sas_df[ , 'length' ] ) == ( sas_df[ nrow( sas_df ) , 'position' ] + sas_df[ nrow( sas_df ) , 'length' ] - 1 ) ) Download the latest quarterly file: this_tf <- tempfile() this_url <- paste0( "https://ftp.ibge.gov.br/Trabalho_e_Rendimento/" , "Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/" , "Trimestral/Microdados/2023/PNADC_012023.zip" ) download.file( this_url , this_tf , mode = 'wb' ) Import the latest quarterly file: library(readr) pnad_tbl <- read_fwf( this_tf , fwf_widths( widths = sas_df[ , 'length' ] , col_names = sas_df[ , 'column_name' ] ) , col_types = paste0( ifelse( sas_df[ , 'character' ] , "c" , "d" ) , collapse = '' ) ) pnad_df <- data.frame( pnad_tbl ) names( pnad_df ) <- tolower( names( pnad_df ) ) pnad_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # pnad_fn <- file.path( path.expand( "~" ) , "PNAD" , "this_file.rds" ) # saveRDS( pnad_df , file = pnad_fn , compress = FALSE ) Load the same object: # pnad_df <- readRDS( pnad_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) pnad_design <- svrepdesign( data = pnad_df , weight = ~ v1028 , type = 'bootstrap' , repweights = 'v1028[0-9]+' , mse = TRUE , ) Variable Recoding Add new columns to the data set: pnad_design <- update( pnad_design , pia = as.numeric( v2009 >= 14 ) ) pnad_design <- update( pnad_design , ocup_c = ifelse( pia == 1 , as.numeric( vd4002 %in% 1 ) , NA ) , desocup30 = ifelse( pia == 1 , as.numeric( vd4002 %in% 2 ) , NA ) ) pnad_design <- update( pnad_design , uf_name = factor( as.numeric( uf ) , levels = c(11L, 12L, 13L, 14L, 15L, 16L, 17L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 35L, 41L, 42L, 43L, 50L, 51L, 52L, 53L) , labels = 
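# full state names for the 27 IBGE unidade federativa codes above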
c("Rondonia", "Acre", "Amazonas", "Roraima", "Para", "Amapa", "Tocantins", "Maranhao", "Piaui", "Ceara", "Rio Grande do Norte", "Paraiba", "Pernambuco", "Alagoas", "Sergipe", "Bahia", "Minas Gerais", "Espirito Santo", "Rio de Janeiro", "Sao Paulo", "Parana", "Santa Catarina", "Rio Grande do Sul", "Mato Grosso do Sul", "Mato Grosso", "Goias", "Distrito Federal") ) , age_categories = factor( 1 + findInterval( v2009 , seq( 5 , 60 , 5 ) ) ) , male = as.numeric( v2007 == 1 ) , region = substr( uf , 1 , 1 ) , # calculate usual income from main job # (rendimento habitual do trabalho principal) vd4016n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4016 , NA ) , # calculate effective income from main job # (rendimento efetivo do trabalho principal) vd4017n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4017 , NA ) , # calculate usual income from all jobs # (variavel rendimento habitual de todos os trabalhos) vd4019n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4019 , NA ) , # calculate effective income from all jobs # (rendimento efetivo do todos os trabalhos) vd4020n = ifelse( pia %in% 1 & vd4015 %in% 1 , vd4020 , NA ) , # determine the potential labor force pea_c = as.numeric( ocup_c == 1 | desocup30 == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pnad_design , "sampling" ) != 0 ) svyby( ~ one , ~ uf_name , pnad_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pnad_design ) svyby( ~ one , ~ uf_name , pnad_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ vd4020n , pnad_design , na.rm = TRUE ) svyby( ~ vd4020n , ~ uf_name , pnad_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ age_categories , pnad_design ) svyby( ~ age_categories , ~ uf_name , pnad_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ vd4020n , pnad_design , na.rm = TRUE ) svyby( ~ vd4020n , ~ uf_name , pnad_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ age_categories , pnad_design ) svyby( ~ age_categories , ~ uf_name , pnad_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ vd4020n , pnad_design , 0.5 , na.rm = TRUE ) svyby( ~ vd4020n , ~ uf_name , pnad_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ ocup_c , denominator = ~ pea_c , pnad_design , na.rm = TRUE ) Subsetting Restrict the survey design to employed persons: sub_pnad_design <- subset( pnad_design , ocup_c == 1 ) Calculate the mean (average) of this subset: svymean( ~ vd4020n , sub_pnad_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ vd4020n , pnad_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ vd4020n , ~ uf_name , pnad_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pnad_design ) 
Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ vd4020n , pnad_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ vd4020n , pnad_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ vd4020n , pnad_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ male , pnad_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( vd4020n ~ male , pnad_design ) Perform a chi-squared test of association for survey data: svychisq( ~ male + age_categories , pnad_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( vd4020n ~ male + age_categories , pnad_design ) summary( glm_result ) Replication Example This example matches statistics and coefficients of variation from Tabela 4092 - Pessoas de 14 anos ou mais de idade, por condição em relação à força de trabalho e condição de ocupação: nationwide_adult_population <- svytotal( ~ pia , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_adult_population ) / 1000000 , 3 ) == 174.228 ) stopifnot( round( cv( nationwide_adult_population ) / 1000000 , 3 ) == 0 ) nationwide_labor_force <- svytotal( ~ pea_c , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_labor_force ) / 1000000 , 3 ) == 107.257 ) stopifnot( round( cv( nationwide_labor_force ) * 100 , 1 ) == 0.2 ) nationwide_employed <- svytotal( ~ ocup_c , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_employed ) / 1000000 , 3 ) == 97.825 ) stopifnot( round( cv( nationwide_employed ) * 100 , 1 ) == 0.2 ) nationwide_unemployed <- svytotal( ~ desocup30 , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_unemployed ) / 1000000 , 3 ) == 9.432 ) stopifnot( round( cv( nationwide_unemployed ) * 100 , 1 ) == 1.2 ) nationwide_not_in_labor_force <- svytotal( ~ as.numeric( pia & !pea_c ) , pnad_design , na.rm = TRUE ) stopifnot( round( coef( nationwide_not_in_labor_force ) / 1000000 , 3 ) == 66.972 ) stopifnot( round( cv( nationwide_not_in_labor_force ) * 100 , 1 ) == 0.3 ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for PNAD users, this code calculates the gini coefficient on complex sample survey data: library(convey) pnad_design <- convey_prep( pnad_design ) svygini( ~ vd4020n , pnad_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
As a starting point for PNAD users, this code replicates previously-presented examples: library(srvyr) pnad_srvyr_design <- as_survey( pnad_design ) Calculate the mean (average) of a linear variable, overall and by groups: pnad_srvyr_design %>% summarize( mean = survey_mean( vd4020n , na.rm = TRUE ) ) pnad_srvyr_design %>% group_by( uf_name ) %>% summarize( mean = survey_mean( vd4020n , na.rm = TRUE ) ) "],["pesquisa-nacional-de-saude-pns.html", "Pesquisa Nacional de Saude (PNS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Pesquisa Nacional de Saude (PNS) Brazil’s health survey, measuring medical conditions, risk behaviors, access to and use of care. One consolidated table with one row per individual within each sampled household. A complex sample survey designed to generalize to Brazil’s civilian population. Released at approximately five year intervals starting in 2013. Administered by Instituto Brasileiro de Geografia e Estatistica partnered with the Ministério da Saúde. Please skim before you begin: Conceitos e métodos Wikipedia Entry A haiku regarding this microdata: # cheer the ministry! # with each caipirinha, or # fail sex life module Download, Import, Preparation Download and import the dictionary file: dictionary_tf <- tempfile() dictionary_url <- "https://ftp.ibge.gov.br/PNS/2019/Microdados/Documentacao/Dicionario_e_input_20220530.zip" download.file( dictionary_url , dictionary_tf , mode = 'wb' ) dictionary_files <- unzip( dictionary_tf , exdir = tempdir() ) sas_fn <- grep( '\\\\.sas$' , dictionary_files , value = TRUE ) sas_lines <- readLines( sas_fn , encoding = 'latin1' ) Determine fixed-width file positions from the SAS import script: sas_start <- grep( '@00001' , sas_lines ) sas_end <- grep( ';' , sas_lines ) sas_end <- sas_end[ sas_end > sas_start ][ 1 ] sas_lines <- sas_lines[ seq( sas_start , sas_end - 1 ) ] # remove SAS comments sas_lines <- gsub( "\\\\/\\\\*(.*)" , "" , sas_lines ) # remove tabs, multiple spaces and spaces at the end of each string sas_lines <- gsub( "\\t" , " " , sas_lines ) sas_lines <- gsub( "( +)" , " " , sas_lines ) sas_lines <- gsub( " $" , "" , sas_lines ) sas_df <- read.table( textConnection( sas_lines ) , sep = ' ' , col.names = c( 'position' , 'column_name' , 'length' ) , header = FALSE ) sas_df[ , 'character' ] <- grepl( '\\\\$' , sas_df[ , 'length' ] ) sas_df[ , 'position' ] <- as.integer( gsub( "\\\\@" , "" , sas_df[ , 'position' ] ) ) sas_df[ , 'length' ] <- as.integer( gsub( "\\\\$" , "" , sas_df[ , 'length' ] ) ) stopifnot( sum( sas_df[ , 'length' ] ) == ( sas_df[ nrow( sas_df ) , 'position' ] + sas_df[ nrow( sas_df ) , 'length' ] - 1 ) ) Download the latest data file: this_tf <- tempfile() this_url <- "https://ftp.ibge.gov.br/PNS/2019/Microdados/Dados/PNS_2019_20220525.zip" download.file( this_url , this_tf , mode = 'wb' ) Import the latest data file: library(readr) pns_tbl <- read_fwf( this_tf , fwf_widths( widths = sas_df[ , 'length' ] , col_names = sas_df[ , 'column_name' ] ) , col_types = paste0( ifelse( sas_df[ , 'character' ] , "c" , "d" ) , collapse = '' ) ) pns_df <- data.frame( pns_tbl ) names( pns_df ) <- tolower( names( pns_df ) ) pns_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # pns_fn <- file.path( path.expand( "~" ) , "PNS" , "this_file.rds" ) # saveRDS( pns_df , file = pns_fn , compress = FALSE ) Load the same object: # pns_df <- readRDS( pns_fn ) Survey Design Definition Construct a complex sample survey design: 
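# the design below is built in two steps: a pre-stratified design defined from the records with nonmissing weights, then post-stratified to the published population totals carried on the microdata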
library(survey) options( survey.lonely.psu = "adjust" ) pns_prestratified_design <- svydesign( id = ~ upa_pns , strata = ~v0024 , data = subset( pns_df , !is.na( v0028 ) ) , weights = ~v0028 , nest = TRUE ) popc.types <- data.frame( v00283 = as.character( unique( pns_df[ , 'v00283' ] ) ) , Freq = as.numeric( unique( pns_df[ , 'v00282' ] ) ) ) popc.types <- popc.types[ order( popc.types[ , 'v00283' ] ) , ] pns_design <- postStratify( pns_prestratified_design , strata = ~v00283 , population = popc.types ) Variable Recoding Add new columns to the data set: pns_design <- update( pns_design , medical_insurance = ifelse( i00102 %in% 1:2 , as.numeric( i00102 == 1 ) , NA ) , uf_name = factor( as.numeric( v0001 ) , levels = c(11L, 12L, 13L, 14L, 15L, 16L, 17L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 35L, 41L, 42L, 43L, 50L, 51L, 52L, 53L) , labels = c("Rondonia", "Acre", "Amazonas", "Roraima", "Para", "Amapa", "Tocantins", "Maranhao", "Piaui", "Ceara", "Rio Grande do Norte", "Paraiba", "Pernambuco", "Alagoas", "Sergipe", "Bahia", "Minas Gerais", "Espirito Santo", "Rio de Janeiro", "Sao Paulo", "Parana", "Santa Catarina", "Rio Grande do Sul", "Mato Grosso do Sul", "Mato Grosso", "Goias", "Distrito Federal") ) , age_categories = factor( 1 + findInterval( c008 , seq( 5 , 90 , 5 ) ) ) , male = as.numeric( v006 == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pns_design , "sampling" ) != 0 ) svyby( ~ one , ~ uf_name , pns_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pns_design ) svyby( ~ one , ~ uf_name , pns_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ e01602 , pns_design , na.rm = TRUE ) svyby( ~ e01602 , ~ uf_name , pns_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ c006 , pns_design ) svyby( ~ c006 , ~ uf_name , pns_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ e01602 , pns_design , na.rm = TRUE ) svyby( ~ e01602 , ~ uf_name , pns_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ c006 , pns_design ) svyby( ~ c006 , ~ uf_name , pns_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ e01602 , pns_design , 0.5 , na.rm = TRUE ) svyby( ~ e01602 , ~ uf_name , pns_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ p00104 , denominator = ~ p00404 , pns_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals that exercise three or more days per week: sub_pns_design <- subset( pns_design , p035 %in% 3:7 ) Calculate the mean (average) of this subset: svymean( ~ e01602 , sub_pns_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ e01602 , pns_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ e01602 , ~ uf_name , pns_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result 
) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pns_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ e01602 , pns_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ e01602 , pns_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ e01602 , pns_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ medical_insurance , pns_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( e01602 ~ medical_insurance , pns_design ) Perform a chi-squared test of association for survey data: svychisq( ~ medical_insurance + c006 , pns_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( e01602 ~ medical_insurance + c006 , pns_design ) summary( glm_result ) Replication Example This example matches Estimando totais of gross monthly income from the official PNSIBGE R package: total_renda <- svytotal( ~ e01602 , pns_design , na.rm = TRUE ) stopifnot( round( coef( total_renda ) , 0 ) == 213227874692 ) stopifnot( round( SE( total_renda ) , 0 ) == 3604489769 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for PNS users, this code replicates previously-presented examples: library(srvyr) pns_srvyr_design <- as_survey( pns_design ) Calculate the mean (average) of a linear variable, overall and by groups: pns_srvyr_design %>% summarize( mean = survey_mean( e01602 , na.rm = TRUE ) ) pns_srvyr_design %>% group_by( uf_name ) %>% summarize( mean = survey_mean( e01602 , na.rm = TRUE ) ) "],["pesquisa-de-orcamentos-familiares-pof.html", "Pesquisa de Orcamentos Familiares (POF) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Pesquisa de Orcamentos Familiares (POF) Brazil’s household budget survey designed to guide major economic indicators like the Contas nacionais. Various tables with one record per sampled household, resident, job, expenditure. A complex sample survey designed to generalize to the civilian population of Brazil. Released at irregular intervals, 2002-2003, 2008-2009, and 2017-2018 microdata available. Administered by the Instituto Brasileiro de Geografia e Estatistica. 
Please skim before you begin: Pesquisa de Orçamentos Familiares 2017-2018 Perfil das despesas no Brasil Conceitos e métodos A haiku regarding this microdata: # shopping na praia # roupa, comida, pede # tres havaianas Download, Import, Preparation Download the dictionary files: library(archive) dictionary_tf <- tempfile() dictionary_url <- paste0( "https://ftp.ibge.gov.br/Orcamentos_Familiares/" , "Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/Documentacao_20230713.zip" ) download.file( dictionary_url , dictionary_tf , mode = 'wb' ) dictionary_files <- archive_extract( dictionary_tf , dir = tempdir() ) Import the household variable dictionary: library(readxl) dictionary_fn <- file.path( tempdir() , "Dicionários de váriaveis.xls" ) domicilio_dictionary_tbl <- read_excel( dictionary_fn , sheet = "Domicílio" , skip = 3 ) domicilio_dictionary_df <- data.frame( domicilio_dictionary_tbl ) names( domicilio_dictionary_df ) <- c( 'position' , 'length' , 'decimals' , 'column_name' , 'description' , 'variable_labels' ) domicilio_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] <- sapply( domicilio_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] , as.integer ) domicilio_dictionary_df <- subset( domicilio_dictionary_df , !is.na( position ) ) Import the resident variable dictionary: morador_dictionary_tbl <- read_excel( dictionary_fn , sheet = "Morador" , skip = 3 ) morador_dictionary_df <- data.frame( morador_dictionary_tbl ) names( morador_dictionary_df ) <- c( 'position' , 'length' , 'decimals' , 'column_name' , 'description' , 'variable_labels' ) morador_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] <- sapply( morador_dictionary_df[ c( 'position' , 'length' , 'decimals' ) ] , as.integer ) morador_dictionary_df <- subset( morador_dictionary_df , !is.na( position ) ) Import the post-stratification totals: post_stratification_fn <- file.path( tempdir() , "Pos_estratos_totais.xlsx" ) post_stratification_tbl <- read_excel( post_stratification_fn , skip = 5 ) post_stratification_df <- data.frame( post_stratification_tbl ) names( post_stratification_df ) <- c( 'estrato_pof' , 'pos_estrato' , 'total_pessoas' , 'uf' , 'cod_upa' ) Download the full dataset: this_tf <- tempfile() this_url <- paste0( "https://ftp.ibge.gov.br/Orcamentos_Familiares/" , "Pesquisa_de_Orcamentos_Familiares_2017_2018/Microdados/Dados_20230713.zip" ) download.file( this_url , this_tf , mode = 'wb' ) unzipped_files <- unzip( this_tf , exdir = tempdir() ) Import the household table: library(readr) domicilio_fn <- grep( 'DOMICILIO\\\\.txt$' , unzipped_files , value = TRUE ) domicilio_tbl <- read_fwf( domicilio_fn , fwf_widths( widths = domicilio_dictionary_df[ , 'length' ] , col_names = domicilio_dictionary_df[ , 'column_name' ] ) ) domicilio_df <- data.frame( domicilio_tbl ) names( domicilio_df ) <- tolower( names( domicilio_df ) ) Import the resident table: morador_fn <- grep( 'MORADOR\\\\.txt$' , unzipped_files , value = TRUE ) morador_tbl <- read_fwf( morador_fn , fwf_widths( widths = morador_dictionary_df[ , 'length' ] , col_names = morador_dictionary_df[ , 'column_name' ] ) ) morador_df <- data.frame( morador_tbl ) names( morador_df ) <- tolower( names( morador_df ) ) Merge one household-level variable and also the post-stratification info onto the person-level table: dom_mor_df <- merge( domicilio_df[ c( 'cod_upa' , 'num_dom' , 'v6199' ) ] , morador_df ) pof_df <- merge( dom_mor_df , post_stratification_df ) stopifnot( nrow( pof_df ) == nrow( morador_df ) ) Save Locally   Save the object at any 
point: # pof_fn <- file.path( path.expand( "~" ) , "POF" , "this_file.rds" ) # saveRDS( pof_df , file = pof_fn , compress = FALSE ) Load the same object: # pof_df <- readRDS( pof_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) pre_stratified_design <- svydesign( id = ~ cod_upa , strata = ~ estrato_pof , weights = ~ peso , data = pof_df , nest = TRUE ) population_totals <- aggregate( peso_final ~ pos_estrato , data = pof_df , sum ) names( population_totals ) <- c( 'pos_estrato' , 'Freq' ) pof_design <- postStratify( pre_stratified_design , ~ pos_estrato , population_totals ) Variable Recoding Add new columns to the data set: pof_design <- update( pof_design , one = 1 , food_security = factor( v6199 , levels = 1:4 , labels = c( 'food secure' , 'mild' , 'moderate' , 'severe' ) ) , age_categories = factor( 1 + findInterval( v0403 , c( 20 , 25 , 30 , 35 , 45 , 55 , 65 , 75 ) ) , levels = 1:9 , labels = c( "under 20" , "20-24" , "25-29" , "30-34" , "35-44" , "45-54" , "55-64" , "65-74" , "75+" ) ) , sex = factor( v0404 , levels = 1:2 , labels = c( 'male' , 'female' ) ) , urban = as.numeric( tipo_situacao_reg == 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( pof_design , "sampling" ) != 0 ) svyby( ~ one , ~ sex , pof_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , pof_design ) svyby( ~ one , ~ sex , pof_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ renda_total , pof_design ) svyby( ~ renda_total , ~ sex , pof_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ age_categories , pof_design ) svyby( ~ age_categories , ~ sex , pof_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ renda_total , pof_design ) svyby( ~ renda_total , ~ sex , pof_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ age_categories , pof_design ) svyby( ~ age_categories , ~ sex , pof_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ renda_total , pof_design , 0.5 ) svyby( ~ renda_total , ~ sex , pof_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ renda_total , denominator = ~ anos_estudo , pof_design , na.rm = TRUE ) Subsetting Restrict the survey design to credit card holders: sub_pof_design <- subset( pof_design , v0409 > 0 ) Calculate the mean (average) of this subset: svymean( ~ renda_total , sub_pof_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ renda_total , pof_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ renda_total , ~ sex , pof_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( pof_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ renda_total , pof_design ) Include the complex sample design 
effect in the result for a specific statistic: # SRS without replacement svymean( ~ renda_total , pof_design , deff = TRUE ) # SRS with replacement svymean( ~ renda_total , pof_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ urban , pof_design , method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( renda_total ~ urban , pof_design ) Perform a chi-squared test of association for survey data: svychisq( ~ urban + age_categories , pof_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( renda_total ~ urban + age_categories , pof_design ) summary( glm_result ) Replication Example This example matches the 2017-2018 person-level food security estimates from Tabela 3: person_level_food_security <- svymean( ~ food_security , pof_design , na.rm = TRUE ) stopifnot( all.equal( round( coef( person_level_food_security ) , 2 ) , c( 0.59 , 0.27 , 0.09 , 0.05 ) , check.attributes = FALSE ) ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for POF users, this code calculates the gini coefficient on complex sample survey data: library(convey) pof_design <- convey_prep( pof_design ) svygini( ~ renda_total , pof_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for POF users, this code replicates previously-presented examples: library(srvyr) pof_srvyr_design <- as_survey( pof_design ) Calculate the mean (average) of a linear variable, overall and by groups: pof_srvyr_design %>% summarize( mean = survey_mean( renda_total ) ) pof_srvyr_design %>% group_by( sex ) %>% summarize( mean = survey_mean( renda_total ) ) "],["residential-energy-consumption-survey-recs.html", "Residential Energy Consumption Survey (RECS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Residential Energy Consumption Survey (RECS) A periodic study conducted to provide detailed information about energy usage in U.S. homes. One table with one row per sampled housing unit. A complex sample survey designed to generalize to U.S. homes occupied as primary residences. Released approximately every five years since 1979. Prepared by the Energy Information Administration, with help from IMG-Crown and RTI International. 
Please skim before you begin: Annual Energy Outlook 2023 Household Characteristics Technical Documentation Summary A haiku regarding this microdata: # housing code dogma # even satan ceased sweat since # eighth sin: central air Download, Import, Preparation Download and import the most recent sas file: library(haven) sas_tf <- tempfile() sas_url <- "https://www.eia.gov/consumption/residential/data/2020/sas/recs2020_public_v2.zip" download.file( sas_url , sas_tf , mode = 'wb' ) recs_tbl <- read_sas( sas_tf ) recs_df <- data.frame( recs_tbl ) names( recs_df ) <- tolower( names( recs_df ) ) recs_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # recs_fn <- file.path( path.expand( "~" ) , "RECS" , "this_file.rds" ) # saveRDS( recs_df , file = recs_fn , compress = FALSE ) Load the same object: # recs_df <- readRDS( recs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) recs_design <- svrepdesign( data = recs_df , weight = ~ nweight , repweights = 'nweight[1-9]+' , type = 'JK1' , combined.weights = TRUE , scale = 59 / 60 , mse = TRUE ) Variable Recoding Add new columns to the data set: recs_design <- update( recs_design , main_heating_fuel = factor( fuelheat , levels = c( -2 , 5 , 1 , 2 , 3 , 7 , 99 ) , labels = c( 'Not applicable' , 'Electricity' , 'Natural gas from underground pipes' , 'Propane (bottled gas)' , 'Fuel oil' , 'Wood or pellets' , 'Other' ) ) , rooftype = factor( rooftype , levels = c( -2 , 1:6 , 99 ) , labels = c( 'Not applicable' , 'Ceramic or clay tiles' , 'Wood shingles/shakes' , 'Metal' , 'Slate or synthetic slate' , 'Shingles (composition or asphalt)' , 'Concrete tiles' , 'Other' ) ) , swimpool_binary = ifelse( swimpool %in% 0:1 , swimpool , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( recs_design , "sampling" ) != 0 ) svyby( ~ one , ~ main_heating_fuel , recs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , recs_design ) svyby( ~ one , ~ main_heating_fuel , recs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ totsqft_en , recs_design ) svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ rooftype , recs_design ) svyby( ~ rooftype , ~ main_heating_fuel , recs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ totsqft_en , recs_design ) svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ rooftype , recs_design ) svyby( ~ rooftype , ~ main_heating_fuel , recs_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ totsqft_en , recs_design , 0.5 ) svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ totcsqft , denominator = ~ totsqft_en , recs_design ) Subsetting Restrict the survey design to households that cook three or more hot meals per day: sub_recs_design <- subset( recs_design , nummeal == 1 ) Calculate the mean (average) of this subset: svymean( ~ totsqft_en , sub_recs_design ) Measures of Uncertainty Extract the coefficient, standard error, 
confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ totsqft_en , recs_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ totsqft_en , ~ main_heating_fuel , recs_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( recs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ totsqft_en , recs_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ totsqft_en , recs_design , deff = TRUE ) # SRS with replacement svymean( ~ totsqft_en , recs_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ swimpool_binary , recs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( totsqft_en ~ swimpool_binary , recs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ swimpool_binary + rooftype , recs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( totsqft_en ~ swimpool_binary + rooftype , recs_design ) summary( glm_result ) Replication Example This example matches the statistic, standard error, and relative standard error shown on PDF page 8 of Using the microdata file to compute estimates and relative standard errors (RSEs) sas_v1_tf <- tempfile() sas_v1_url <- "https://www.eia.gov/consumption/residential/data/2020/sas/recs2020_public_v1.zip" download.file( sas_v1_url , sas_v1_tf , mode = 'wb' ) recs_v1_tbl <- read_sas( sas_v1_tf ) recs_v1_df <- data.frame( recs_v1_tbl ) names( recs_v1_df ) <- tolower( names( recs_v1_df ) ) recs_v1_design <- svrepdesign( data = recs_v1_df , weight = ~ nweight , repweights = 'nweight[1-9]+' , type = 'JK1' , combined.weights = TRUE , scale = 59 / 60 , mse = TRUE ) recs_v1_design <- update( recs_v1_design , natural_gas_mainspace_heat = as.numeric( fuelheat == 1 ) ) result <- svytotal( ~ natural_gas_mainspace_heat , recs_v1_design ) stopifnot( round( coef( result ) , 0 ) == 56245389 ) stopifnot( round( SE( result ) , 0 ) == 545591 ) stopifnot( round( 100 * SE( result ) / coef( result ) , 2 ) == 0.97 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
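As a quick illustration of how these verbs chain together, here is a minimal sketch (assuming the recs_srvyr_design object defined just below; filter() for subsetting and the vartype argument of survey_mean() are part of srvyr's documented interface):
recs_srvyr_design %>%
    filter( nummeal == 1 ) %>%
    summarize( mean_sqft = survey_mean( totsqft_en , vartype = "ci" ) )
Here filter() plays the role of subset() from the survey package, while vartype = "ci" requests a confidence interval alongside the point estimate.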
As a starting point for RECS users, this code replicates previously-presented examples: library(srvyr) recs_srvyr_design <- as_survey( recs_design ) Calculate the mean (average) of a linear variable, overall and by groups: recs_srvyr_design %>% summarize( mean = survey_mean( totsqft_en ) ) recs_srvyr_design %>% group_by( main_heating_fuel ) %>% summarize( mean = survey_mean( totsqft_en ) ) "],["rapid-surveys-system-rss.html", "Rapid Surveys System (RSS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Rapid Surveys System (RSS) The standardized platform to answer time-sensitive questions about emerging and priority health issues. One table with one row per AmeriSpeak or KnowledgePanel respondent. A cross-sectional survey generalizing to the noninstitutionalized adult population of the U.S. Releases expected four times per year. Conducted by the National Center for Health Statistics at the Centers for Disease Control. Please skim before you begin: NCHS Rapid Surveys System (RSS): Round 1 Survey Description Quality Profile, Rapid Surveys System Round 1 A haiku regarding this microdata: # first response heroes # question design thru publish # time 'doxed by zeno Download, Import, Preparation Download and import the first round: library(haven) sas_url <- "https://www.cdc.gov/nchs/data/rss/rss1_puf_t1.sas7bdat" rss_tbl <- read_sas( sas_url ) rss_df <- data.frame( rss_tbl ) names( rss_df ) <- tolower( names( rss_df ) ) rss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # rss_fn <- file.path( path.expand( "~" ) , "RSS" , "this_file.rds" ) # saveRDS( rss_df , file = rss_fn , compress = FALSE ) Load the same object: # rss_df <- readRDS( rss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) rss_design <- svydesign( ~ p_psu , strata = ~ p_strata , data = rss_df , weights = ~ weight_m1 , nest = TRUE ) Variable Recoding Add new columns to the data set: rss_design <- update( rss_design , how_often_use_cleaner_purifier = factor( ven_use , levels = c( -9:-6 , 0:3 ) , labels = c( "Don't Know" , "Question not asked" , "Explicit refusal/REF" , "Skipped/Implied refusal" , "Never" , "Rarely" , "Sometimes" , "Always" ) ) , has_health_insurance = ifelse( p_insur >= 0 , p_insur , NA ) , metropolitan = factor( as.numeric( p_metro_r == 1 ) , levels = 0:1 , labels = c( 'No' , 'Yes' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( rss_design , "sampling" ) != 0 ) svyby( ~ one , ~ metropolitan , rss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , rss_design ) svyby( ~ one , ~ metropolitan , rss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ p_hhsize_r , rss_design ) svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svymean ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ how_often_use_cleaner_purifier , rss_design ) svyby( ~ how_often_use_cleaner_purifier , ~ metropolitan , rss_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ p_hhsize_r , rss_design ) svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svytotal ) Calculate the weighted sum of a categorical variable, overall and 
by groups: svytotal( ~ how_often_use_cleaner_purifier , rss_design ) svyby( ~ how_often_use_cleaner_purifier , ~ metropolitan , rss_design , svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ p_hhsize_r , rss_design , 0.5 ) svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svyquantile , 0.5 , ci = TRUE ) Estimate a ratio: svyratio( numerator = ~ p_agec_r , denominator = ~ p_hhsize_r , rss_design ) Subsetting Restrict the survey design to adults that most of the time or always wear sunscreen: sub_rss_design <- subset( rss_design , sun_useface >= 3 ) Calculate the mean (average) of this subset: svymean( ~ p_hhsize_r , sub_rss_design ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ p_hhsize_r , rss_design ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ p_hhsize_r , ~ metropolitan , rss_design , svymean ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( rss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ p_hhsize_r , rss_design ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ p_hhsize_r , rss_design , deff = TRUE ) # SRS with replacement svymean( ~ p_hhsize_r , rss_design , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ has_health_insurance , rss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( p_hhsize_r ~ has_health_insurance , rss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ has_health_insurance + how_often_use_cleaner_purifier , rss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( p_hhsize_r ~ has_health_insurance + how_often_use_cleaner_purifier , rss_design ) summary( glm_result ) Replication Example This example matches the statistic and confidence intervals from the “Ever uses a portable air cleaner or purifier in home” page of the Air cleaners and purifiers dashboard: result <- svymean( ~ as.numeric( ven_use > 0 ) , subset( rss_design , ven_use >= 0 ) ) stopifnot( round( coef( result ) , 3 ) == .379 ) stopifnot( round( confint( result )[1] , 3 ) == 0.366 ) stopifnot( round( confint( result )[2] , 3 ) == 0.393 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
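As one more illustration of the verb-based syntax, a minimal sketch (assuming the rss_srvyr_design object defined just below; survey_total() and survey_quantile() are documented srvyr functions) that mirrors the svytotal and svyquantile calls shown earlier in this chapter:
rss_srvyr_design %>%
    group_by( metropolitan ) %>%
    summarize(
        weighted_count = survey_total( one ) ,
        median_hhsize = survey_quantile( p_hhsize_r , quantiles = 0.5 )
    )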
As a starting point for RSS users, this code replicates previously-presented examples: library(srvyr) rss_srvyr_design <- as_survey( rss_design ) Calculate the mean (average) of a linear variable, overall and by groups: rss_srvyr_design %>% summarize( mean = survey_mean( p_hhsize_r ) ) rss_srvyr_design %>% group_by( metropolitan ) %>% summarize( mean = survey_mean( p_hhsize_r ) ) "],["survey-of-business-owners-sbo.html", "Survey of Business Owners (SBO) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Survey of Business Owners (SBO) Before its replacement in 2018 by the Annual Business Survey, nearly every tax-filing sole proprietorship, partnership, and corporation nationwide completed this questionnaire, with 2007 the only microdata year. One table with one row per firm per state per industry, except eight collapsed geographies. A complex sample survey designed to generalize to most firms in the United States; public microdata includes classifiable (non-identifiable) firms, i.e. nearly all businesses but only about half of workers. Released quinquennially from 1972 until 2012 in the Economic Census with no updates expected. Administered by the U.S. Census Bureau. The Annual Business Survey is now conducted jointly with the National Center for Science and Engineering Statistics within the National Science Foundation. Please skim before you begin: 2007 Survey of Business Owners (SBO) Public Use Microdata Sample (PUMS) Data Users Guide Comparability to the Annual Business Survey (ABS), the Nonemployer Statistics by Demographics (NES-D) series, and the Annual Survey of Entrepreneurs (ASE) At a Glance A haiku regarding this microdata: # butchers, chandlers, baked # sea shanty, filial pie # call your mom and pop Function Definitions This survey uses a dual design variance estimation technique described in the Data Users Guide. Most users do not need to study these functions carefully. Define functions specific to only this dataset: sbo_MIcombine <- function( x , adjustment = 1.992065 ){ # pull the structure of a variance-covariance matrix variance.shell <- suppressWarnings( vcov( x$var[[1]] ) ) # initiate a function that will overwrite the diagonals. diag.replacement <- function( z ){ diag( variance.shell ) <- coef( z ) variance.shell } # overwrite all the diagonals in the variance object coef.variances <- lapply( x$var , diag.replacement ) # add the ten variance matrices together, then divide by ten midpoint <- Reduce( '+' , coef.variances ) / 10 # initiate another function that takes some object, # subtracts the midpoint, squares it, divides by ninety midpoint.var <- function( z ){ 1/10 * ( ( midpoint - z )^2 / 9 ) } # sum up all the differences into a single object variance <- Reduce( '+' , lapply( coef.variances , midpoint.var ) ) # adjust every number with the factor in the user guide adj_var <- adjustment * variance # construct a result that looks like other sbo_MIcombine methods rval <- list( coefficients = coef( x$coef ) , variance = adj_var ) # call it an MIresult class, like other sbo_MIcombine results class( rval ) <- 'MIresult' rval } sbo_with <- function ( this_design , expr , ... ){ pf <- parent.frame() expr <- substitute( expr ) expr$design <- as.name(".design") # this pulls in means, medians, totals, etc. 
# notice it uses this_design$coef results <- eval( expr , list( .design = this_design$coef ) ) # this is used to calculate the variance, adjusted variance, standard error # notice it uses the this_design$var object variances <- lapply( this_design$var$designs , function( .design ){ eval( expr , list( .design = .design ) , enclos = pf ) } ) # combine both results.. rval <- list( coef = results , var = variances ) # ..into a brand new object class class( rval ) <- 'imputationResultList' rval } sbo_subset <- function( x , ... ){ # subset the survey object coef.sub <- subset( x$coef , ... ) # replicate `var.sub` so it's got all the same attributes as `x$var` var.sub <- x$var # but then overwrite the `designs` attribute with a subset var.sub$designs <- lapply( x$var$designs , subset , ... ) # now re-create the `sbosvyimputationList` just as before.. sub.svy <- list( coef = coef.sub , var = var.sub ) # ..and give it the same class class( sub.svy ) <- 'sbosvyimputationList' sub.svy$call <- sys.call(-1) sub.svy } sbo_update <- function( x , ... ){ # update the survey object that's going to be used for # means, medians, totals, etc. coef.upd <- update( x$coef , ... ) # replicate `var.upd` so it's got all the same attributes as `x$var` var.upd <- x$var # but then overwrite the `designs` attribute with an update var.upd$designs <- lapply( x$var$designs , update , ... ) # now re-create the `sbosvyimputationList` just as before upd.svy <- list( coef = coef.upd , var = var.upd ) # ..and give it the same class class( upd.svy ) <- 'sbosvyimputationList' upd.svy } sbo_degf <- function( x ) degf( x$coef ) Download, Import, Preparation Download and import the file containing records for both coefficient estimates and variance estimation: library(httr) library(readr) tf <- tempfile() this_url <- "https://www2.census.gov/programs-surveys/sbo/datasets/2007/pums_csv.zip" GET( this_url , write_disk( tf ) , progress() ) sbo_tbl <- read_csv( tf ) sbo_df <- data.frame( sbo_tbl ) names( sbo_df ) <- tolower( names( sbo_df ) ) sbo_df[ , 'one' ] <- 1 Calculate the weights used for variance estimation: sbo_df[ , 'newwgt' ] <- 10 * sbo_df[ , 'tabwgt' ] * sqrt( 1 - 1 / sbo_df[ , 'tabwgt' ] ) Add business ownership percentages for both gender and ethnicity: # replace percent missings with zeroes for( i in 1:4 ) sbo_df[ is.na( sbo_df[ , paste0( 'pct' , i ) ] ) , paste0( 'pct' , i ) ] <- 0 # sum up ownership ethnicity and gender sbo_df[ , 'hispanic_pct' ] <- sbo_df[ , 'nonhispanic_pct' ] <- 0 sbo_df[ , 'male_pct' ] <- sbo_df[ , 'female_pct' ] <- 0 # loop through the first four owners' ethnicity and sex variables for( i in 1:4 ) { sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'H' , 'hispanic_pct' ] <- sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'H' , 'hispanic_pct' ] + sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'H' , paste0( 'pct' , i ) ] sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'N' , 'nonhispanic_pct' ] <- sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'N' , 'nonhispanic_pct' ] + sbo_df[ sbo_df[ , paste0( 'eth' , i ) ] %in% 'N' , paste0( 'pct' , i ) ] sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'M' , 'male_pct' ] <- sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'M' , 'male_pct' ] + sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'M' , paste0( 'pct' , i ) ] sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'F' , 'female_pct' ] <- sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'F' , 'female_pct' ] + sbo_df[ sbo_df[ , paste0( 'sex' , i ) ] %in% 'F' , paste0( 'pct' , i ) ] } Save Locally   Save the object at any point: # sbo_fn <- file.path( path.expand( "~" ) , "SBO" , "this_file.rds" ) # saveRDS( sbo_df , file = sbo_fn , compress = FALSE ) Load the 
same object: # sbo_df <- readRDS( sbo_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: library(survey) library(mitools) # break random groups into ten separate data.frame objects within a list var_list <- NULL for( i in 1:10 ) { var_list <- c( var_list , list( subset( sbo_df , rg == i ) ) ) } sbo_coef <- svydesign( id = ~ 1 , weight = ~ tabwgt , data = sbo_df ) sbo_var <- svydesign( id = ~ 1 , weight = ~ newwgt , data = imputationList( var_list ) ) sbo_design <- list( coef = sbo_coef , var = sbo_var ) class( sbo_design ) <- 'sbosvyimputationList' Variable Recoding Add new columns to the data set: sbo_design <- sbo_update( sbo_design , established_before_2000 = ifelse( established %in% c( '0' , 'A' ) , NA , as.numeric( established < 4 ) ) , healthins = factor( healthins , levels = 1:2 , labels = c( "offered health insurance" , "did not offer health insurance" ) ) , hispanic_ownership = factor( ifelse( hispanic_pct == nonhispanic_pct , 2 , ifelse( hispanic_pct > nonhispanic_pct , 1 , ifelse( nonhispanic_pct > hispanic_pct , 3 , NA ) ) ) , levels = 1:3 , labels = c( 'hispanic' , 'equally hisp/non' , 'non-hispanic' ) ) , gender_ownership = factor( ifelse( male_pct == female_pct , 2 , ifelse( male_pct > female_pct , 1 , ifelse( female_pct > male_pct , 3 , NA ) ) ) , levels = 1:3 , labels = c( 'male' , 'equally male/female' , 'female' ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svyby( ~ one , ~ one , unwtd.count ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ one , ~ gender_ownership , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svytotal( ~ one ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ one , ~ gender_ownership , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svymean( ~ n07_employer , na.rm = TRUE ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ n07_employer , ~ gender_ownership , svymean , na.rm = TRUE ) ) ) Calculate the sum of a linear variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svytotal( ~ receipts_noisy ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svytotal( ~ n07_employer , na.rm = TRUE ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ n07_employer , ~ gender_ownership , svytotal , na.rm = TRUE ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: sbo_MIcombine( sbo_with( sbo_design , svyquantile( ~ receipts_noisy , 0.5 , se = TRUE ) ) ) sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svyquantile , 0.5 , se = TRUE , ci = TRUE ) ) ) Estimate a ratio: sbo_MIcombine( sbo_with( sbo_design , svyratio( numerator = ~ receipts_noisy , denominator = ~ employment_noisy ) ) ) Subsetting Restrict the survey design to jointly owned by husband and wife: sub_sbo_design <- 
sbo_subset( sbo_design , husbwife %in% 1:3 ) Calculate the mean (average) of this subset: sbo_MIcombine( sbo_with( sub_sbo_design , svymean( ~ receipts_noisy ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ gender_ownership , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: sbo_degf( sbo_design ) Calculate the complex sample survey-adjusted variance of any statistic: sbo_MIcombine( sbo_with( sbo_design , svyvar( ~ receipts_noisy ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy , deff = TRUE ) ) ) # SRS with replacement sbo_MIcombine( sbo_with( sbo_design , svymean( ~ receipts_noisy , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # # sbo_MIsvyciprop( ~ established_before_2000 , sbo_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # # sbo_MIsvyttest( receipts_noisy ~ established_before_2000 , sbo_design ) Perform a chi-squared test of association for survey data: # # sbo_MIsvychisq( ~ established_before_2000 + n07_employer , sbo_design ) Perform a survey-weighted generalized linear model: glm_result <- sbo_MIcombine( sbo_with( sbo_design , svyglm( receipts_noisy ~ established_before_2000 + n07_employer ) ) ) glm_result Replication Example This example matches the statistics and relative standard errors from three Appendix B columns: hispanic_receipts_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ receipts_noisy , ~ hispanic_ownership , svytotal ) ) ) hispanic_payroll_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ payroll_noisy , ~ hispanic_ownership , svytotal ) ) ) hispanic_employment_result <- sbo_MIcombine( sbo_with( sbo_design , svyby( ~ employment_noisy , ~ hispanic_ownership , svytotal ) ) ) Estimates at the U.S. Level using the PUMS Tables for: stopifnot( round( coef( hispanic_receipts_result )[ 'hispanic' ] , 0 ) == 350763923 ) stopifnot( round( coef( hispanic_receipts_result )[ 'equally hisp/non' ] , 0 ) == 56166354 ) stopifnot( round( coef( hispanic_receipts_result )[ 'non-hispanic' ] , 0 ) == 10540609303 ) stopifnot( round( coef( hispanic_payroll_result )[ 'hispanic' ] , 0 ) == 54367702 ) stopifnot( round( coef( hispanic_payroll_result )[ 'equally hisp/non' ] , 0 ) == 11083148 ) stopifnot( round( coef( hispanic_payroll_result )[ 'non-hispanic' ] , 0 ) == 1875353228 ) stopifnot( round( coef( hispanic_employment_result )[ 'hispanic' ] , 0 ) == 2026406 ) stopifnot( round( coef( hispanic_employment_result )[ 'equally hisp/non' ] , 0 ) == 400152 ) stopifnot( round( coef( hispanic_employment_result )[ 'non-hispanic' ] , 0 ) == 56889606 ) Relative Standard Errors of Estimates at the U.S. 
Level using the PUMS Tables for: stopifnot( round( cv( hispanic_receipts_result )[ 'hispanic' ] , 2 ) == 0.02 ) stopifnot( round( cv( hispanic_receipts_result )[ 'equally hisp/non' ] , 2 ) == 0.06 ) stopifnot( round( cv( hispanic_receipts_result )[ 'non-hispanic' ] , 2 ) == 0 ) stopifnot( round( cv( hispanic_payroll_result )[ 'hispanic' ] , 2 ) == 0.01 ) stopifnot( round( cv( hispanic_payroll_result )[ 'equally hisp/non' ] , 2 ) == 0.06 ) stopifnot( round( cv( hispanic_payroll_result )[ 'non-hispanic' ] , 2 ) == 0 ) stopifnot( round( cv( hispanic_employment_result )[ 'hispanic' ] , 2 ) == 0.01 ) stopifnot( round( cv( hispanic_employment_result )[ 'equally hisp/non' ] , 2 ) == 0.05 ) stopifnot( round( cv( hispanic_employment_result )[ 'non-hispanic' ] , 2 ) == 0 ) "],["survey-of-consumer-finances-scf.html", "Survey of Consumer Finances (SCF) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey  ", " Survey of Consumer Finances (SCF) A comprehensive survey of household wealth, the U.S. central bank studies net worth across the country by asking about both active and passive income, mortgages, pensions, credit card debt, even car leases. Five implicates, each containing one row per sampled household to account for statistical uncertainty. A complex sample survey designed to generalize to the civilian non-institutional U.S. population. Released triennially since 1989. Administered by the Board of Governors of the Federal Reserve System. Please skim before you begin: Measuring Income and Wealth at the Top Using Administrative and Survey Data Wikipedia Entry A haiku regarding this microdata: # incomes, assets, debts # high net worth oversample # pig bank laproscope Function Definitions This survey uses a multiply-imputed variance estimation technique described in the 2004 Codebook. Most users do not need to study this function carefully. Define a function specific to only this dataset: scf_MIcombine <- function (results, variances, call = sys.call(), df.complete = Inf, ...) 
{ m <- length(results) oldcall <- attr(results, "call") if (missing(variances)) { variances <- suppressWarnings(lapply(results, vcov)) results <- lapply(results, coef) } vbar <- variances[[1]] cbar <- results[[1]] for (i in 2:m) { cbar <- cbar + results[[i]] # MODIFICATION: # vbar <- vbar + variances[[i]] } cbar <- cbar/m # MODIFICATION: # vbar <- vbar/m evar <- var(do.call("rbind", results)) r <- (1 + 1/m) * evar/vbar df <- (m - 1) * (1 + 1/r)^2 if (is.matrix(df)) df <- diag(df) if (is.finite(df.complete)) { dfobs <- ((df.complete + 1)/(df.complete + 3)) * df.complete * vbar/(vbar + evar) if (is.matrix(dfobs)) dfobs <- diag(dfobs) df <- 1/(1/dfobs + 1/df) } if (is.matrix(r)) r <- diag(r) rval <- list(coefficients = cbar, variance = vbar + evar * (m + 1)/m, call = c(oldcall, call), nimp = m, df = df, missinfo = (r + 2/(df + 3))/(r + 1)) class(rval) <- "MIresult" rval } Define a function to download and import each stata file: library(haven) scf_dta_import <- function( this_url ){ this_tf <- tempfile() download.file( this_url , this_tf , mode = 'wb' ) this_tbl <- read_dta( this_tf ) this_df <- data.frame( this_tbl ) file.remove( this_tf ) names( this_df ) <- tolower( names( this_df ) ) this_df } Download, Import, Preparation Download and import the full, summary extract, and replicate weights tables: scf_df <- scf_dta_import( "https://www.federalreserve.gov/econres/files/scf2022s.zip" ) ext_df <- scf_dta_import( "https://www.federalreserve.gov/econres/files/scfp2022s.zip" ) scf_rw_df <- scf_dta_import( "https://www.federalreserve.gov/econres/files/scf2022rw1s.zip" ) Confirm both the full public data and the summary extract contain five records per family: stopifnot( nrow( scf_df ) == nrow( scf_rw_df ) * 5 ) stopifnot( nrow( scf_df ) == nrow( ext_df ) ) Confirm only the primary economic unit and the five implicate identifiers overlap: stopifnot( all( sort( intersect( names( scf_df ) , names( ext_df ) ) ) == c( 'y1' , 'yy1' ) ) ) stopifnot( all( sort( intersect( names( scf_df ) , names( scf_rw_df ) ) ) == c( 'y1' , 'yy1' ) ) ) stopifnot( all( sort( intersect( names( ext_df ) , names( scf_rw_df ) ) ) == c( 'y1' , 'yy1' ) ) ) Remove the implicate identifier from the replicate weights table, add a column of fives for weighting: scf_rw_df[ , 'y1' ] <- NULL scf_df[ , 'five' ] <- 5 Save Locally   Save the object at any point: # scf_fn <- file.path( path.expand( "~" ) , "SCF" , "this_file.rds" ) # saveRDS( scf_df , file = scf_fn , compress = FALSE ) Load the same object: # scf_df <- readRDS( scf_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: Break the main table into five different implicates based on the final character of the column y1: library(stringr) s1_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 1 , ] s2_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 2 , ] s3_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 3 , ] s4_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 4 , ] s5_df <- scf_df[ str_sub( scf_df[ , 'y1' ] , -1 , -1 ) == 5 , ] Combine these into a single list, then merge each implicate with the summary extract: scf_imp <- list( s1_df , s2_df , s3_df , s4_df , s5_df ) scf_list <- lapply( scf_imp , merge , ext_df ) Replace all missing values in the replicate weights table with zeroes, multiply the replicate weights by the multiplication factor, then only keep the unique identifier and the final (combined) replicate weights: scf_rw_df[ is.na( scf_rw_df ) ] <- 0 scf_rw_df[ , paste0( 'wgt' , 1:999 ) ] <- scf_rw_df[ , 
paste0( 'wt1b' , 1:999 ) ] * scf_rw_df[ , paste0( 'mm' , 1:999 ) ] scf_rw_df <- scf_rw_df[ , c( 'yy1' , paste0( 'wgt' , 1:999 ) ) ] Sort both the five implicates and also the replicate weights table by the unique identifier: scf_list <- lapply( scf_list , function( w ) w[ order( w[ , 'yy1' ] ) , ] ) scf_rw_df <- scf_rw_df[ order( scf_rw_df[ , 'yy1' ] ) , ] Define the design: library(survey) library(mitools) scf_design <- svrepdesign( weights = ~wgt , repweights = scf_rw_df[ , -1 ] , data = imputationList( scf_list ) , scale = 1 , rscales = rep( 1 / 998 , 999 ) , mse = FALSE , type = "other" , combined.weights = TRUE ) Variable Recoding Add new columns to the data set: scf_design <- update( scf_design , hhsex = factor( hhsex , levels = 1:2 , labels = c( "male" , "female" ) ) , married = as.numeric( married == 1 ) , edcl = factor( edcl , levels = 1:4 , labels = c( "less than high school" , "high school or GED" , "some college" , "college degree" ) ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: scf_MIcombine( with( scf_design , svyby( ~ five , ~ five , unwtd.count ) ) ) scf_MIcombine( with( scf_design , svyby( ~ five , ~ hhsex , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: scf_MIcombine( with( scf_design , svytotal( ~ five ) ) ) scf_MIcombine( with( scf_design , svyby( ~ five , ~ hhsex , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: scf_MIcombine( with( scf_design , svymean( ~ networth ) ) ) scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svymean ) ) ) Calculate the distribution of a categorical variable, overall and by groups: scf_MIcombine( with( scf_design , svymean( ~ edcl ) ) ) scf_MIcombine( with( scf_design , svyby( ~ edcl , ~ hhsex , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: scf_MIcombine( with( scf_design , svytotal( ~ networth ) ) ) scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svytotal ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: scf_MIcombine( with( scf_design , svytotal( ~ edcl ) ) ) scf_MIcombine( with( scf_design , svyby( ~ edcl , ~ hhsex , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: scf_MIcombine( with( scf_design , svyquantile( ~ networth , 0.5 , se = TRUE , interval.type = 'quantile' ) ) ) scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svyquantile , 0.5 , se = TRUE , interval.type = 'quantile' , ci = TRUE ) ) ) Estimate a ratio: scf_MIcombine( with( scf_design , svyratio( numerator = ~ income , denominator = ~ networth ) ) ) Subsetting Restrict the survey design to labor force participants: sub_scf_design <- subset( scf_design , lf == 1 ) Calculate the mean (average) of this subset: scf_MIcombine( with( sub_scf_design , svymean( ~ networth ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- scf_MIcombine( with( scf_design , svymean( ~ networth ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- scf_MIcombine( with( scf_design , svyby( ~ networth , ~ hhsex , svymean ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) 
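Beyond these extraction functions, the MIresult list returned by the scf_MIcombine() function defined earlier also carries the number of implicates and the estimated fraction of missing information; a small sketch using the this_result object from the previous step and only components that function actually constructs:
this_result$nimp # number of implicates combined (five)
this_result$missinfo # estimated fraction of missing information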
Calculate the degrees of freedom of any survey design object: degf( scf_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: scf_MIcombine( with( scf_design , svyvar( ~ networth ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement scf_MIcombine( with( scf_design , svymean( ~ networth , deff = TRUE ) ) ) # SRS with replacement scf_MIcombine( with( scf_design , svymean( ~ networth , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: # MIsvyciprop( ~ married , scf_design , # method = "likelihood" ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( networth ~ married , scf_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ married + edcl , scf_design ) Perform a survey-weighted generalized linear model: glm_result <- scf_MIcombine( with( scf_design , svyglm( networth ~ married + edcl ) ) ) summary( glm_result ) Replication Example This example matches the “Table 4” tab’s cell Y6 of the Excel Based on Public Data: mean_net_worth <- scf_MIcombine( with( scf_design , svymean( ~ networth ) ) ) stopifnot( round( coef( mean_net_worth ) / 1000 , 1 ) == 1059.5 ) This example comes within $500 of the standard error of mean net worth from Table 2 of the Federal Reserve Bulletin, displaying the minor differences between the Internal Data and Public Data: stopifnot( abs( 23.2 - round( SE( mean_net_worth ) / 1000 , 1 ) ) < 0.5 ) This example matches the “Table 4” tab’s cell X6 of the Excel Based on Public Data: # compute quantile with all five implicates stacked (not the recommended technique) fake_design <- svydesign( ~ 1 , data = ext_df[ c( 'networth' , 'wgt' ) ] , weights = ~ wgt ) median_net_worth_incorrect_errors <- svyquantile( ~ networth , fake_design , 0.5 ) stopifnot( round( coef( median_net_worth_incorrect_errors ) / 1000 , 2 ) == 192.7 ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for SCF users, this code calculates the gini coefficient on complex sample survey data: library(convey) scf_design$designs <- lapply( scf_design$designs , convey_prep ) scf_MIcombine( with( scf_design , svygini( ~ networth ) ) ) "],["survey-of-income-and-program-participation-sipp.html", "Survey of Income and Program Participation (SIPP) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Poverty and Inequality Estimation with convey   Analysis Examples with srvyr  ", " Survey of Income and Program Participation (SIPP) The primary longitudinal assessment of income fluctuation, labor force participation, and social programs. Annual tables with one record per month per person per sampled household, time period weights. A complex sample generalizing to the U.S. civilian non-institutionalized population across varying time periods. Multi-year panels since the 1980s, with its current and now permanent four-year rotation beginning in 2018. Administered and financed by the US Census Bureau. 
Please skim before you begin: 2023 Survey of Income and Program Participation Users’ Guide 2023 Data User Notes A haiku regarding this microdata: # federal programs # poverty oversample # monthly dynamics Download, Import, Preparation Determine which variables from the main table to import: variables_to_keep <- c( 'ssuid' , 'pnum' , 'monthcode' , 'spanel' , 'swave' , 'erelrpe' , 'tlivqtr' , 'wpfinwgt' , 'rmesr' , 'thcyincpov' , 'tfcyincpov' , 'tehc_st' , 'rhicovann' , 'rfpov' , 'thnetworth' , 'tftotinc' ) Download and import the latest main file: library(httr) library(data.table) main_tf <- tempfile() main_url <- paste0( "https://www2.census.gov/programs-surveys/sipp/" , "data/datasets/2023/pu2023_csv.zip" ) GET( main_url , write_disk( main_tf ) , progress() ) main_csv <- unzip( main_tf , exdir = tempdir() ) sipp_main_dt <- fread( main_csv , sep = "|" , select = toupper( variables_to_keep ) ) sipp_main_df <- data.frame( sipp_main_dt ) names( sipp_main_df ) <- tolower( names( sipp_main_df ) ) Download and import the appropriate replicate weights file: rw_tf <- tempfile() rw_url <- paste0( "https://www2.census.gov/programs-surveys/sipp/" , "data/datasets/2023/rw2023_csv.zip" ) GET( rw_url , write_disk( rw_tf ) , progress() ) rw_csv <- unzip( rw_tf , exdir = tempdir() ) sipp_rw_dt <- fread( rw_csv , sep = "|" ) sipp_rw_df <- data.frame( sipp_rw_dt ) names( sipp_rw_df ) <- tolower( names( sipp_rw_df ) ) Limit both files to December records for a point-in-time estimate, then merge: sipp_df <- merge( sipp_main_df[ sipp_main_df[ , 'monthcode' ] %in% 12 , ] , sipp_rw_df[ sipp_rw_df[ , 'monthcode' ] %in% 12 , ] , by = c( 'ssuid' , 'pnum' , 'monthcode' , 'spanel' , 'swave' ) ) stopifnot( nrow( sipp_df ) == sum( sipp_rw_df[ , 'monthcode' ] %in% 12 ) ) Save Locally   Save the object at any point: # sipp_fn <- file.path( path.expand( "~" ) , "SIPP" , "this_file.rds" ) # saveRDS( sipp_df , file = sipp_fn , compress = FALSE ) Load the same object: # sipp_df <- readRDS( sipp_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) sipp_design <- svrepdesign( data = sipp_df , weights = ~ wpfinwgt , repweights = "repwgt([1-9]+)" , type = "Fay" , rho = 0.5 ) Variable Recoding Add new columns to the data set: rmesr_values <- c( "With a job entire month, worked all weeks", "With a job all month, absent from work without pay 1+ weeks, absence not due to layoff", "With a job all month, absent from work without pay 1+ weeks, absence due to layoff", "With a job at least 1 but not all weeks, no time on layoff and no time looking for work", "With a job at least 1 but not all weeks, some weeks on layoff or looking for work", "No job all month, on layoff or looking for work all weeks", "No job all month, at least one but not all weeks on layoff or looking for work", "No job all month, no time on layoff and no time looking for work" ) sipp_design <- update( sipp_design , one = 1 , employment_status = factor( rmesr , levels = 1:8 , labels = rmesr_values ) , household_below_poverty = as.numeric( thcyincpov < 1 ) , family_below_poverty = as.numeric( tfcyincpov < 1 ) , state_name = factor( as.numeric( tehc_st ) , levels = c(1L, 2L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 54L, 55L, 56L, 60L, 61L) , labels = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", 
"District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "Puerto Rico", "Foreign Country") ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( sipp_design , "sampling" ) != 0 ) svyby( ~ one , ~ state_name , sipp_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , sipp_design ) svyby( ~ one , ~ state_name , sipp_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ tftotinc , sipp_design , na.rm = TRUE ) svyby( ~ tftotinc , ~ state_name , sipp_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ employment_status , sipp_design , na.rm = TRUE ) svyby( ~ employment_status , ~ state_name , sipp_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ tftotinc , sipp_design , na.rm = TRUE ) svyby( ~ tftotinc , ~ state_name , sipp_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ employment_status , sipp_design , na.rm = TRUE ) svyby( ~ employment_status , ~ state_name , sipp_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ tftotinc , sipp_design , 0.5 , na.rm = TRUE ) svyby( ~ tftotinc , ~ state_name , sipp_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ tftotinc , denominator = ~ rfpov , sipp_design , na.rm = TRUE ) Subsetting Restrict the survey design to individuals ever covered by health insurance during the year: sub_sipp_design <- subset( sipp_design , rhicovann == 1 ) Calculate the mean (average) of this subset: svymean( ~ tftotinc , sub_sipp_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ tftotinc , sipp_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ tftotinc , ~ state_name , sipp_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( sipp_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ tftotinc , sipp_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ tftotinc , sipp_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ tftotinc , sipp_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate 
near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ family_below_poverty , sipp_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( tftotinc ~ family_below_poverty , sipp_design ) Perform a chi-squared test of association for survey data: svychisq( ~ family_below_poverty + employment_status , sipp_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( tftotinc ~ family_below_poverty + employment_status , sipp_design ) summary( glm_result ) Replication Example This example matches statistics and standard errors from the Wealth and Asset Ownership for Households, by Type of Asset and Selected Characteristics: 2022: Restrict the design to permanent residence-based householders to match the count in Table 4: sipp_household_design <- subset( sipp_design , erelrpe %in% 1:2 & tlivqtr %in% 1:2 ) stopifnot( round( coef( svytotal( ~ one , sipp_household_design ) ) / 1000 , -2 ) == 134100 ) Compute Household Net Worth distribution and standard errors across the Total row of Tables 4 and 4A: sipp_household_design <- update( sipp_household_design , thnetworth_category = factor( findInterval( thnetworth , c( 1 , 5000 , 10000 , 25000 , 50000 , 100000 , 250000 , 500000 ) ) , levels = 0:8 , labels = c( "Zero or Negative" , "$1 to $4,999" , "$5,000 to $9,999" , "$10,000 to $24,999" , "$25,000 to $49,999" , "$50,000 to $99,999" , "$100,000 to $249,999" , "$250,000 to $499,999" , "$500,000 or over" ) ) ) results <- svymean( ~ thnetworth_category , sipp_household_design ) stopifnot( all.equal( as.numeric( round( coef( results ) * 100 , 1 ) ) , c( 11.1 , 6.8 , 3.5 , 5.7 , 5.6 , 7.8 , 15.9 , 14.4 , 29.2 ) ) ) stopifnot( all.equal( as.numeric( round( SE( results ) * 100 , 1 ) ) , c( 0.3 , 0.2 , 0.2 , 0.2 , 0.2 , 0.2 , 0.3 , 0.3 , 0.3 ) ) ) Poverty and Inequality Estimation with convey   The R convey library estimates measures of income concentration, poverty, inequality, and wellbeing. This textbook details the available features. As a starting point for SIPP users, this code calculates the gini coefficient on complex sample survey data: library(convey) sipp_design <- convey_prep( sipp_design ) svygini( ~ tftotinc , sipp_design , na.rm = TRUE ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
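One aside before those replications: the design above was defined with type = "Fay" and rho = 0.5, meaning each replicate gently re-weights the sample rather than zeroing out half of it, and the variance calculation rescales squared deviations by ( 1 - rho )^2. Here is a minimal sketch of that formula, using invented replicate estimates rather than actual SIPP output:

```{r eval = FALSE , results = "hide" }
# minimal sketch of the Fay replicate variance formula, for intuition only --
# the numbers below are invented, not estimates from sipp_design

full_sample_estimate <- 100.1

replicate_estimates <- c( 100.4 , 99.6 , 100.9 , 99.7 )

rho <- 0.5

# sum of squared deviations from the full-sample estimate,
# rescaled by the replicate count and ( 1 - rho )^2
fay_variance <-
    sum( ( replicate_estimates - full_sample_estimate )^2 ) /
    ( length( replicate_estimates ) * ( 1 - rho )^2 )

# the standard error is the square root of that variance
sqrt( fay_variance )
```

The survey package performs roughly this same calculation internally whenever a statistic is requested from sipp_design, so none of this arithmetic needs to be typed by hand.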
As a starting point for SIPP users, this code replicates previously-presented examples: library(srvyr) sipp_srvyr_design <- as_survey( sipp_design ) Calculate the mean (average) of a linear variable, overall and by groups: sipp_srvyr_design %>% summarize( mean = survey_mean( tftotinc , na.rm = TRUE ) ) sipp_srvyr_design %>% group_by( state_name ) %>% summarize( mean = survey_mean( tftotinc , na.rm = TRUE ) ) "],["social-security-public-use-data-files-ssa.html", "Social Security Public-Use Data Files (SSA) Recommended Reading Download, Import, Preparation Analysis Examples with base R   Replication Example Analysis Examples with dplyr   Analysis Examples with data.table   Analysis Examples with duckdb  ", " Social Security Public-Use Data Files (SSA) Microdata from administrative sources like the Master Beneficiary Record and the Supplemental Security Record. Tables contain either one record per person or one record per person per year. A systematic sample of either social security number holders (most americans) or program recipients (current beneficiaries). Multiply 1% samples by 100 to get weighted statistics, 5% samples by 20. No expected release timeline. Released by the Office of Research, Evaluation, and Statistics, US Social Security Administration. Recommended Reading Two Methodology Documents: The 2006 Earnings Public-Use Microdata File: An Introduction Comparing Earnings Estimates from the 2006 Public-Use File and the Annual Statistical Supplement One Haiku: # annual earnings. # for pensioner payouts, see # the '04 extract Download, Import, Preparation Download and import the 1951-2006 one percent files with one record per person and per person-year: library(haven) library(httr) tf <- tempfile() ssa_url <- "https://www.ssa.gov/policy/docs/microdata/epuf/epuf2006_sas_files.zip" GET( ssa_url , write_disk( tf ) , progress() ) ssa_files <- unzip( tf , exdir = tempdir() ) ssa_fn <- grep( 'demographic' , ssa_files , value = TRUE ) annual_fn <- grep( 'annual' , ssa_files , value = TRUE ) ssa_tbl <- read_sas( ssa_fn ) annual_tbl <- read_sas( annual_fn ) ssa_df <- data.frame( ssa_tbl ) annual_df <- data.frame( annual_tbl ) names( ssa_df ) <- tolower( names( ssa_df ) ) names( annual_df ) <- tolower( names( annual_df ) ) Sum up 1951-1952 and 1953-2006 earnings, and also 1953-2006 credits, copying the naming convention: summed_earnings_5152 <- with( subset( annual_df , year_earn %in% 1951:1952 ) , aggregate( annual_earnings , list( id ) , sum ) ) names( summed_earnings_5152 ) <- c( 'id' , 'tot_cov_earn5152' ) summed_earnings_5306 <- with( subset( annual_df , year_earn > 1952 ) , aggregate( annual_earnings , list( id ) , sum ) ) names( summed_earnings_5306 ) <- c( 'id' , 'tot_cov_earn5306' ) summed_quarters_5306 <- with( subset( annual_df , year_earn > 1952 ) , aggregate( annual_qtrs , list( id ) , sum ) ) names( summed_quarters_5306 ) <- c( 'id' , 'qc5306' ) Isolate a single year of earnings: earnings_2006 <- annual_df[ annual_df[ , 'year_earn' ] == 2006 , c( 'id' , 'annual_earnings' ) ] names( earnings_2006 ) <- c( 'id' , 'tot_cov_earn06' ) Merge each new column onto the person-level table, then add zeroes for people without earnings in each period: stopifnot( all( !is.na( ssa_df ) ) ) before_nrow <- nrow( ssa_df ) ssa_df <- merge( ssa_df , summed_earnings_5152 , all.x = TRUE ) ssa_df <- merge( ssa_df , summed_earnings_5306 , all.x = TRUE ) ssa_df <- merge( ssa_df , summed_quarters_5306 , all.x = TRUE ) ssa_df <- merge( ssa_df , earnings_2006 , all.x = TRUE ) ssa_df[ is.na( ssa_df ) ] <- 0 stopifnot( nrow( 
ssa_df ) == before_nrow ) Save Locally   Save the object at any point: # ssa_fn <- file.path( path.expand( "~" ) , "SSA" , "this_file.rds" ) # saveRDS( ssa_df , file = ssa_fn , compress = FALSE ) Load the same object: # ssa_df <- readRDS( ssa_fn ) Variable Recoding Add new columns to the data set: ssa_df <- transform( ssa_df , decade_of_birth = floor( yob / 10 ) * 10 , sex = factor( sex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , tot_cov_earn3706 = ( tot_cov_earn3750 + tot_cov_earn5152 + tot_cov_earn5306 ) , qc3706 = ( qc3750 + qc5152 + qc5306 ) , any_earnings_2006 = ( tot_cov_earn06 > 0 ) , earnings_periods = factor( ifelse( ( tot_cov_earn5152 + tot_cov_earn5306 > 0 ) & tot_cov_earn3750 > 0 , 1 , ifelse( tot_cov_earn5152 > 0 | tot_cov_earn5306 > 0 , 2 , ifelse( tot_cov_earn3750 > 0 , 3 , 4 ) ) ) , levels = 1:4 , labels = c( 'Earnings in both periods' , 'Earnings during 1951-2006 only' , 'Earnings during 1937-1950 only' , 'No earnings' ) ) ) Analysis Examples with base R   Unweighted Counts Count the unweighted number of records in the table, overall and by groups: nrow( ssa_df ) table( ssa_df[ , "sex" ] , useNA = "always" ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: mean( ssa_df[ , "tot_cov_earn3706" ] ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , mean ) Calculate the distribution of a categorical variable, overall and by groups: prop.table( table( ssa_df[ , "decade_of_birth" ] ) ) prop.table( table( ssa_df[ , c( "decade_of_birth" , "sex" ) ] ) , margin = 2 ) Calculate the sum of a linear variable, overall and by groups: sum( ssa_df[ , "tot_cov_earn3706" ] ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , sum ) Calculate the median (50th percentile) of a linear variable, overall and by groups: quantile( ssa_df[ , "tot_cov_earn3706" ] , 0.5 ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , quantile , 0.5 ) Subsetting Limit your data.frame to individuals with at least forty lifetime credits: sub_ssa_df <- subset( ssa_df , qc3706 >= 40 ) Calculate the mean (average) of this subset: mean( sub_ssa_df[ , "tot_cov_earn3706" ] ) Measures of Uncertainty Calculate the variance, overall and by groups: var( ssa_df[ , "tot_cov_earn3706" ] ) tapply( ssa_df[ , "tot_cov_earn3706" ] , ssa_df[ , "sex" ] , var ) Regression Models and Tests of Association Perform a t-test: t.test( tot_cov_earn3706 ~ any_earnings_2006 , ssa_df ) Perform a chi-squared test of association: this_table <- table( ssa_df[ , c( "any_earnings_2006" , "decade_of_birth" ) ] ) chisq.test( this_table ) Perform a generalized linear model: glm_result <- glm( tot_cov_earn3706 ~ any_earnings_2006 + decade_of_birth , data = ssa_df ) summary( glm_result ) Replication Example This example matches statistics in The 2006 Earnings Public-Use Microdata File: An Introduction: Chart 5. Percentage distribution of individuals in EPUF, by capped Social Security taxable earnings status: chart_five_results <- prop.table( table( ssa_df[ , 'earnings_periods' ] ) ) chart_five_results <- round( 100 * chart_five_results ) stopifnot( chart_five_results[ 'Earnings in both periods' ] == 16 ) stopifnot( chart_five_results[ 'Earnings during 1951-2006 only' ] == 55 ) stopifnot( chart_five_results[ 'Earnings during 1937-1950 only' ] == 4 ) stopifnot( chart_five_results[ 'No earnings' ] == 25 ) Table 4. 
Average and median Social Security taxable earnings in EPUF, by sex, 1951–2006 (in dollars): nonzero_2006_earners <- ssa_df[ ssa_df[ , 'tot_cov_earn06' ] > 0 , 'tot_cov_earn06' ] stopifnot( round( mean( nonzero_2006_earners ) , 0 ) == 30953 ) stopifnot( round( quantile( nonzero_2006_earners )[ 3 ] , 0 ) == 24000 ) Table A1. Number and percentage distribution of individuals with Social Security taxable earnings records in EPUF, by sex, 1951–2006: nonzero_2006_earners <- ssa_df[ ssa_df[ , 'tot_cov_earn06' ] > 0 , ] stopifnot( round( mean( nonzero_2006_earners[ , 'tot_cov_earn06' ] ) , 0 ) == 30953 ) stopifnot( round( quantile( nonzero_2006_earners[ , 'tot_cov_earn06' ] )[ 3 ] , 0 ) == 24000 ) This example matches statistics in Comparing Earnings Estimates from the 2006 Earnings Public-Use File and the Annual Statistical Supplement: Table 4. Comparing Supplement and EPUF estimates: Number of all, male, and female workers with any earnings during the year, 1951–2006: stopifnot( round( nrow( nonzero_2006_earners ) * 100 , -3 ) == 156280000 ) earners_in_2006_by_sex <- table( nonzero_2006_earners[ , 'sex' ] ) * 100 stopifnot( round( earners_in_2006_by_sex[ 'male' ] , -3 ) == 81576000 ) stopifnot( round( earners_in_2006_by_sex[ 'female' ] , -3 ) == 74681000 ) Analysis Examples with dplyr   The R dplyr library offers an alternative grammar of data manipulation to base R and SQL syntax. dplyr offers many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, and the tidyverse style of non-standard evaluation. This vignette details the available features. As a starting point for SSA users, this code replicates previously-presented examples: library(dplyr) ssa_tbl <- as_tibble( ssa_df ) Calculate the mean (average) of a linear variable, overall and by groups: ssa_tbl %>% summarize( mean = mean( tot_cov_earn3706 ) ) ssa_tbl %>% group_by( sex ) %>% summarize( mean = mean( tot_cov_earn3706 ) ) Analysis Examples with data.table   The R data.table library provides a high-performance version of base R’s data.frame with syntax and feature enhancements for ease of use, convenience and programming speed. data.table offers concise syntax: fast to type, fast to read, fast speed, memory efficiency, a careful API lifecycle management, an active community, and a rich set of features. This vignette details the available features. As a starting point for SSA users, this code replicates previously-presented examples: library(data.table) ssa_dt <- data.table( ssa_df ) Calculate the mean (average) of a linear variable, overall and by groups: ssa_dt[ , mean( tot_cov_earn3706 ) ] ssa_dt[ , mean( tot_cov_earn3706 ) , by = sex ] Analysis Examples with duckdb   The R duckdb library provides an embedded analytical data management system with support for the Structured Query Language (SQL). duckdb offers a simple, feature-rich, fast, and free SQL OLAP management system. This vignette details the available features. 
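Since the examples below copy ssa_df into a database table, SQL WHERE clauses can also reproduce the base R subsets shown earlier. A minimal sketch with a throwaway in-memory connection; the sketch_con and ssa_sketch names are invented for illustration and assume the ssa_df object created above:

```{r eval = FALSE , results = "hide" }
library(duckdb)

# open a temporary in-memory connection, separate from any saved database
sketch_con <- dbConnect( duckdb::duckdb() )

# copy the person-level table into the database
dbWriteTable( sketch_con , 'ssa_sketch' , ssa_df )

# mean lifetime taxable earnings among individuals with forty or more
# lifetime credits, mirroring subset( ssa_df , qc3706 >= 40 ) from above
dbGetQuery(
    sketch_con ,
    'SELECT AVG( tot_cov_earn3706 ) FROM ssa_sketch WHERE qc3706 >= 40'
)

# shut down the temporary connection
dbDisconnect( sketch_con , shutdown = TRUE )
```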
As a starting point for SSA users, this code replicates previously-presented examples: library(duckdb) con <- dbConnect( duckdb::duckdb() , dbdir = 'my-db.duckdb' ) dbWriteTable( con , 'ssa' , ssa_df ) Calculate the mean (average) of a linear variable, overall and by groups: dbGetQuery( con , 'SELECT AVG( tot_cov_earn3706 ) FROM ssa' ) dbGetQuery( con , 'SELECT sex , AVG( tot_cov_earn3706 ) FROM ssa GROUP BY sex' ) "],["trends-in-international-mathematics-and-science-study-timss.html", "Trends in International Mathematics and Science Study (TIMSS) Function Definitions Download, Import, Preparation Analysis Examples with the survey library   Replication Example", " Trends in International Mathematics and Science Study (TIMSS) A comparative study of student achievement in math and science across more than 50 nations. Grade-specific tables with one record per school, student, or teacher, plus files containing student achievement, home background, student-teacher linkage, and within-country scoring reliability. A complex survey generalizing to fourth- and eighth-grade populations of participating countries. Released quadrennially since 1995. Funded by the International Association for the Evaluation of Educational Achievement, run at Boston College. Please skim before you begin: TIMSS 2019 User Guide for the International Database, 2nd Edition Methods and Procedures: TIMSS 2019 Technical Report A haiku regarding this microdata: # brando for stella, # gump's jenny, rock's adrian, # students toward math test Function Definitions This survey uses a multiply-imputed variance estimation technique described in Methods Chapter 14. Most users do not need to study this function carefully. Define a function specific to only this dataset: timss_MIcombine <- function (results, variances, call = sys.call(), df.complete = Inf, ...) 
{ m <- length(results) oldcall <- attr(results, "call") if (missing(variances)) { variances <- suppressWarnings(lapply(results, vcov)) results <- lapply(results, coef) } vbar <- variances[[1]] cbar <- results[[1]] for (i in 2:m) { cbar <- cbar + results[[i]] vbar <- vbar + variances[[i]] } cbar <- cbar/m vbar <- vbar/m # MODIFICATION # evar <- var(do.call("rbind", results)) evar <- sum( ( unlist( results ) - cbar )^2 / 4 ) r <- (1 + 1/m) * evar/vbar df <- (m - 1) * (1 + 1/r)^2 if (is.matrix(df)) df <- diag(df) if (is.finite(df.complete)) { dfobs <- ((df.complete + 1)/(df.complete + 3)) * df.complete * vbar/(vbar + evar) if (is.matrix(dfobs)) dfobs <- diag(dfobs) df <- 1/(1/dfobs + 1/df) } if (is.matrix(r)) r <- diag(r) rval <- list(coefficients = cbar, variance = vbar + evar * (m + 1)/m, call = c(oldcall, call), nimp = m, df = df, missinfo = (r + 2/(df + 3))/(r + 1)) class(rval) <- "MIresult" rval } Download, Import, Preparation Download and unzip the 2019 fourth grade international database: library(httr) tf <- tempfile() this_url <- "https://timss2019.org/international-database/downloads/T19_G4_SPSS%20Data.zip" GET( this_url , write_disk( tf ) , progress() ) unzipped_files <- unzip( tf , exdir = tempdir() ) Import and stack each of the student context data files for Albania through Canada: library(haven) # limit unzipped files to those starting with `asg` followed by three letters followed by `m7` asg_fns <- unzipped_files[ grepl( '^asg[a-z][a-z][a-z]m7' , basename( unzipped_files ) ) ] # further limit asg files to the first ten countries countries_thru_canada <- c("alb", "arm", "aus", "aut", "aze", "bhr", "bfl", "bih", "bgr", "can") fns_thru_canada <- paste0( paste0( '^asg' , countries_thru_canada , 'm7' ) , collapse = "|" ) asg_alb_can_fns <- asg_fns[ grepl( fns_thru_canada , basename( asg_fns ) ) ] timss_df <- NULL for( spss_fn in asg_alb_can_fns ){ this_tbl <- read_spss( spss_fn ) this_tbl <- zap_labels( this_tbl ) this_df <- data.frame( this_tbl ) names( this_df ) <- tolower( names( this_df ) ) timss_df <- rbind( timss_df , this_df ) } # order the data.frame by unique student id timss_df <- timss_df[ with( timss_df , order( idcntry , idstud ) ) , ] Save Locally   Save the object at any point: # timss_fn <- file.path( path.expand( "~" ) , "TIMSS" , "this_file.rds" ) # saveRDS( timss_df , file = timss_fn , compress = FALSE ) Load the same object: # timss_df <- readRDS( timss_fn ) Survey Design Definition Construct a multiply-imputed, complex sample survey design: From among possibly plausible values, determine all columns that are multiply-imputed plausible values: # identify all columns ending with `01` thru `05` ppv <- grep( "(.*)0[1-5]$" , names( timss_df ) , value = TRUE ) # remove those ending digits ppv_prefix <- gsub( "0[1-5]$" , "" , ppv ) # identify each of the possibilities with exactly five matches (five implicates) pv <- names( table( ppv_prefix )[ table( ppv_prefix ) == 5 ] ) # identify each of the `01` thru `05` plausible value columns pv_columns <- grep( paste0( "^" , pv , "0[1-5]$" , collapse = "|" ) , names( timss_df ) , value = TRUE ) Extract those multiply-imputed columns into a separate data.frame, then remove them from the source: pv_wide_df <- timss_df[ c( 'idcntry' , 'idstud' , pv_columns ) ] timss_df[ pv_columns ] <- NULL Reshape these columns from one record per student to one record per student per implicate: pv_long_df <- reshape( pv_wide_df , varying = lapply( paste0( pv , '0' ) , paste0 , 1:5 ) , direction = 'long' , timevar = 'implicate' , idvar = c( 
'idcntry' , 'idstud' ) ) names( pv_long_df ) <- gsub( "01$" , "" , names( pv_long_df ) ) Merge the columns from the source data.frame onto the one record per student per implicate data.frame: timss_long_df <- merge( timss_df , pv_long_df ) timss_long_df <- timss_long_df[ with( timss_long_df , order( idcntry , idstud ) ) , ] stopifnot( nrow( timss_long_df ) == nrow( pv_long_df ) ) stopifnot( nrow( timss_long_df ) / 5 == nrow( timss_df ) ) Divide the five plausible value implicates into a list with five data.frames based on the implicate number: timss_list <- split( timss_long_df , timss_long_df[ , 'implicate' ] ) Construct a replicate weights table following the estimation technique described in Methods Chapter 14: weights_df <- timss_df[ c( 'jkrep' , 'jkzone' ) ] for( j in 1:75 ){ for( i in 0:1 ){ weights_df[ weights_df[ , 'jkzone' ] != j , paste0( 'rw' , i , j ) ] <- 1 weights_df[ weights_df[ , 'jkzone' ] == j , paste0( 'rw' , i , j ) ] <- 2 * ( weights_df[ weights_df[ , 'jkzone' ] == j , 'jkrep' ] == i ) } } weights_df[ c( 'jkrep' , 'jkzone' ) ] <- NULL Define the design: library(survey) library(mitools) timss_design <- svrepdesign( weights = ~totwgt , repweights = weights_df , data = imputationList( timss_list ) , type = "other" , scale = 0.5 , rscales = rep( 1 , 150 ) , combined.weights = FALSE , mse = TRUE ) Variable Recoding Add new columns to the data set: timss_design <- update( timss_design , one = 1 , countries_thru_canada = factor( as.numeric( idcntry ) , levels = c(8L, 51L, 36L, 40L, 31L, 48L, 956L, 70L, 100L, 124L) , labels = c("Albania", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahrain", "Belgium (Flemish)", "Bosnia and Herzegovina", "Bulgaria", "Canada") ) , sex = factor( asbg01 , levels = 1:2 , labels = c( "female" , "male" ) ) , born_in_country = ifelse( asbg07 %in% 1:2 , as.numeric( asbg07 == 1 ) , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: timss_MIcombine( with( timss_design , svyby( ~ one , ~ one , unwtd.count ) ) ) timss_MIcombine( with( timss_design , svyby( ~ one , ~ sex , unwtd.count ) ) ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: timss_MIcombine( with( timss_design , svytotal( ~ one ) ) ) timss_MIcombine( with( timss_design , svyby( ~ one , ~ sex , svytotal ) ) ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) ) timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svymean , na.rm = TRUE ) ) ) Calculate the distribution of a categorical variable, overall and by groups: timss_MIcombine( with( timss_design , svymean( ~ countries_thru_canada ) ) ) timss_MIcombine( with( timss_design , svyby( ~ countries_thru_canada , ~ sex , svymean ) ) ) Calculate the sum of a linear variable, overall and by groups: timss_MIcombine( with( timss_design , svytotal( ~ asmmat , na.rm = TRUE ) ) ) timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svytotal , na.rm = TRUE ) ) ) Calculate the weighted sum of a categorical variable, overall and by groups: timss_MIcombine( with( timss_design , svytotal( ~ countries_thru_canada ) ) ) timss_MIcombine( with( timss_design , svyby( ~ countries_thru_canada , ~ sex , svytotal ) ) ) Calculate the median (50th percentile) of a linear variable, overall and by groups: timss_MIcombine( with( timss_design , svyquantile( ~ asmmat , 0.5 , se = 
TRUE , na.rm = TRUE ) ) ) timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svyquantile , 0.5 , se = TRUE , ci = TRUE , na.rm = TRUE ) ) ) Estimate a ratio: timss_MIcombine( with( timss_design , svyratio( numerator = ~ asssci , denominator = ~ asmmat ) ) ) Subsetting Restrict the survey design to Australia, Austria, Azerbaijan, Belgium (Flemish): sub_timss_design <- subset( timss_design , idcntry %in% c( 36 , 40 , 31 , 956 ) ) Calculate the mean (average) of this subset: timss_MIcombine( with( sub_timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE ) ) ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- timss_MIcombine( with( timss_design , svyby( ~ asmmat , ~ sex , svymean , na.rm = TRUE ) ) ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( timss_design$designs[[1]] ) Calculate the complex sample survey-adjusted variance of any statistic: timss_MIcombine( with( timss_design , svyvar( ~ asmmat , na.rm = TRUE ) ) ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE , deff = TRUE ) ) ) # SRS with replacement timss_MIcombine( with( timss_design , svymean( ~ asmmat , na.rm = TRUE , deff = "replace" ) ) ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. 
See ?svyciprop for alternatives: # MIsvyciprop( ~ born_in_country , timss_design , # method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: # MIsvyttest( asmmat ~ born_in_country , timss_design ) Perform a chi-squared test of association for survey data: # MIsvychisq( ~ born_in_country + countries_thru_canada , timss_design ) Perform a survey-weighted generalized linear model: glm_result <- timss_MIcombine( with( timss_design , svyglm( asmmat ~ born_in_country + countries_thru_canada ) ) ) summary( glm_result ) Replication Example This example matches the mean proficiency and standard error of the Australia row of the Summary Statistics and Standard Errors for Proficiency in Overall Mathematics-Grade 4 table from the Appendix 14A: Summary Statistics and Standard Errors for Proficiency in Grade 4 Mathematics: australia_design <- subset( timss_design , countries_thru_canada %in% "Australia" ) stopifnot( nrow( australia_design ) == 5890 ) result <- timss_MIcombine( with( australia_design , svymean( ~ asmmat ) ) ) stopifnot( round( coef( result ) , 3 ) == 515.880 ) stopifnot( round( SE( result ) , 3 ) == 2.776 ) This example matches the jackknife sampling, imputation, and total variances of the same row: australia_fn <- unzipped_files[ grepl( 'asgaus' , basename( unzipped_files ) ) ] australia_tbl <- read_spss( australia_fn ) australia_tbl <- zap_labels( australia_tbl ) australia_df <- data.frame( australia_tbl ) names( australia_df ) <- tolower( names( australia_df ) ) estimate <- mean( c( with( australia_df , weighted.mean( asmmat01 , totwgt ) ) , with( australia_df , weighted.mean( asmmat02 , totwgt ) ) , with( australia_df , weighted.mean( asmmat03 , totwgt ) ) , with( australia_df , weighted.mean( asmmat04 , totwgt ) ) , with( australia_df , weighted.mean( asmmat05 , totwgt ) ) ) ) stopifnot( round( estimate , 3 ) == 515.880 ) for( k in 1:5 ){ this_variance <- 0 for( j in 1:75 ){ for( i in 0:1 ){ this_variance <- this_variance + ( weighted.mean( australia_df[ , paste0( 'asmmat0' , k ) ] , ifelse( j == australia_df[ , 'jkzone' ] , australia_df[ , 'totwgt' ] * 2 * ( australia_df[ , 'jkrep' ] == i ) , australia_df[ , 'totwgt' ] ) ) - weighted.mean( australia_df[ , paste0( 'asmmat0' , k ) ] , australia_df[ , 'totwgt' ] ) )^2 } } assign( paste0( 'v' , k ) , this_variance * 0.5 ) } sampling_variance <- mean( c( v1 , v2 , v3 , v4 , v5 ) ) stopifnot( round( sampling_variance , 3 ) == 7.397 ) imputation_variance <- ( 6 / 5 ) * ( ( ( with( australia_df , weighted.mean( asmmat01 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat02 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat03 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat04 , totwgt ) ) - estimate )^2 / 4 ) + ( ( with( australia_df , weighted.mean( asmmat05 , totwgt ) ) - estimate )^2 / 4 ) ) stopifnot( round( imputation_variance , 3 ) == 0.309 ) stopifnot( round( sampling_variance + imputation_variance , 3 ) == 7.706 ) "],["violence-against-children-and-youth-surveys-vacs.html", "Violence Against Children And Youth Surveys (VACS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Violence Against Children And Youth Surveys (VACS) The global surveillance system to track and monitor the burden of violence against children. One table per country with one row per sampled respondent. 
Nationally representative cross-sectional household surveys of children and youth ages 13–24. No listed update frequency across the participating nations. Led by the CDC through funding from PEPFAR, in partnership with Together for Girls. Please skim before you begin: Sampling design and methodology of the Violence Against Children and Youth Surveys Violence Against Children Surveys (VACS): Towards a global surveillance system A haiku regarding this microdata: # enable us to # lodge cane between each spoke of # cycles of abuse Download, Import, Preparation Request public VACS data at https://www.togetherforgirls.org/en/analyzing-public-vacs-data. Select the Mozambique 2019 dataset and Stata option. Download and unzip the Mozambique VACS Public Use Dataset files: library(haven) vacs_tbl <- read_stata( file.path( path.expand( "~" ) , "mozambique_public use data.dta" ) ) vacs_df <- data.frame( vacs_tbl ) names( vacs_df ) <- tolower( names( vacs_df ) ) Save Locally   Save the object at any point: # vacs_fn <- file.path( path.expand( "~" ) , "VACS" , "this_file.rds" ) # saveRDS( vacs_df , file = vacs_fn , compress = FALSE ) Load the same object: # vacs_df <- readRDS( vacs_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) options( survey.lonely.psu = "adjust" ) vacs_design <- svydesign( ids = ~cluster , strata = ~strata , weights = ~sampleweight , data = subset( vacs_df , sampleweight > 0 ) , nest = TRUE ) Variable Recoding Add new columns to the data set: vacs_design <- update( vacs_design , one = 1 , age_sex_group = factor( ifelse( agegrp == 1 , sex , sex + 2 ) , levels = 1:4 , labels = c( 'male 13-17' , 'female 13-17' , 'male 18-24' , 'female 18-24' ) ) , sex = factor( sex , levels = 1:2 , labels = c( 'male' , 'female' ) ) , agegrp = factor( agegrp , levels = 1:2 , labels = c( '13-17' , '18-24' ) ) , ever_attended_school = ifelse( eversch %in% 1:2 , as.numeric( eversch == 1 ) , NA ) , childhood_physical_violence = as.numeric( pv18 == 1 ) , marry = factor( marry , levels = 1:3 , labels = c( 'Yes, ever married' , 'Yes, ever lived with a partner' , 'No, never married or lived with a partner' ) ) , age_at_first_pregnancy = ifelse( prage < 98 , prage , NA ) , age_at_first_cohabitation = ifelse( marage < 98 , marage , NA ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( vacs_design , "sampling" ) != 0 ) svyby( ~ one , ~ age_sex_group , vacs_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , vacs_design ) svyby( ~ one , ~ age_sex_group , vacs_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ marry , vacs_design ) svyby( ~ marry , ~ age_sex_group , vacs_design , svymean ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ marry , vacs_design ) svyby( ~ marry , ~ age_sex_group , vacs_design , 
svytotal ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ age_at_first_cohabitation , vacs_design , 0.5 , na.rm = TRUE ) svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ age_at_first_pregnancy , denominator = ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) Subsetting Restrict the survey design to those who experienced physical violence during childhood: sub_vacs_design <- subset( vacs_design , childhood_physical_violence == 1 ) Calculate the mean (average) of this subset: svymean( ~ age_at_first_cohabitation , sub_vacs_design , na.rm = TRUE ) Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ age_at_first_cohabitation , ~ age_sex_group , vacs_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( vacs_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ age_at_first_cohabitation , vacs_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ ever_attended_school , vacs_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( age_at_first_cohabitation ~ ever_attended_school , vacs_design ) Perform a chi-squared test of association for survey data: svychisq( ~ ever_attended_school + marry , vacs_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( age_at_first_cohabitation ~ ever_attended_school + marry , vacs_design ) summary( glm_result ) Replication Example This example matches statistics and confidence intervals within 0.1% from the Final Report of the Mozambique Violence Against Children and Youth Survey (VACS), 2019, Table 4.1.1. 
Prevalence of different types of sexual violence[1] before age 18, among 18-24-year-olds: females_18_to_24_design <- subset( vacs_design , sex == 'female' & agegrp == '18-24' ) # define a function to check unweighted N, prevalence, confidence interval for each estimate check_sv <- function( this_variable , this_design = females_18_to_24_design , N , prevalence , lb , ub ){ this_formula <- as.formula( paste( "~ as.numeric(" , this_variable , "== 1 )" ) ) stopifnot( coef( unwtd.count( this_formula , this_design ) ) == N ) this_result <- svymean( this_formula , this_design , na.rm = TRUE ) stopifnot( round( coef( this_result ) , 3 ) == prevalence ) stopifnot( abs( confint( this_result )[1] - lb ) < 0.0015 ) stopifnot( abs( confint( this_result )[2] - ub ) < 0.0015 ) invisible( TRUE ) } # sexual touching in childhood check_sv( "sv1_only18" , N = 1232 , prevalence = 0.066 , lb = 0.039 , ub = 0.093 ) # unwanted attempted sex in childhood check_sv( "sv2_only18" , N = 1232 , prevalence = 0.061 , lb = 0.035 , ub = 0.087 ) # pressured or coerced sex in childhood check_sv( "sv4_only18" , N = 1221 , prevalence = 0.056 , lb = 0.035 , ub = 0.077 ) # physically forced sex in childhood check_sv( "sv3_only18" , N = 1231 , prevalence = 0.035 , lb = 0.020 , ub = 0.051 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. As a starting point for VACS users, this code replicates previously-presented examples: library(srvyr) vacs_srvyr_design <- as_survey( vacs_design ) Calculate the mean (average) of a linear variable, overall and by groups: vacs_srvyr_design %>% summarize( mean = survey_mean( age_at_first_cohabitation , na.rm = TRUE ) ) vacs_srvyr_design %>% group_by( age_sex_group ) %>% summarize( mean = survey_mean( age_at_first_cohabitation , na.rm = TRUE ) ) "],["youth-risk-behavior-surveillance-system-yrbss.html", "Youth Risk Behavior Surveillance System (YRBSS) Download, Import, Preparation Analysis Examples with the survey library   Replication Example Analysis Examples with srvyr  ", " Youth Risk Behavior Surveillance System (YRBSS) The high school edition of the Behavioral Risk Factor Surveillance System (BRFSS). One table with one row per sampled youth respondent. A complex sample survey designed to generalize to all public and private school students in grades 9-12 in the United States. Released biennially since 1993. Administered by the Centers for Disease Control and Prevention. Please skim before you begin: Methodology of the Youth Risk Behavior Surveillance System Wikipedia Entry A haiku regarding this microdata: # maladolescence # epidemiology # sex, drugs, rock and roll Download, Import, Preparation Load the SAScii library to interpret the SAS input program, then re-arrange that input program so the out-of-numerical-order site variable lands in the correct position: library(SAScii) sas_url <- "https://www.cdc.gov/yrbs/files/2023/2023XXH_SAS_Input_Program.sas" sas_text <- tolower( readLines( sas_url ) ) # find the (out of numerical order) # `site` location variable's position # within the SAS input program site_location <- which( sas_text == '@1 site $3.' 
) # find the start field's position # within the SAS input program input_location <- which( sas_text == "input" ) # create a vector from 1 to the length of the text file sas_length <- seq( length( sas_text ) ) # remove the site_location sas_length <- sas_length[ -site_location ] # re-insert the site variable's location # immediately after the starting position sas_reorder <- c( sas_length[ seq( input_location ) ] , site_location , sas_length[ seq( input_location + 1 , length( sas_length ) ) ] ) # re-order the sas text file sas_text <- sas_text[ sas_reorder ] sas_tf <- tempfile() writeLines( sas_text , sas_tf ) Download and import the national file: dat_tf <- tempfile() dat_url <- "https://www.cdc.gov/yrbs/files/2023/XXH2023_YRBS_Data.dat" download.file( dat_url , dat_tf , mode = 'wb' ) yrbss_df <- read.SAScii( dat_tf , sas_tf ) names( yrbss_df ) <- tolower( names( yrbss_df ) ) yrbss_df[ , 'one' ] <- 1 Save Locally   Save the object at any point: # yrbss_fn <- file.path( path.expand( "~" ) , "YRBSS" , "this_file.rds" ) # saveRDS( yrbss_df , file = yrbss_fn , compress = FALSE ) Load the same object: # yrbss_df <- readRDS( yrbss_fn ) Survey Design Definition Construct a complex sample survey design: library(survey) yrbss_design <- svydesign( ~ psu , strata = ~ stratum , data = yrbss_df , weights = ~ weight , nest = TRUE ) Variable Recoding Add new columns to the data set: yrbss_design <- update( yrbss_design , did_not_always_wear_seat_belt = as.numeric( qn8 == 1 ) , ever_used_marijuana = as.numeric( qn46 == 1 ) , tried_to_quit_tobacco_past_year = as.numeric( qn40 == 1 ) , used_tobacco_past_year = as.numeric( q40 > 1 ) ) Analysis Examples with the survey library   Unweighted Counts Count the unweighted number of records in the survey sample, overall and by groups: sum( weights( yrbss_design , "sampling" ) != 0 ) svyby( ~ one , ~ ever_used_marijuana , yrbss_design , unwtd.count ) Weighted Counts Count the weighted size of the generalizable population, overall and by groups: svytotal( ~ one , yrbss_design ) svyby( ~ one , ~ ever_used_marijuana , yrbss_design , svytotal ) Descriptive Statistics Calculate the mean (average) of a linear variable, overall and by groups: svymean( ~ bmipct , yrbss_design , na.rm = TRUE ) svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svymean , na.rm = TRUE ) Calculate the distribution of a categorical variable, overall and by groups: svymean( ~ q2 , yrbss_design , na.rm = TRUE ) svyby( ~ q2 , ~ ever_used_marijuana , yrbss_design , svymean , na.rm = TRUE ) Calculate the sum of a linear variable, overall and by groups: svytotal( ~ bmipct , yrbss_design , na.rm = TRUE ) svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svytotal , na.rm = TRUE ) Calculate the weighted sum of a categorical variable, overall and by groups: svytotal( ~ q2 , yrbss_design , na.rm = TRUE ) svyby( ~ q2 , ~ ever_used_marijuana , yrbss_design , svytotal , na.rm = TRUE ) Calculate the median (50th percentile) of a linear variable, overall and by groups: svyquantile( ~ bmipct , yrbss_design , 0.5 , na.rm = TRUE ) svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svyquantile , 0.5 , ci = TRUE , na.rm = TRUE ) Estimate a ratio: svyratio( numerator = ~ tried_to_quit_tobacco_past_year , denominator = ~ used_tobacco_past_year , yrbss_design , na.rm = TRUE ) Subsetting Restrict the survey design to youths who ever drank alcohol: sub_yrbss_design <- subset( yrbss_design , qn40 > 1 ) Calculate the mean (average) of this subset: svymean( ~ bmipct , sub_yrbss_design , na.rm = TRUE ) 
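A caution on that pattern: subset the design object, as above, rather than subsetting yrbss_df before calling svydesign. Subsetting the design keeps the full stratum and cluster structure available for domain variance estimation, while a design rebuilt from pre-filtered data can yield different, potentially incorrect standard errors. A minimal comparison, assuming the objects created above; the rebuilt_design name is for illustration only:

```{r eval = FALSE , results = "hide" }
# standard error from the correctly-subsetted design object
SE( svymean( ~ bmipct , sub_yrbss_design , na.rm = TRUE ) )

# rebuilding the design from pre-filtered data discards the survey
# structure of the excluded records and may not match the result above
rebuilt_design <-
    svydesign(
        ~ psu ,
        strata = ~ stratum ,
        data = subset( yrbss_df , qn40 > 1 ) ,
        weights = ~ weight ,
        nest = TRUE
    )

SE( svymean( ~ bmipct , rebuilt_design , na.rm = TRUE ) )
```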
Measures of Uncertainty Extract the coefficient, standard error, confidence interval, and coefficient of variation from any descriptive statistics function result, overall and by groups: this_result <- svymean( ~ bmipct , yrbss_design , na.rm = TRUE ) coef( this_result ) SE( this_result ) confint( this_result ) cv( this_result ) grouped_result <- svyby( ~ bmipct , ~ ever_used_marijuana , yrbss_design , svymean , na.rm = TRUE ) coef( grouped_result ) SE( grouped_result ) confint( grouped_result ) cv( grouped_result ) Calculate the degrees of freedom of any survey design object: degf( yrbss_design ) Calculate the complex sample survey-adjusted variance of any statistic: svyvar( ~ bmipct , yrbss_design , na.rm = TRUE ) Include the complex sample design effect in the result for a specific statistic: # SRS without replacement svymean( ~ bmipct , yrbss_design , na.rm = TRUE , deff = TRUE ) # SRS with replacement svymean( ~ bmipct , yrbss_design , na.rm = TRUE , deff = "replace" ) Compute confidence intervals for proportions using methods that may be more accurate near 0 and 1. See ?svyciprop for alternatives: svyciprop( ~ did_not_always_wear_seat_belt , yrbss_design , method = "likelihood" , na.rm = TRUE ) Regression Models and Tests of Association Perform a design-based t-test: svyttest( bmipct ~ did_not_always_wear_seat_belt , yrbss_design ) Perform a chi-squared test of association for survey data: svychisq( ~ did_not_always_wear_seat_belt + q2 , yrbss_design ) Perform a survey-weighted generalized linear model: glm_result <- svyglm( bmipct ~ did_not_always_wear_seat_belt + q2 , yrbss_design ) summary( glm_result ) Replication Example This example matches statistics, standard errors, and confidence intervals from the “did not always wear a seat belt” row of PDF page 29 of this CDC analysis software document: unwtd_count_result <- unwtd.count( ~ did_not_always_wear_seat_belt , yrbss_design ) stopifnot( coef( unwtd_count_result ) == 15071 ) wtd_n_result <- svytotal( ~ one , subset( yrbss_design , !is.na( did_not_always_wear_seat_belt ) ) ) stopifnot( round( coef( wtd_n_result ) , 0 ) == 16917 ) share_result <- svymean( ~ did_not_always_wear_seat_belt , yrbss_design , na.rm = TRUE ) stopifnot( round( coef( share_result ) , 4 ) == .3958 ) stopifnot( round( SE( share_result ) , 4 ) == .0172 ) ci_result <- svyciprop( ~ did_not_always_wear_seat_belt , yrbss_design , na.rm = TRUE ) stopifnot( round( confint( ci_result )[1] , 4 ) == 0.3621 ) stopifnot( round( confint( ci_result )[2] , 4 ) == 0.4304 ) Analysis Examples with srvyr   The R srvyr library calculates summary statistics from survey data, such as the mean, total or quantile using dplyr-like syntax. srvyr allows for the use of many verbs, such as summarize, group_by, and mutate, the convenience of pipe-able functions, the tidyverse style of non-standard evaluation and more consistent return types than the survey package. This vignette details the available features. 
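One footnote on the replication example above: svyciprop defaults to an asymmetric (logit-scale) interval, which is why its bounds are not simply the estimate plus or minus a multiple of the standard error. A minimal sketch of the symmetric large-sample interval for comparison, assuming the share_result object defined above; the normal quantile here is a simplification of the design degrees of freedom that the survey package actually consults:

```{r eval = FALSE , results = "hide" }
# symmetric large-sample interval around the estimated proportion,
# for comparison against the svyciprop() bounds checked above
symmetric_ci <-
    coef( share_result ) +
    qnorm( c( 0.025 , 0.975 ) ) * SE( share_result )

# close to, but not identical to, the asymmetric interval
round( symmetric_ci , 4 )
```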
As a starting point for YRBSS users, this code replicates previously-presented examples: library(srvyr) yrbss_srvyr_design <- as_survey( yrbss_design ) Calculate the mean (average) of a linear variable, overall and by groups: yrbss_srvyr_design %>% summarize( mean = survey_mean( bmipct , na.rm = TRUE ) ) yrbss_srvyr_design %>% group_by( ever_used_marijuana ) %>% summarize( mean = survey_mean( bmipct , na.rm = TRUE ) ) "]] diff --git a/metadata/nis.txt b/metadata/nis.txt index 9e6974ed..1c007129 100644 --- a/metadata/nis.txt +++ b/metadata/nis.txt @@ -71,11 +71,11 @@ One Haiku: ~~~{download_and_import_block} -Download the fixed-width file: +Download the 2023 fixed-width file: ```{r eval = FALSE , results = "hide" } dat_tf <- tempfile() -dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.DAT" +dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.DAT" download.file( dat_url , dat_tf , mode = 'wb' ) ``` @@ -86,7 +86,7 @@ library(Hmisc) r_tf <- tempfile() -r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.R" +r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.R" r_input_lines <- readLines( r_script_url ) @@ -103,7 +103,7 @@ writeLines( r_input_lines , r_tf ) source( r_tf , echo = TRUE ) # rename the resultant data.frame object -chapter_tag_df <- NISPUF21 +chapter_tag_df <- NISPUF23 names( chapter_tag_df ) <- tolower( names( chapter_tag_df ) ) @@ -195,7 +195,7 @@ needs_srvyr_block: yes ## Replication Example {-} -This example matches the statistics and standard errors from [Data User's Guide Table 4](https://www.cdc.gov/vaccines/imz-managers/nis/downloads/NIS-PUF21-DUG.pdf#page=35): +This example matches the statistics and standard errors from [Data User's Guide Table 4](https://www.cdc.gov/nis/media/pdfs/2024/11/NISPUF23DUG.pdf#page=36): ```{r eval = FALSE , results = "hide" } @@ -211,12 +211,12 @@ coefficients <- results[ , "p_utd431h314_rout_sUTD" , drop = FALSE ] standard_errors <- results[ , "se.p_utd431h314_rout_sUTD" , drop = FALSE ] -stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .711 ) -stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .742 ) -stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .647 ) -stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .015 ) -stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .009 ) -stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .022 ) +stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .674 ) +stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .716 ) +stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .666 ) +stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .017 ) +stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .008 ) +stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .023 ) ``` ~~~ diff --git a/nis.Rmd b/nis.Rmd index 0f96a3c6..e5fe8bba 100644 --- a/nis.Rmd +++ b/nis.Rmd @@ -58,11 +58,11 @@ One Haiku: ## Download, Import, Preparation {-} -Download the fixed-width file: +Download the 2023 fixed-width file: ```{r eval = FALSE , results = "hide" } dat_tf <- tempfile() -dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.DAT" +dat_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.DAT" download.file( dat_url , dat_tf , mode = 'wb' ) ``` @@ -73,7 +73,7 @@ library(Hmisc) r_tf <- tempfile() -r_script_url <- "https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF21.R" +r_script_url <- 
"https://ftp.cdc.gov/pub/Vaccines_NIS/NISPUF23.R" r_input_lines <- readLines( r_script_url ) @@ -90,7 +90,7 @@ writeLines( r_input_lines , r_tf ) source( r_tf , echo = TRUE ) # rename the resultant data.frame object -nis_df <- NISPUF21 +nis_df <- NISPUF23 names( nis_df ) <- tolower( names( nis_df ) ) @@ -336,7 +336,7 @@ summary( glm_result ) ## Replication Example {-} -This example matches the statistics and standard errors from [Data User's Guide Table 4](https://www.cdc.gov/vaccines/imz-managers/nis/downloads/NIS-PUF21-DUG.pdf#page=35): +This example matches the statistics and standard errors from [Data User's Guide Table 4](https://www.cdc.gov/nis/media/pdfs/2024/11/NISPUF23DUG.pdf#page=36): ```{r eval = FALSE , results = "hide" } @@ -352,12 +352,12 @@ coefficients <- results[ , "p_utd431h314_rout_sUTD" , drop = FALSE ] standard_errors <- results[ , "se.p_utd431h314_rout_sUTD" , drop = FALSE ] -stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .711 ) -stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .742 ) -stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .647 ) -stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .015 ) -stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .009 ) -stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .022 ) +stopifnot( round( coefficients[ "HISPANIC" , ] , 3 ) == .674 ) +stopifnot( round( coefficients[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .716 ) +stopifnot( round( coefficients[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .666 ) +stopifnot( round( standard_errors[ "HISPANIC" , ] , 3 ) == .017 ) +stopifnot( round( standard_errors[ "NON-HISPANIC WHITE ONLY" , ] , 3 ) == .008 ) +stopifnot( round( standard_errors[ "NON-HISPANIC BLACK ONLY" , ] , 3 ) == .023 ) ``` ---