The file data.RDS
contains three data lists with the following information:
pew
- Pew Research Organization’s October 2016 Political Survey. The original data can be found at http://www.people-press.org/dataset/october-2016-political-survey/. It is a national telephone sample survey.
cps
- the November 2016 Voting and Registration Supplement to the Current Population Survey. The full data set can be downloaded from <www.nber.org/cps/>.
actual_result
- election result in 2016 from https://uselectionatlas.org/
data <- readRDS("data.RDS")
pew <- data[[1]]
cps <- data[[2]]
actual_result <- data[[3]] #true election results
Outcome variable: 2016 Presidential voting intention, which is obtained from Pew Research Organization’s October 2016 Political Survey (2,583 interviews, conducted during October 20-25, 2016.)
Covariates: Individual characteristics (from the survey) and group level predictors (2012 state vote)
Post-strata: Age x Gender x Race x Education x State
Stratum counts: obtained from the November 2016 Voting and Registration Supplement to the Current Population Survey
library(tidyverse);
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(maps); library(mapproj); library(gridExtra);
## Warning: package 'maps' was built under R version 3.6.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
## Warning: package 'mapproj' was built under R version 3.6.3
## Warning: package 'gridExtra' was built under R version 3.6.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(rstan); library(rstanarm);library(loo);
## Warning: package 'rstan' was built under R version 3.6.3
## Loading required package: StanHeaders
## Warning: package 'StanHeaders' was built under R version 3.6.3
## rstan (Version 2.21.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## Do not specify '-march=native' in 'LOCAL_CPPFLAGS' or a Makevars file
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
## extract
## Warning: package 'rstanarm' was built under R version 3.6.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.6.2
## This is rstanarm version 2.21.1
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
##
## Attaching package: 'rstanarm'
## The following object is masked from 'package:rstan':
##
## loo
## Warning: package 'loo' was built under R version 3.6.3
## This is loo version 2.3.1
## - Online documentation and vignettes at mc-stan.org/loo
## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
## - Windows 10 users: loo may be very slow if 'mc.cores' is set in your .Rprofile file (see https://github.com/stan-dev/loo/issues/94).
##
## Attaching package: 'loo'
## The following object is masked from 'package:rstan':
##
## loo
library(brms)
## Warning: package 'brms' was built under R version 3.6.3
## Loading 'brms' package (version 2.13.5). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following objects are masked from 'package:rstanarm':
##
## dirichlet, exponential, get_y, lasso, ngrps
## The following object is masked from 'package:rstan':
##
## loo
## The following object is masked from 'package:stats':
##
## ar
fit <- stan_glm(demvote ~ 1 +age4 + gender + race3 + educ4 +
region + qlogis(obama12), data = pew, chains=2, family = binomial) #logistic model; demvote is a binary variable taking the value 1 if voting preference is for Clinton; age4 is a categorical variable for age; obama12 is the proportion voted for Obama in 2012
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.001 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 10 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 1.308 seconds (Warm-up)
## Chain 1: 1.394 seconds (Sampling)
## Chain 1: 2.702 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 0 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 1.24 seconds (Warm-up)
## Chain 2: 1.632 seconds (Sampling)
## Chain 2: 2.872 seconds (Total)
## Chain 2:
posterior_predict
in rstanarm
substitutes for the usual predict
function in R:imputations <- posterior_predict(fit, draws = 50,
newdata = subset(cps, select=c(age4, gender, race3, educ4, region, obama12)))
This creates a matrix imputations
of dimension draws
x nrow(newdata)
.
get_state_estimates
.get_state_estimates <- function(imputations) {
state_by_clinton <- function(imputed_values) 100 * prop.table(
xtabs(weight ~ state + imputed_values, data = cps), 1)[,"1"]
state_estimates <- apply(imputations, 1, state_by_clinton)
apply(state_estimates, 1, mean)
}
mrp <- get_state_estimates(imputations)
rmse <- function(est, act) sqrt(mean((est - act)^2, na.rm = TRUE))
estimates = cbind(actual_result, mrp)
estimates
## state name actual mrp
## AK AK Alaska 36.55087 39.98310
## AL AL Alabama 34.35795 43.01501
## AR AR Arkansas 33.65312 34.46911
## AZ AZ Arizona 44.58042 48.72325
## CA CA California 61.48236 63.25903
## CO CO Colorado 48.15651 52.83765
## CT CT Connecticut 54.56630 55.70593
## DC DC District of Columbia 90.86382 92.42943
## DE DE Delaware 53.08598 53.85387
## FL FL Florida 47.40708 46.63471
## GA GA Georgia 45.34558 51.51777
## HI HI Hawaii 62.22149 66.85512
## IA IA Iowa 41.74049 41.45994
## ID ID Idaho 27.48493 31.53213
## IL IL Illinois 55.24264 55.11377
## IN IN Indiana 37.45972 38.10348
## KS KS Kansas 35.73996 33.33459
## KY KY Kentucky 32.68217 31.06049
## LA LA Louisiana 38.44957 46.67666
## MA MA Massachusetts 60.00506 57.13687
## MD MD Maryland 60.32574 63.64090
## ME ME Maine 47.83020 47.20271
## MI MI Michigan 47.02703 49.86495
## MN MN Minnesota 46.44200 44.35839
## MO MO Missouri 37.87013 39.92888
## MS MS Mississippi 40.05745 50.70305
## MT MT Montana 35.41276 38.05168
## NC NC North Carolina 46.17287 48.48771
## ND ND North Dakota 27.22674 29.12134
## NE NE Nebraska 33.69876 31.98898
## NH NH New Hampshire 46.82626 44.43356
## NJ NJ New Jersey 54.98926 58.77485
## NM NM New Mexico 48.25565 58.65605
## NV NV Nevada 47.91782 54.43079
## NY NY New York 59.00366 64.13347
## OH OH Ohio 43.24300 45.55541
## OK OK Oklahoma 28.93168 27.61328
## OR OR Oregon 50.07185 51.67167
## PA PA Pennsylvania 47.45532 49.07111
## RI RI Rhode Island 54.40661 58.44736
## SC SC South Carolina 40.67342 47.91486
## SD SD South Dakota 31.73743 30.29397
## TN TN Tennessee 34.71633 36.38274
## TX TX Texas 43.12016 42.11723
## UT UT Utah 27.16647 26.17472
## VA VA Virginia 49.75135 48.71907
## VT VT Vermont 56.67779 58.38842
## WA WA Washington 52.53904 54.08845
## WI WI Wisconsin 46.45384 43.35142
## WV WV West Virginia 26.17656 25.58975
## WY WY Wyoming 21.87736 26.86529
rmse(mrp, actual_result$actual)
## [1] 3.914767
## Warning: Removed 2 rows containing missing values (geom_bar).
us_map <- map_data("state")
estimates %>%
mutate(state_name = tolower(name),
clinton_pct = cut(mrp, breaks = c(-Inf, 40, 45, 50, 55, 60, 100),
labels = c("<40", "40-45", "45-50", "50-55", "55-60", ">60"))) %>%
ggplot(aes(map_id = state_name)) +
geom_map(aes(fill = clinton_pct), map = us_map) +
expand_limits(x = us_map$long, y = us_map$lat) +
coord_map("albers", lat0 = 39, lat1 = 45) +
scale_fill_brewer(name = "Clinton %", type = "div", palette = "RdBu") +
theme(axis.line = element_blank()) +
theme_void()