library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(dslabs)
data("polls_us_election_2016")
# Build `polls`: drop the national ("U.S.") rows, keep polls that ended on or
# after 2016-10-31, and add the Clinton-minus-Trump spread as a proportion
polls <- polls_us_election_2016 %>%
  filter(state != "U.S.", enddate >= "2016-10-31") %>%
  mutate(spread = rawpoll_clinton / 100 - rawpoll_trump / 100)
# Build `cis`: add the lower and upper bounds of a 95% confidence interval for
# the spread to every poll, then keep only the columns listed in `select()`.
# `N` holds the poll sample sizes, in the same row order as `polls`.
N <- polls$samplesize
cis <- polls %>%
  mutate(
    # Convert the spread back to an estimated proportion
    X_hat = (spread + 1) / 2,
    # Standard error of the spread (twice that of the proportion)
    se = 2 * sqrt(X_hat * (1 - X_hat) / N),
    lower = spread - qnorm(0.975) * se,
    upper = spread + qnorm(0.975) * se
  ) %>%
  select(state, startdate, enddate, pollster, grade, spread, lower, upper)
# Actual Clinton-minus-Trump spread per state, taken from the election results
add <- results_us_election_2016 %>%
  mutate(actual_spread = clinton / 100 - trump / 100) %>%
  select(state, actual_spread)
# Attach the actual spread to each poll; `state` is converted to character
# before joining on it
cis <- cis %>%
  mutate(state = as.character(state)) %>%
  left_join(add, by = "state")
# `error` is the poll spread minus the actual spread; `hit` is TRUE when both
# spreads have the same sign
errors <- cis %>%
  mutate(
    error = spread - actual_spread,
    hit = sign(spread) == sign(actual_spread)
  )
# Preview the first rows of `errors` built above.
errors %>% head()
#> state startdate enddate pollster grade
#> 1 New Mexico 2016-11-06 2016-11-06 Zia Poll <NA>
#> 2 Virginia 2016-11-03 2016-11-04 Public Policy Polling B+
#> 3 Iowa 2016-11-01 2016-11-04 Selzer & Company A+
#> 4 Wisconsin 2016-10-26 2016-10-31 Marquette University A
#> 5 North Carolina 2016-11-04 2016-11-06 Siena College A
#> 6 Georgia 2016-11-06 2016-11-06 Landmark Communications B
#> spread lower upper actual_spread error hit
#> 1 0.02 -0.001331221 0.0413312213 0.083 -0.063 TRUE
#> 2 0.05 -0.005634504 0.1056345040 0.054 -0.004 TRUE
#> 3 -0.07 -0.139125210 -0.0008747905 -0.094 0.024 TRUE
#> 4 0.06 0.004774064 0.1152259363 -0.007 0.067 FALSE
#> 5 0.00 -0.069295191 0.0692951912 -0.036 0.036 FALSE
#> 6 -0.03 -0.086553820 0.0265538203 -0.051 0.021 TRUE
# Generate an object called 'totals' that contains the numbers of good and bad
# predictions for polls rated A- and C-: one row per `hit` value and one count
# column per grade. The counts are built with conditional sums so that only
# dplyr is needed; `spread()` lives in tidyr, which this script never attaches,
# so the earlier `spread(grade, num)` call failed with "could not find
# function".
totals <- errors %>%
  filter(grade %in% c("A-", "C-")) %>%
  group_by(hit) %>%
  summarize(`A-` = sum(grade == "A-"), `C-` = sum(grade == "C-"))
totals
# Print the proportion of hits for grade A- polls to the console.
# Proportion = polls with hit == TRUE / polls with a known hit value. Rows
# where `hit` is NA (possible when a poll's state found no match in the left
# join with the results) are left out of both numerator and denominator.
with(totals, sum(`A-`[hit %in% TRUE]) / sum(`A-`[!is.na(hit)]))
# Print the proportion of hits for grade C- polls to the console
with(totals, sum(`C-`[hit %in% TRUE]) / sum(`C-`[!is.na(hit)]))