L01: Simulations to understand sampling distributions

Includes:

Lion noses linear regression
Data generation consistent with model
Linear regression of this first dataset
In-class Sampling Distribution Simulation Assignment

Document Preamble

Load libraries

library(knitr)
library(ezknitr)
library(abd)

Settings for Knitr (optional)

opts_chunk$set(fig.width = 8, fig.height = 6)

1. Lion noses linear regression:

Data entry

data(LionNoses)
head(LionNoses)

##   age proportion.black
## 1 1.1             0.21
## 2 1.5             0.14
## 3 1.9             0.11
## 4 2.2             0.13
## 5 2.6             0.12
## 6 3.2             0.13

Fit linear model

lm.nose<-lm(age~proportion.black, data=LionNoses)

Parameters:

Coefficients and residual variation are stored in lmfit:

coef(lm.nose)

##      (Intercept) proportion.black 
##        0.8790062       10.6471194

summary(lm.nose)$sigma # residual variation

## [1] 1.668764

What else is stored in lmfit? (residuals, variance covariance matrix, etc)

names(lm.nose)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

names(summary(lm.nose))

##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"

2. Data generation consistent with fitted model

## Use the same sampmle size Sample size - use length so it matches sample size of original data
n <- length(LionNoses$age)

## Predictor - copy of original proporation black data, now in vector
p.black <- LionNoses$proportion.black

## Parameters
sigma <- summary(lm.nose)$sigma # residual variation
betas <- coef(lm.nose)# regression coefficients

## Errors and response
# Residual errors are modeled as ~ N(0, sigma)
epsilon <- rnorm(n, 0, sigma)

# Response is modeled as linear function plus residual errors
y <- betas[1] + betas[2]*p.black + epsilon

3. Linear regression of this generated dataset

# Fit of model to simulated data:  
lmfit.generated <- lm(y ~ p.black)
summary(lmfit.generated)

## 
## Call:
## lm(formula = y ~ p.black)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3676 -0.8066  0.0594  0.8901  2.5837 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.9760     0.5113   1.909   0.0659 .  
## p.black      11.3066     1.3569   8.333 2.67e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.5 on 30 degrees of freedom
## Multiple R-squared:  0.6983, Adjusted R-squared:  0.6882 
## F-statistic: 69.43 on 1 and 30 DF,  p-value: 2.666e-09

In-Class Sampling Distribution Simulation Assignment

Exercise 3:

Generate 5000 datasets using the same code
Fit a linear regression model to each dataset “lm.temp”
Store the estimates of \(\beta_1\) and t-statistics
Calucate confidence limits for each simulation and determine how many include the true parameter used to simulate the data.

Hint: if you get stuck, try starting with a small number of simulations (less than 5000) until you get the code right.

#   set up a matricies to hold results 
nsims <- 5000 # number of simulations
beta.hat<- matrix(NA,   nrow    =   nsims,  ncol    =   1) # estimates of beta_1
tsamp.dist<-matrix(NA, nsims,1) # matrix to hold t-statistics
limits <- matrix(NA, nrow = nsims, ncol = 2) # matrix to hold CI limits 
colnames(limits) <- c("LL.slope","UL.slope")# label columns

# Simulation
for(i in 1:nsims){
  epsilon <- rnorm(n, 0, sigma) # random errors
  y <- betas[1] + betas[2]*p.black + epsilon # response
  lm.temp <- lm(y ~ p.black)
  ## extract beta-hat  
  beta.hat[i] <- coef(lm.temp)[2] 
  # Here is our t-statistic, calculated for each sample
  tsamp.dist[i]<-(beta.hat[i]-betas[2])/sqrt(vcov(lm.temp)[2,2])
  # Confidence limits
  limits[i,] <- confint(lm.temp)[2,] 
}

How many CI include the parameter used to generate the data?

# Indicator of whether "true" parameter is within confidence intervals
I.in <- betas[2] >= limits[,1] & betas[2] <= limits[,2]

# Proportion of confidence intervals with true beta
sum(I.in)/nsims

## [1] 0.9464

Plot earlier results

par(mfrow=c(1,2))
hist(beta.hat, col="gray",xlab="", main=expression(paste("Sampling Distribution of ", hat(beta)[1])))
abline(v=betas[2]) # add population parameter
hist(tsamp.dist, xlab="",
     main=expression(t==frac(hat(beta)-beta, se(hat(beta)))), freq=FALSE)
tvalues<-seq(-3,3, length=1000) # xvalues to evaluate t-distribution
lines(tvalues,dt(tvalues, df=28)) # overlay t-distribution

plot of chunk sampplot

Plot results of confidence limits (first 100 of them)

sim.dat<-data.frame(est.slope=beta.hat, limits, In=I.in) 
ggplot(sim.dat[1:100,], aes(x=est.slope, y=1:100, colour=as.factor(In))) +
  geom_segment(aes(x=LL.slope, xend=UL.slope, yend=1:100, colour=as.factor(In))) +
  scale_colour_discrete(name=expression(paste("Contains ", beta, "?"))) +
  geom_point() +
  theme(axis.text.y=element_blank()) +
  geom_vline(xintercept=betas[2]) +
  ylab("")+xlab("Estimate")

plot of chunk ciplot

Footer

spun with ezspin(“in_class/Exercise3Solution.R”, out_dir = “output/in_class”, keep_md = FALSE, fig_dir = “figures”)

Session Information:

sessionInfo()

## R version 3.3.1 (2016-06-21)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows >= 8 x64 (build 9200)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] abd_0.2-8         mosaic_0.14.4     Matrix_1.2-6     
##  [4] mosaicData_0.14.0 ggplot2_2.1.0     dplyr_0.5.0      
##  [7] lattice_0.20-33   nlme_3.1-128      ezknitr_0.6      
## [10] knitr_1.14       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.7       formatR_1.4       plyr_1.8.4       
##  [4] R.methodsS3_1.7.1 R.utils_2.4.0     tools_3.3.1      
##  [7] digest_0.6.10     evaluate_0.9      tibble_1.2       
## [10] gtable_0.2.0      DBI_0.5-1         yaml_2.1.14      
## [13] ggdendro_0.1-20   gridExtra_2.2.1   stringr_1.1.0    
## [16] rprojroot_1.1     R6_2.2.0          rmarkdown_1.3    
## [19] tidyr_0.6.0       magrittr_1.5      backports_1.0.4  
## [22] scales_0.4.0      htmltools_0.3.5   splines_3.3.1    
## [25] MASS_7.3-45       assertthat_0.1    mime_0.5         
## [28] colorspace_1.2-6  labeling_0.3      stringi_1.1.2    
## [31] lazyeval_0.2.0    munsell_0.4.3     markdown_0.7.7   
## [34] R.oo_1.20.0