---
title: "Bootstrap"
author: "JJB + Course"
date: "02/25/2019"
output:
   html_document:
     toc: true
     toc_float:
       collapsed: false
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Non-parametric Bootstrap

## Example: Bootstrapping subject-heights mean

```{r bootstrap-heights, cache = TRUE}
# Step 1: Obtain a sample from the population
sample_data <- data.frame(
  id     = c(1, 2, 3, 4, 5),
  sex    = c("M", "F", "F", "M", "M"),
  height = c(6.1, 5.5, 5.2, 5.6, 5.9)
)

theta_hat <- mean(sample_data$height)  # Point estimate: mean of the observed heights
n_obs     <- nrow(sample_data)         # Number of observations in the sample
boot_iter <- 500L                      # Number of bootstrap replicates

# Steps 2-4: build one bootstrap replicate per iteration and collect the
# resulting estimates in a numeric vector of length boot_iter
theta_star <- vapply(seq_len(boot_iter), function(b) {
  set.seed(11882 + b)                  # Replicate-specific seed for reproducibility

  # Step 2: Draw n_obs row positions from 1..n_obs with replacement
  rows <- sample(n_obs, n_obs, replace = TRUE)

  # Keep the resample as a data.frame so every column stays available
  resampled <- sample_data[rows, , drop = FALSE]

  # Step 3: Compute the statistic of interest on the resampled rows
  mean(resampled$height)
}, numeric(1))                         # Step 4: repeated boot_iter times

# First few bootstrapped estimates
head(theta_star)

# Mean of the bootstrap distribution
mean(theta_star)

# Compare with the mean of the original sample
theta_hat
```

## Examples: Compute Percentiles

```{r example-quantiles}
# Small numeric sample: the values 1 through 6 stored as doubles
x <- as.numeric(1:6)

# Sample quartiles plus the maximum (the 100% quantile)
quantile(x, probs = c(0.25, 0.5, 0.75, 1))

# The median is the same value as the 50% quantile
median(x)
```

### Example: Distribution Quantiles

```{r my-dist-quantile}
# Theoretical quantiles of the standard normal distribution at the same
# probabilities; the 100% quantile of a normal distribution is Inf
qnorm(c(0.25, 0.5, 0.75, 1), mean = 0, sd = 1)
```


### Example: Quantile CI

```{r ci-bootstrap-quantile, cache = TRUE, dependson="bootstrap-heights"}
# Significance level for a (1 - alpha) percentile confidence interval
alpha <- 0.05

# Tail probabilities that bound the interval
lower_prob <- alpha / 2
upper_prob <- 1 - alpha / 2
lower_prob
upper_prob

# Percentile interval: empirical quantiles of the bootstrapped estimates
ci_range <- quantile(theta_star, probs = c(lower_prob, upper_prob))
ci_range
```


## Example: Plotting Bootstrapped Samples

```{r view-bootstrap-samples}
# Plot the distribution of the bootstrapped estimates held in theta_star
library("ggplot2")

# One row per bootstrap replicate: its iteration number and its estimate
graph_bootstrap <- data.frame(
  iter       = seq_along(theta_star),
  theta_star = theta_star
)

ggplot(graph_bootstrap) +
  # bins = 30 is the stat_bin() default; stating it keeps the same histogram
  # while silencing the "pick better value with binwidth" message
  geom_histogram(aes(theta_star), bins = 30) +
  labs(
    title    = "Non-parametric Bootstrapped Data",
    # labs() has no `sub` argument, so `sub = ...` was silently ignored;
    # `subtitle` is the argument that actually draws the text
    subtitle = "Example simulation",
    x        = "Values of Theta^*",
    y        = "Frequency of Theta^* Values"
  )
```


## Exercise: Standard Deviation of `iris`' `Sepal.Width`

```{r check-out-iris}
# Peek at the first rows of iris, a data set that ships with base R
head(iris)

# Step 1: Treat iris as the sample obtained from the population
sample_data <- iris

# Exercise: extract all of the Sepal.Width observations from sample_data
# (left for the reader)
```

```{r view-bootstrap}
# Plot the distribution of the bootstrapped estimates held in theta_star
library("ggplot2")

# One row per bootstrap replicate: its iteration number and its estimate
graph_bootstrap <- data.frame(
  iter       = seq_along(theta_star),
  theta_star = theta_star
)

ggplot(graph_bootstrap) +
  # bins = 30 is the stat_bin() default; stating it keeps the same histogram
  # while silencing the "pick better value with binwidth" message
  geom_histogram(aes(theta_star), bins = 30) +
  labs(
    title    = "Non-parametric Bootstrapped Data",
    # labs() has no `sub` argument, so `sub = ...` was silently ignored;
    # `subtitle` is the argument that actually draws the text
    subtitle = "Example simulation",
    x        = "Values of Theta^*",
    y        = "Frequency of Theta^* Values"
  )
```


## Example: Non-parametric bootstrap with `boot`

```{r r-boot-pkg, eval = FALSE}
# install.packages("boot")
library("boot")

# Number of simulated observations in the example data set
nsim_obs <- 100

# Fix the RNG state so the simulated data can be regenerated exactly
set.seed(981)

# Simulated distance data: chi-squared draws with 5 degrees of freedom
problem_data <- data.frame(
  distance = rchisq(nsim_obs, df = 5)
  # other data here
)

# Statistic function for boot(): mean distance of one bootstrap resample.
#
# d   - the data.frame that boot() hands over from its `data` argument;
#       it must contain a `distance` column.
# ind - the row positions selected for this replicate (may contain repeats).
#
# Returns the mean of `distance` over the selected rows.
sampling_function <- function(d, ind) {
  # Subset the data that was passed in rather than the global `problem_data`,
  # so the statistic is computed on whatever data set is being bootstrapped.
  # drop = FALSE keeps a data.frame even when a single column is present.
  d_star <- d[ind, , drop = FALSE]

  # Compute the desired statistic on the bootstrapped data
  mean(d_star$distance)
}

# Run the bootstrapping procedure
booted_means <- boot(
  data      = problem_data,       # Data set to resample from
  statistic = sampling_function,  # Called with the data and the resampled row positions
  R         = 200                 # Number of bootstrap replicates
)

# Calculate different confidence intervals.
# boot.ci() defaults to type = "all", which also attempts studentized
# intervals; those need bootstrap variance estimates that the statistic does
# not return, so the call would warn. Request the four computable types.
boot.ci(booted_means, type = c("norm", "basic", "perc", "bca"))

# Show the distribution of the bootstrapped statistic (t is the mean here)
plot(booted_means)
```


# Parametric Bootstrap

## Example: Parametric bootstrap with `sd` and `mean` of a Normal Distribution

```{r r-parametric, cache = TRUE}
# Step 1: Obtain samples from a known population distribution.
# Seed this initial draw too: without it the estimates below differ from run
# to run even though every bootstrap iteration is seeded.
set.seed(385)
sample_values <- rnorm(1000)

# Step 2: Obtain the statistics that parameterise the bootstrap distribution
theta_mean_hat <- mean(sample_values)  # Sample mean
theta_sd_hat   <- sd(sample_values)    # Sample standard deviation

n_obs     <- length(sample_values)     # Number of observations
boot_iter <- 250L                      # Number of bootstrap iterations

# Preallocate as double (NA_real_) so the vectors have their final type
theta_mean_star <- rep(NA_real_, boot_iter)  # Bootstrapped estimates of the mean
theta_sd_star   <- rep(NA_real_, boot_iter)  # Bootstrapped estimates of the sd

for (i in seq_len(boot_iter)) {
  set.seed(385 + i)                    # Iteration-specific seed for reproducibility

  # Step 3: Generate observations from the fitted normal distribution
  sample_values_star <- rnorm(n_obs, mean = theta_mean_hat, sd = theta_sd_hat)

  # Step 4: Compute the desired statistics on the simulated values
  theta_mean_star[i] <- mean(sample_values_star)
  theta_sd_star[i]   <- sd(sample_values_star)
} # Step 5: Repeat until i matches boot_iter

# Compare the bootstrap averages with the original sample estimates
mean(theta_mean_star)
theta_mean_hat
mean(theta_sd_star)
theta_sd_hat
```

```{r my-data}
# install.packages("tidyr")
library("tidyr")
library("ggplot2")

# One row per bootstrap iteration with both bootstrapped estimates side by side
graph_bootstrap <- data.frame(
  iter            = seq_len(boot_iter),
  theta_mean_star = theta_mean_star,
  theta_sd_star   = theta_sd_star
)

# Tidy data: stack the two estimate columns into a key/value pair so each
# estimate type can be drawn in its own facet
tidy_bootstrap <- gather(
  graph_bootstrap,
  key = theta_type_estimated, value = theta_star,
  theta_mean_star:theta_sd_star
)

# Reference lines at 0 (mean) and 1 (sd), the defaults of the rnorm(1000)
# call that generated sample_values
theta_vlines <- data.frame(
  theta_type_estimated = c("theta_mean_star", "theta_sd_star"),
  theta_val            = c(0, 1)
)

ggplot(tidy_bootstrap) +
  # bins = 30 is the stat_bin() default; stating it keeps the same histogram
  # while silencing the "pick better value with binwidth" message
  geom_histogram(aes(theta_star), bins = 30) +
  geom_vline(
    data = theta_vlines,
    aes(xintercept = theta_val, color = theta_type_estimated)
  ) +
  facet_wrap(~theta_type_estimated) +
  labs(
    # This figure shows the parametric bootstrap; the previous title was
    # copied from the non-parametric example
    title    = "Parametric Bootstrapped Data",
    color    = "Type of Theta Estimated",
    # labs() has no `sub` argument, so `sub = ...` was silently ignored;
    # `subtitle` is the argument that actually draws the text
    subtitle = "Example simulation",
    x        = "Values of Theta^*",
    y        = "Frequency of Theta^* Values"
  ) +
  theme_bw()
```

## Exercise: Parametric bootstrap with `mean`, `sd`, and `median` of a Poisson Distribution

Estimate the `mean`, `sd`, and `median` of a Poisson distribution with a rate
parameter of `lambda = 3`.

```{r r-parametric-pois, cache = TRUE, eval = FALSE}

```


```{r overview-data, eval = FALSE}


```