--- title: "Linear Regression" author: "JJB + Course" date: "02/20/2019" output: html_document: toc: true toc_float: collapsed: false --- ```{r setup = TRUE, include = FALSE} knitr::opts_chunk$set( comment = "#>", collapse = TRUE ) ``` # Linear Relationships Recall that a linear line is given by the formula of: \[y = b + m\cdot x \] where: - $y$ is the response - $b$ is the intercept or where the line crosses the _y_-axis. - $m$ is the **slope** or change between values. - $x$ is the predictor or input. ```{r compute-linear-relationship} # Define constants m = 1.5 b = 2 n = 20 # Compute values x = seq(-10, 10, length.out = n) y = m*x + b ``` ```{r show-linear-relationship, echo = FALSE} # install.packages("ggplot2") library("ggplot2") df = data.frame(x = x, y = y) ggplot(df) + aes(x = x, y = y) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + geom_line(color = "blue") + geom_point(size = 3, color = "orange") + labs(title = "Observing a Linear Relationship", subtitle = paste("with", n, "observations"), x = "Predictor Value (x)", y = "Response Value (y)") + theme_bw() ``` ```{r data-gen-procedures, echo = FALSE} gen_linear_data = function(n, m = 1, b = 2, start = -10, end = 10) { # Compute values x = seq(start, end, length.out = n) y = m * x + b out = data.frame(x = x, y = y) attr(out, "n") = n out } gen_quad_data = function(n, a = 1, b = 2, c = 3, start = -10, end = 10) { # Compute values x = seq(start, end, length.out = n) y = a * x^2 + b * x + c out = data.frame(x = x, y = y) attr(out, "n") = n out } make_relationship_graph = function(df, estimate = FALSE, title = "Observing Relationships") { estimate_layer = if(estimate) { geom_smooth(method = "lm", color = "blue") } else { geom_line(color = "blue") } ggplot(df) + aes(x = x, y = y) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + estimate_layer + geom_point(size = 3, color = "orange") + labs( title = title, subtitle = paste("with", attr(df, "n"), "observations"), x = "Predictor Value (x)", y = "Response Value (y)" ) + theme_bw() } ``` ```{r sample-relationships} make_relationship_graph(gen_linear_data(10), title = "Observing a Positive Linear Relationship") make_relationship_graph(gen_linear_data(10, m = -1), title = "Observing a Negative Linear Relationship") make_relationship_graph(gen_quad_data(10, a = 1, start = 0), title = "Observing a Quadratic Relationship (non-linear)") ``` What happens if the points are not _perfectly_ straight? ```{r compute-fragmented-linear-relationship} # Recalculate y with additional "noise" or error. y = m*x + b + rnorm(n, sd = 3) # Add gaussian noise ``` ```{r show-fragmented-linear-relationship, echo = FALSE} # Update the data.frame df$y = y # Replot graph ggplot(df) + aes(x = x, y = y) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + # geom_smooth(method = "lm") + geom_point(size = 3, color = "orange") + labs(title = "Observing a Partial Linear Relationship", subtitle = paste("with", n, "observations"), x = "Predictor Value (x)", y = "Response Value (y)") + theme_bw() ``` How can we estimate what the "linear line" that goes through the majority of points is? # SLR **Simple Linear Regression (SLR)** is given as: \[{y_i} = {\beta _0} + {\beta _1}{x_i} + {\varepsilon _i}\] where: - $i$ is the observation identifier. - $y_i$ is the response value for the observation. - $\beta _0$ is the intercept value. - $\beta _1$ is the response value. - $x_i$ is the predictor or input for the observation. 
## Example: Simulating SLR Data

Before running algorithms on real-world data, we first try to see if we can
recover an "oracle" model, where we know the exact parameters the data was
simulated under.

```{r simulating-data}
# Set seed for reproducibility
set.seed(9123)

# Number of observations
n = 500

# Design matrix: n x 2
X = cbind(1, rnorm(n, mean = 2))

# True beta values: 2 x 1
beta = c(2.5, 4)

# Compute response variable with error: n x 1
y = X[, 1] * beta[1] + X[, 2] * beta[2] + rnorm(n, sd = 3)
```

```{r show-pattern, echo = FALSE}
df = data.frame(x = X[, 2], y = y)

ggplot(df, aes(x, y)) +
  geom_point() +
  labs(title = "Generated Random Normal Data",
       subtitle = paste("Created using", n, "observations"),
       x = "Simulated Predictor Values",
       y = "True Response")
```

## Example: Computing SLR with Analytical Forms

Let's apply the analytical SLR solutions:

\[{\hat \beta _1} = \frac{\sum\limits_{i = 1}^n {\left( {x_i - \bar x} \right)\left( {y_i - \bar y} \right)} }{\sum\limits_{i = 1}^n {\left( {x_i - \bar x} \right)^2} }, \quad {\hat \beta _0} = \bar y - {\hat \beta _1}\bar x\]

```{r compute-slr}
# Mean of Response (y)
y_mu = mean(y)

# Mean of Predictor (x)
x = X[, 2]
x_mu = mean(x)

# Estimate Slope
beta1_hat = sum((x - x_mu) * (y - y_mu)) / sum((x - x_mu) ^ 2)

# Estimate Intercept
beta0_hat = y_mu - beta1_hat * x_mu

# Compare the estimates against the true values
cbind(c(beta0_hat, beta1_hat), beta)
```

These estimates are shown on the graph as the blue line.

```{r show-lm-form, echo = FALSE}
library("ggplot2")

df = data.frame(x = X[, 2], y = y)

ggplot(df, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Estimated Coefficients on Generated Data",
       subtitle = paste("with", n, "observations"),
       x = "Simulated Predictor Values",
       y = "True Response")
```

## Example: Optimizing for SLR

```{r optim-r-slr}
# Provide initial beta value: 2 x 1
beta_hat = rep(0, 2)

# Write the cost function to minimize
min_rss_slr = function(par, X, y) {
  rss = sum((y - (X[, 1] * par[1] + X[, 2] * par[2]))^2)
  return(rss)
}

# Perform the minimization from the initial values
model_opt = optim(par = beta_hat, fn = min_rss_slr, method = "BFGS",
                  control = list(trace = TRUE),
                  X = X, y = y)

# Check parameter difference
cbind(model_opt$par, beta)
```

The optimizer stops once additional steps no longer decrease the RSS; at that
point, any move takes us away from the minimum.

```{r show-optim-path}
# Illustrative RSS values from the optimizer's trace near convergence
ggplot(data.frame(x = c(1, 2, 3),
                  y = c(4545.000693, 4542.177432, 4545.000693)),
       aes(x, y)) +
  geom_point() +
  geom_line() +
  geom_point(data = data.frame(x = 2, y = 4542.177432),
             color = "red", size = 3) +
  labs(title = "Optimization of RSS",
       subtitle = "using the BFGS optimizer",
       y = "RSS value",
       x = "Iteration")
```

# Matrices

## Example: Creating a Matrix

Matrices are built in a different way than `data.frame`s. In particular, data
is supplied as a 1D vector and then dimensions are added.

```{r construct-matrix}
# Construct a matrix
z = matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 3, ncol = 2
)

# Two dimensions
dim(z)

# Length is the number of elements
length(z)
```

**Note:** You only need to supply _one_ dimension and _R_ will automatically
calculate the other.

```{r auto-calc-dimension}
# Construct a matrix
z_auto = matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 3
)

# Show they're the same.
all.equal(z, z_auto)
```

Construct a matrix by **column**.

```{r fill-order-col}
z_col = matrix(c(1, 2, 3, 4, 5, 6),
               nrow = 3, ncol = 2,
               byrow = FALSE) # Set to FALSE (default)
z_col

# Note, this is the default.
all.equal(z, z_col)
```

Construct a matrix by **row**.

```{r fill-order-row}
z_row = matrix(c(1, 2, 3, 4, 5, 6),
               nrow = 3, ncol = 2,
               byrow = TRUE) # Set to TRUE
z_row
```
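Beyond construction, a handful of matrix operations carry the regression
computations later in these notes. A minimal sketch (the matrix `A` and
vector `v` below are made up purely for illustration):

```{r matrix-operations-sketch}
A = matrix(c(2, 0, 1, 3), nrow = 2) # 2 x 2, filled by column
v = c(1, 2)

t(A)         # Transpose
A %*% v      # Matrix-vector multiplication
crossprod(A) # t(A) %*% A in a single, faster step
solve(A)     # Matrix inverse
```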
## Example: Adding Data to a Matrix

Unlike a `data.frame`, the `matrix` has a special way of adding new _data_.

```{r}
# Starting matrix
the_matrix = matrix(
  c(1, 2, 3, 4),
  nrow = 2, ncol = 2
)

# Note: Use the following sparingly

# Add data by column
cbind(the_matrix, c(5, 6))

# Add data by row
rbind(the_matrix, c(5, 6))
```

# MLR

## Example: Fitting a Linear Regression with `lm`

Fitting an ordinary linear model in _R_ uses:

```{r, eval = FALSE}
model_fit = lm(
  y ~ x1 + x2,
  data = data_set
)
```

where:

- the first argument is a `formula` with the response variable to the left of
  the `~`, read "is modeled as", and the predictor variables to the right;
- the second argument specifies the data set.

Inside the `formula`, model terms can be combined using `+` to
give:^[${\beta _0}$ represents the intercept term and is automatically
included.]

$${y_i} = {\beta _0} + {\beta _1}{x_{i,1}} + {\beta _2}{x_{i,2}} + {\varepsilon _i}$$

## Example: Fits with `lm`

```{r model-data}
model_fit = lm(mpg ~ wt + qsec, data = mtcars)
model_fit
```

## Example: The Formula Object and Design Matrix

Specify the response (⟵) and predictors (→):

```{r sample-model-frame-manipulation}
model_formula = mpg ~ wt + qsec
```

Construct a model frame to hold the formula and data:

```{r model-frame-here}
model_frame = model.frame(model_formula, data = mtcars)
```

Retrieve the design matrix on the right (→) hand side of the formula:

```{r model-design-matrix}
X = model.matrix(model_formula, model_frame)
X
```

Retrieve the response on the left (⟵) hand side of the formula:

```{r extract-model-response}
y = model.response(model_frame)
y
```

Compare the model matrices _with_ and _without_ the intercept.

Incorporate the intercept (the default):

```{r view-model-matrix-differences}
X = model.matrix(
  mpg ~ wt + qsec, # By default, the intercept is included.
  data = mtcars
)
head(X)
```

Remove the intercept:

```{r remove-model-int}
X_noint = model.matrix(
  mpg ~ wt + qsec - 1, # note the -1 to remove the intercept
  data = mtcars
)
head(X_noint)
```

## Example: Inference with `lm`

By using the `summary()` function, we can obtain detailed information about
the model's properties. These properties allow us to perform _inference_ on
the quality of the model from a statistical point of view.

```{r model-summary}
summary(model_fit)
```

## Example: Calling `lm.fit` with pre-made `X` and `y`

Sometimes, we may wish to obtain a fit directly without using a `formula`.

```{r directly-call-lm}
model_fit_design = lm.fit(x = X, y = matrix(mtcars$mpg))
```

**Note:** The output structure is relatively _intact_ compared to an `lm()`
call; however, the result is a plain list without the `"lm"` class, so
methods such as `summary()` and `predict()` will not work on it.

```{r model-fit-example}
model_fit_design$coefficients
model_fit$coefficients
```

## Example: Predictions

Predict the responses for new data using the `predict()` function.

```{r my-predictions}
future_car = data.frame(wt = 0.5, qsec = 12)

y_hat = predict(model_fit, newdata = future_car)
y_hat
```

This is equivalent to calculating:

\[\begin{align*}
\widehat {MPG} &= {{\hat \beta }_0} + {{\hat \beta }_1}wt + {{\hat \beta }_2}qsec \\
\widehat {MPG} &= {{\hat \beta }_0} + {{\hat \beta }_1}\left( {0.5} \right) + {{\hat \beta }_2}\left( {12} \right)
\end{align*}\]

**Note:** Leaving off `newdata` gives the predictions for the training data set.

```{r my-model-training-predictions}
y_hat_model = predict(model_fit)
head(y_hat_model)
```

**Note:** `predict` is also a generic function.
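Since we extracted the design matrix `X` (with intercept) and the response `y`
above, the coefficients can also be recovered directly from the normal
equations, $\hat\beta = \left( X^\top X \right)^{-1} X^\top y$. A minimal
sketch (`solve(crossprod(X), crossprod(X, y))` solves the linear system
without forming an explicit inverse, which is the numerically preferable
route; the name `beta_hat_mat` is ours):

```{r normal-equations-check}
# Solve the normal equations t(X) %*% X %*% beta = t(X) %*% y
beta_hat_mat = solve(crossprod(X), crossprod(X, y))

# Compare against the coefficients from lm()
cbind(beta_hat_mat, coef(model_fit))
```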
# Factors and the Design Matrix

## Example: Factors in Data Frames

```{r factors-in-data-frames}
id = c(1, 2, 3, 55)
sex = c("M", "F", "F", "M")
height = c(6.1, 5.5, 5.2, 5.9)

subject_heights_vec = data.frame(
  id = id,
  sex = sex,
  height = height
)

# Notice sex is a factor inside the data.frame
str(subject_heights_vec)

# But outside, on the vector, it is a character
class(sex)

summary(subject_heights_vec)
summary(sex)
```

**Note:** This assumes the pre-4.0.0 default of `stringsAsFactors = TRUE`;
as of _R_ 4.0.0, `data.frame()` no longer converts strings to factors by
default.

```{r df-without-factors}
# Data frame without factors
subject_heights = data.frame(
  id = c(1, 2, 3, 55),
  sex = c("M", "F", "F", "M"),
  height = c(6.1, 5.5, 5.2, 5.9),
  stringsAsFactors = FALSE # Manually disabling factors
)
```

Data frame with factors:

```{r df-with-factors}
subject_heights_fct = data.frame(
  id = c(1, 2, 3, 55),
  sex = c("M", "F", "F", "M"),
  height = c(6.1, 5.5, 5.2, 5.9),
  stringsAsFactors = TRUE # default behavior before R 4.0.0
)
```

Notice the output difference between `character` and `factor`:

```{r}
summary(subject_heights)
summary(subject_heights_fct)
```

This is shown in the structure of the two data.frames:

```{r}
str(subject_heights)
str(subject_heights_fct)
```

## Example: Factor Vector

```{r}
sex = c("M", "F", "F", "M") # Character vector
sex_factor = factor(sex)    # Convert from character to factor
as.numeric(sex_factor)      # Retrieve the underlying integer codes
```

Levels are sorted alphabetically by default, following the ordering of:

```{r}
letters
```

Arithmetic on a character fails, but the integer code behind a factor level
(here `"F"` maps to 1) can be used in math:

```{r example-of-binary-op, eval = FALSE}
# This will fail.
3 * "F"
```

```{r}
3 * 1
```

### Exercise: Make a Design Matrix

```{r sample-data}
class_seating = data.frame(
  distance = c("back", "back", "front", "back", "front", "front"),
  side = c("right", "left", "middle", "middle", "right", "left")
)
```

Checking with _R_ ... (Recall: `-1` omits the intercept.)

```{r example-model-call}
```

## Example: Factor Math Failures

```{r factor-math-folies, eval = FALSE}
x = c(3L, -1L, 22L, 9L, 0L, 22L, 9L) # Create Integer Vector
my_factor = as.factor(x)             # Cast integer to factor

my_factor + 10  # Fails in an unexpected way: warning, returns NA
min(my_factor)  # Show-stopping error
```

## Example: Exploring a Factor

```{r explore-factor}
x = c(3L, -1L, 22L, 9L, 0L, 22L, 9L) # Create Integer Vector
my_factor = as.factor(x)             # Cast integer to factor

levels(my_factor)  # List of Levels

my_factor[1] = 9   # Modify with a pre-existing level
my_factor[2] = 18  # Warning if level isn't present already; NA generated
my_factor

x_recovery = levels(my_factor)[my_factor] # Recover element-wise values
x_recovery
class(x_recovery)

x_recovery = as.numeric(x_recovery) # Cast to appropriate type
x_recovery + 10
```

## Example: Ordered Factors

```{r ordered-factors-demo}
yields = c("hi", "low", "med", "low", "med", "low")

# Factor without Order
yields_fct = factor(yields)
yields_fct

# Notice the order isn't set appropriately (alphabetical: hi < low < med)
yields_order = factor(yields, ordered = TRUE) # Add Order
yields_order

# Correct ordering from low to high
yields_fixed_order = factor(yields,
                            levels = c("low", "med", "hi"),
                            ordered = TRUE)
yields_fixed_order
```

## Exercise: To be a Factor or an Ordered Factor

Determine whether each of the following should be a factor or an ordered factor:

**Months:** (Jan, Feb, ..., Nov, Dec)

**Colors:** (red, orange, ..., black, green)

**Alphabet:** (a, b, ..., y, z)
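To close the loop between factors and the design matrix, below is a minimal
sketch (reusing `subject_heights_fct` from above) of how `model.matrix()`
dummy-codes a factor: each non-reference level becomes an indicator column.

```{r factor-design-matrix-sketch}
# The factor sex has levels "F" (the reference) and "M";
# model.matrix() produces an intercept plus an indicator column sexM.
model.matrix(~ sex, data = subject_heights_fct)
```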