Linear Regression

Silvie Cinková

2025-08-07

Diamonds

library(tidyverse)
library(ggfortify)
diamonds <- ggplot2::diamonds
colnames(diamonds)
 [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
 [8] "x"       "y"       "z"      

Carat vs. price

How much does a diamond’s weight in carats affect its price?

ggplot(diamonds, aes(x = carat, y = price)) + geom_point(alpha = 0.3)

Correlation

cor <- cor.test(x = diamonds$carat, y = diamonds$price, method = "pearson") #default
cor 

    Pearson's product-moment correlation

data:  diamonds$carat and diamonds$price
t = 551.41, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9203098 0.9228530
sample estimates:
      cor 
0.9215913 

Extract values from cor.test

str(cor)
List of 9
 $ statistic  : Named num 551
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named int 53938
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0
 $ estimate   : Named num 0.922
  ..- attr(*, "names")= chr "cor"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "correlation"
 $ alternative: chr "two.sided"
 $ method     : chr "Pearson's product-moment correlation"
 $ data.name  : chr "diamonds$carat and diamonds$price"
 $ conf.int   : num [1:2] 0.92 0.923
  ..- attr(*, "conf.level")= num 0.95
 - attr(*, "class")= chr "htest"

The broom library

  • convenience library for statistical tests
  • extracts values from (many) common test functions
  • as data frames
  • functions tidy, augment, and glance
cor %>% broom::tidy()
# A tibble: 1 × 8
  estimate statistic p.value parameter conf.low conf.high method     alternative
     <dbl>     <dbl>   <dbl>     <int>    <dbl>     <dbl> <chr>      <chr>      
1    0.922      551.       0     53938    0.920     0.923 Pearson's… two.sided  

carat vs. price

  • linear model with ggplot2
diamonds_plot <- diamonds %>% ggplot(aes(x = carat, y = price)) +
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm", se = TRUE, formula = y ~ x) + 
  coord_fixed(ratio = 1/10000)
diamonds_plot

Slope and Intercept in detail

diamonds_plot + 
  geom_abline(slope = 10000, intercept = 0, linetype = 3) + 
  coord_fixed(ratio = 1/10000)

Slope and intercept computed

lm_carat_price <- lm(formula = price ~ carat , data = diamonds)
lm_carat_price

Call:
lm(formula = price ~ carat, data = diamonds)

Coefficients:
(Intercept)        carat  
      -2256         7756  

lm detail

lm_carat_price %>% summary()

Call:
lm(formula = price ~ carat, data = diamonds)

Residuals:
     Min       1Q   Median       3Q      Max 
-18585.3   -804.8    -18.9    537.4  12731.7 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2256.36      13.06  -172.8   <2e-16 ***
carat        7756.43      14.07   551.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1549 on 53938 degrees of freedom
Multiple R-squared:  0.8493,    Adjusted R-squared:  0.8493 
F-statistic: 3.041e+05 on 1 and 53938 DF,  p-value: < 2.2e-16

Display Coefficients with broom::tidy

  • sometimes masked by other packages
  • call explicitly broom::tidy
lm_carat_price %>% broom::tidy()
# A tibble: 2 × 5
  term        estimate std.error statistic p.value
  <chr>          <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)   -2256.      13.1     -173.       0
2 carat          7756.      14.1      551.       0

Observed vs. predicted values

  • model predicts a value for each observed data point
  • predicted values form the regression line
  • residual = observed value - predicted value

Model values: observed, fitted

values_lm_carat_price <- lm_carat_price %>% broom::augment() 
glimpse(values_lm_carat_price)
Rows: 53,940
Columns: 8
$ price      <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340,…
$ carat      <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23,…
$ .fitted    <dbl> -472.382688, -627.511200, -472.382688, -6.997151, 148.13136…
$ .resid     <dbl> 798.3827, 953.5112, 799.3827, 340.9972, 186.8686, 730.8184,…
$ .hat       <dbl> 4.515399e-05, 4.706148e-05, 4.515399e-05, 3.982758e-05, 3.8…
$ .sigma     <dbl> 1548.572, 1548.571, 1548.572, 1548.576, 1548.576, 1548.573,…
$ .cooksd    <dbl> 6.001647e-06, 8.922178e-06, 6.016690e-06, 9.656791e-07, 2.7…
$ .std.resid <dbl> 0.51557559, 0.61575429, 0.51622136, 0.22020685, 0.12067468,…

.resid (Residuals)

  • .resid = .fitted minus price
values_lm_carat_price %>% select(price, .fitted, .resid) %>%
  slice_head(n = 10) %>% mutate(my_residuals = price - .fitted) 
# A tibble: 10 × 4
   price .fitted .resid my_residuals
   <int>   <dbl>  <dbl>        <dbl>
 1   326 -472.     798.         798.
 2   326 -628.     954.         954.
 3   327 -472.     799.         799.
 4   334   -7.00   341.         341.
 5   335  148.     187.         187.
 6   336 -395.     731.         731.
 7   336 -395.     731.         731.
 8   337 -240.     577.         577.
 9   337 -550.     887.         887.
10   338 -472.     810.         810.
lm_carat_price %>% broom::glance()
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic p.value    df   logLik     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>    <dbl>   <dbl>   <dbl>
1     0.849         0.849 1549.   304051.       0     1 -472730. 945467. 945493.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

Is my model reliable?

  • standardized residuals ought to be roughly normally distributed
  • Normality tests can be too strict
    • shapiro.test (3 - 5,000 data points)
    • ks.test for larger data sets
  • Use plots with ggfortify and autoplot

Shapiro test on standardized residuals

  • returns probability of your distribution being normal
  • make a random sample of max 4,999 data points
values_lm_carat_price$.std.resid %>% 
  sample(size = 4999) %>%
  shapiro.test()

    Shapiro-Wilk normality test

data:  .
W = 0.87719, p-value < 2.2e-16

Kolmogorov-Smirnov on standardized residuals

values_lm_carat_price$.std.resid %>% 
  ks.test(y = "pnorm", sd = sd(.), mean = mean(.))

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  .
D = 0.15254, p-value < 2.2e-16
alternative hypothesis: two-sided

ggfortify and autoplot

autoplot(lm_carat_price, which = 1)

Quantile-quantile plot

  • Standardized residuals ought to lie on the dotted line.
lm_carat_price %>% autoplot(which  = 2)

Scale-Location plot

lm_carat_price %>% autoplot(which = 3)

Cook’s distance

  • distance = influence of the observation on the model
  • combines leverage (how far predictor from mean) and residual size (guessing error)
  • observations worth investigating individually
lm_carat_price %>% autoplot(which = 4)

lm_carat_price %>% autoplot(which = 6)

Potential improvements

  • add another predictor (often a categorical one is very helpful)
  • log-transform the response variable (remedies heteroscedasticity)
  • add a nonlinear term (e.g. x square)

Add a predictor

lm_carat_cut_price <- lm(formula = price ~ carat + cut, data = diamonds)
broom::glance(lm_carat_cut_price) %>% select(c(1:3, 6, 8, 9))
# A tibble: 1 × 6
  r.squared adj.r.squared sigma    df     AIC     BIC
      <dbl>         <dbl> <dbl> <dbl>   <dbl>   <dbl>
1     0.856         0.856 1511.     5 942854. 942916.

Just price ~ carat

broom::glance(lm_carat_price)  %>% select(c(1:3, 6, 8, 9))
# A tibble: 1 × 6
  r.squared adj.r.squared sigma    df     AIC     BIC
      <dbl>         <dbl> <dbl> <dbl>   <dbl>   <dbl>
1     0.849         0.849 1549.     1 945467. 945493.

Plot diagnostics for the enriched model

autoplot(lm_carat_cut_price, which = c(1,2,3,4,6), ncol = 3)

Log transformation of the response variable

lm_log_carat_price <- lm(formula = log(price) ~ (carat), data = diamonds)
broom::glance(lm_log_carat_price) %>% select(c(1:3, 5, 8:9))
# A tibble: 1 × 6
  r.squared adj.r.squared sigma p.value    AIC    BIC
      <dbl>         <dbl> <dbl>   <dbl>  <dbl>  <dbl>
1     0.847         0.847 0.397       0 53464. 53491.
broom::glance(lm_log_carat_price) %>% select(c(1:3, 5, 8:9))
# A tibble: 1 × 6
  r.squared adj.r.squared sigma p.value    AIC    BIC
      <dbl>         <dbl> <dbl>   <dbl>  <dbl>  <dbl>
1     0.847         0.847 0.397       0 53464. 53491.

log-transformed response

  • better QQ fit
autoplot(lm_log_carat_price, which = c(1,2,3,4,6), ncol = 3)

log-transformed response + cut

lm_log_carat_cut_price <- lm(formula = log(price)~ carat + cut, data = diamonds)
autoplot(lm_log_carat_cut_price, which = c(1,2, 3,4,6), ncol = 3)

Add a non-linear term

this curve looks like \(x^2\), right?

diamonds %>% ggplot(aes(x = carat, y = price)) + 
  geom_smooth(method = "gam") 

lm_carat_price_x2 <- lm(formula = price ~ I(carat^2), data = diamonds)
autoplot(lm_carat_price_x2, which = c(1,2,3,4,6))

broom::glance(lm_carat_price_x2) %>% select(1:3,4, 8:9)
# A tibble: 1 × 6
  r.squared adj.r.squared sigma statistic     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl>   <dbl>
1     0.794         0.794 1812.   207606. 962398. 962425.

Just add another predictor without any other transformations

lm(formula = price ~ carat + cut + clarity, data = diamonds) %>% 
  broom::glance() %>% select(1:3,4, 8:9)
# A tibble: 1 × 6
  r.squared adj.r.squared sigma statistic     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl>   <dbl>
1     0.897         0.897 1281.    39107. 925009. 925133.
lm(formula = price ~ carat + cut, data = diamonds)

Call:
lm(formula = price ~ carat + cut, data = diamonds)

Coefficients:
(Intercept)        carat        cut.L        cut.Q        cut.C        cut^4  
   -2701.38      7871.08      1239.80      -528.60       367.91        74.59  
broom::glance(lm_carat_cut_price) %>% select(c(1:3, 6, 8, 9))
# A tibble: 1 × 6
  r.squared adj.r.squared sigma    df     AIC     BIC
      <dbl>         <dbl> <dbl> <dbl>   <dbl>   <dbl>
1     0.856         0.856 1511.     5 942854. 942916.
lm(formula = price ~ carat + cut + clarity + color, data = diamonds) %>% 
  broom::glance() %>% select(1:3,4, 8:9)
# A tibble: 1 × 6
  r.squared adj.r.squared sigma statistic     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl>   <dbl>
1     0.916         0.916 1157.    32641. 914023. 914201.

Diagnostic plots for the richest model

lm(formula = price ~ carat + cut + clarity + color, data = diamonds) %>%
  autoplot(which = c(1:4, 6), ncol = 3)

Interaction between terms

richest2 <- lm(formula = price ~ carat + cut *  clarity + color, data = diamonds) 
broom::glance(richest2)
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic p.value    df   logLik     AIC     BIC
      <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>    <dbl>   <dbl>   <dbl>
1     0.917         0.916 1153.    12862.       0    46 -456806. 913707. 914134.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
broom::tidy(richest2) %>% filter(p.value < 0.05) %>% arrange(desc(abs(estimate)), p.value)
# A tibble: 29 × 5
   term            estimate std.error statistic   p.value
   <chr>              <dbl>     <dbl>     <dbl>     <dbl>
 1 carat              8888.      12.0    740.   0        
 2 clarity.L          4391.      58.2     75.4  0        
 3 (Intercept)       -3653.      18.6   -196.   0        
 4 color.L           -1910.      17.7   -108.   0        
 5 clarity.Q         -1711.      53.8    -31.8  4.89e-220
 6 clarity.C           922.      48.3     19.1  8.54e- 81
 7 cut.L               646.      43.3     14.9  3.32e- 50
 8 color.Q            -631.      16.1    -39.2  0        
 9 cut.C:clarity.L    -440.     110.      -3.98 6.90e-  5
10 cut.L:clarity.L    -433.     162.      -2.68 7.42e-  3
# ℹ 19 more rows