23.9 回归诊断

包括线性模型和广义线性模型

Regression Deletion Diagnostics ?influence.measures

library(extrafont) # 注册字体 CM Roman 到 PDF 设备
data(anscombe)
form <- paste(paste0("y", seq(4)), paste0("x", seq(4)), sep = "~") # form <- sprintf('y%d ~ x%d', 1:4, 1:4)
fit <- lapply(form, lm, data = anscombe)
par(mfrow = c(2, 2), mar = 0.1 + c(4, 4, 1, 1), oma = c(0, 0, 2, 0), family = "CM Roman")
for (i in 1:4) {
  plot(as.formula(form[i]),
    data = anscombe, col = "black",
    pch = 19, cex = 1.2,
    xlim = c(3, 19), ylim = c(3, 13),
    xlab = as.expression(substitute(bold(x[i]), list(i = i))),
    ylab = as.expression(substitute(bold(y[i]), list(i = i)))
  )
  abline(fit[[i]], col = "red", lwd = 2)
  text(7, 12, bquote(bold(R)^2 == .(round(summary(fit[[i]])$r.squared, 3))))
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.2)
模型诊断很重要

图 23.1: 模型诊断很重要

library(ggplot2)
library(patchwork)
data("anscombe")

form <- sprintf('y%d ~ x%d', 1:4, 1:4)
fit <- lapply(form, lm, data = anscombe)

plot_lm <- function(i) {
  annotate_texts <- c("", "nonlinearity", "outlier", "influential point")
  p <- ggplot(data = anscombe, aes_string(x = paste0("x", i), y = paste0("y", i))) +
    geom_point() +
    geom_abline(intercept = coef(fit[[i]])[1], slope = coef(fit[[i]])[2], color = "red") +
    theme_minimal() +
    labs(
      x = substitute(bold(x[a]), list(a = i)), y = substitute(bold(y[b]), list(b = i)),
      title = bquote(bold(R)^2 == .(round(summary(fit[[i]])$r.squared, 3)))
    )
  p + annotate("text", x = 12, y = 11, label = annotate_texts[i])

}

Reduce("+", lapply(1:4, plot_lm))
线性模型可能在欺骗你

图 23.2: 线性模型可能在欺骗你