# 第 26 章 线性模型

There’s probably some examples, but there are some examples of people using solve(t(X) %*% W %*% X) %*% W %*% Y to compute regression coefficients, too.

— Thomas Lumley 67

## 26.1 方差分析

I was profoundly disappointed when I saw that S-PLUS 4.5 now provides “Type III” sums of squares as a routine option for the summary method for aov objects. I note that it is not yet available for multistratum models, although this has all the hallmarks of an oversight (that is, a bug) rather than common sense seeing the light of day. When the decision was being taken of whether to include this feature, “because the FDA requires it” a few of my colleagues and I were consulted and our reply was unhesitatingly a clear and unequivocal “No”, but it seems the FDA and SAS speak louder and we were clearly outvoted.

— Bill Venables 68

## 26.2 单因素方差分析

chickwts 不同的喂食方式对体重的影响

boxplot(weight ~ feed, data = chickwts, col = "lightgray",
varwidth = TRUE, notch = TRUE, main = "chickwt data",
ylab = "Weight at six weeks (gm)")
## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, : some
## notches went outside hinges ('box'): maybe set notch=FALSE
anova(fm1 <- lm(weight ~ feed, data = chickwts))
## Analysis of Variance Table
##
## Response: weight
##           Df Sum Sq Mean Sq F value    Pr(>F)
## feed       5 231129   46226  15.365 5.936e-10 ***
## Residuals 65 195556    3009
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
opar <- par(mfrow = c(2, 2), oma = c(0, 0, 1.1, 0),
mar = c(4.1, 4.1, 2.1, 1.1))
plot(fm1)
par(opar)

sleep

## Student's paired t-test 成对样本的 t 检验
with(sleep,
t.test(extra[group == 1],
extra[group == 2], paired = TRUE))
##
##  Paired t-test
##
## data:  extra[group == 1] and extra[group == 2]
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.4598858 -0.7001142
## sample estimates:
## mean of the differences
##                   -1.58
## The sleep *prolongations*
sleep1 <- with(sleep, extra[group == 2] - extra[group == 1])
summary(sleep1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    0.00    1.05    1.30    1.58    1.70    4.60
stripchart(sleep1, method = "stack", xlab = "hours",
main = "Sleep prolongation (n = 10)")
boxplot(sleep1, horizontal = TRUE, add = TRUE,
at = .6, pars = list(boxwex = 0.5, staplewex = 0.25))

michelson <- transform(morley,
Expt = factor(Expt), Run = factor(Run))
xtabs(~ Expt + Run, data = michelson)  # 5 x 20 balanced (two-way)
##     Run
## Expt 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##    1 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1
##    2 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1
##    3 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1
##    4 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1
##    5 1 1 1 1 1 1 1 1 1  1  1  1  1  1  1  1  1  1  1  1
plot(Speed ~ Expt, data = michelson,
main = "Speed of Light Data", xlab = "Experiment No.")
fm <- aov(Speed ~ Run + Expt, data = michelson)
summary(fm)
##             Df Sum Sq Mean Sq F value  Pr(>F)
## Run         19 113344    5965   1.105 0.36321
## Expt         4  94514   23629   4.378 0.00307 **
## Residuals   76 410166    5397
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fm0 <- update(fm, . ~ . - Run)
anova(fm0, fm)
## Analysis of Variance Table
##
## Model 1: Speed ~ Expt
## Model 2: Speed ~ Run + Expt
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     95 523510
## 2     76 410166 19    113344 1.1053 0.3632

ToothGrowth 维生素 C 对牙齿增长的关系

coplot(len ~ dose | supp, data = ToothGrowth, panel = panel.smooth,
xlab = "ToothGrowth data: length vs dose, given type of supplement")

?lm mlm

## 26.4 多因素方差分析

MANOVA.RMffmanova 包处理多因素方差分析

## 26.5 核学习

David Meyer 基于 libsvm 开发了 e1071 包，基于核方法实现了非线性回归分类算法

## 26.6 通用机器学习

lda MASS predict(obj)
glm stats predict(obj, type = "response")
gbm gbm predict(obj, type = "response", n.trees)
mda mda predict(obj, type = "posterior")
rpart rpart predict(obj, type = "prob")
Weka RWeka predict(obj, type = "probability")
logitboost LogitBoost predict(obj, type = "raw", nIter)
pamr.train pamr pamr.predict(obj, type = "posterior")

## 26.7 理论基础

\begin{align} Y &= X \beta + \epsilon \\ X^{\top}Y &= X^{\top}X\beta \\ \hat{\beta} &= (X^{\top}X)^{-1}X^{\top}Y \\ \hat{Y} &= X(X^{\top}X)^{-1}X^{\top}Y \\ \hat{\sigma^2} &= \frac{\|Y - \hat{Y}\|_2}{n - rk(X)} \\ & = \frac{\|(I - X(X^{\top}X)^{-1}X^{\top})Y\|_2}{n - rk(X)}\\ & = \frac{Y^{\top}(I - X(X^{\top}X)^{-1}X^{\top})Y}{n - rk(X)} \end{align}

## 26.8 多重多元线性回归

fit_mtcars <- lm(cbind(mpg, qsec) ~ 1, data = mtcars, offset = cbind(wt, wt * 2))
summary(fit_mtcars)
## Response mpg :
##
## Call:
## lm(formula = mpg ~ 1, data = mtcars, offset = cbind(wt, wt *
##     2))
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -11.897  -4.947  -1.316   2.984  15.192
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   16.873      1.219   13.85  8.1e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.893 on 31 degrees of freedom
##
##
## Response qsec :
##
## Call:
## lm(formula = qsec ~ 1, data = mtcars, offset = cbind(wt, wt *
##     2))
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -4.6842 -2.0793 -0.1693  2.2693  5.1857
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  11.4142     0.5076   22.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.871 on 31 degrees of freedom

## 26.9 回归诊断

Regression Deletion Diagnostics ?influence.measures

library(ggplot2)
library(patchwork)
data("anscombe")

form <- sprintf('y%d ~ x%d', 1:4, 1:4)
fit <- lapply(form, lm, data = anscombe)

plot_lm <- function(i) {
annotate_texts <- c("", "nonlinearity", "outlier", "influential point")
p <- ggplot(data = anscombe, aes_string(x = paste0("x", i), y = paste0("y", i))) +
geom_point() +
geom_abline(intercept = coef(fit[[i]])[1], slope = coef(fit[[i]])[2], color = "red") +
theme_minimal() +
labs(
x = substitute(bold(x[a]), list(a = i)), y = substitute(bold(y[b]), list(b = i)),
# RMSE
sqrt(sigma2_hat)
##          [,1]
## [1,] 9.893187

## 26.13 Intercountry Life-Cycle Savings Data 1960-1970

data("LifeCycleSavings")

data("longley")

## 26.15 甲醛的测定

ggplot(data = Formaldehyde, aes(x = carb, y = optden)) +
geom_point() +
theme_minimal()

## 26.16 迈克尔逊光速数据分析

1879 年迈克尔逊光速测量数据，记录了五次实验，每次试验测量 20 次光速，得到表格 26.2

reshape(
data = morley, v.names = "Speed", idvar = "Expt",
timevar = "Run", direction = "wide", sep = ""
) %>%
knitr::kable(.,
caption = "迈克尔逊光速数据",
row.names = FALSE, col.names = gsub("(Speed)", "", names(.)),
align = "c"
)

Expt 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
1 850 740 900 1070 930 850 950 980 980 880 1000 980 930 650 760 810 1000 1000 960 960
2 960 940 960 940 880 800 850 880 900 840 830 790 810 880 880 830 800 790 760 800
3 880 880 880 860 720 720 620 860 970 950 880 910 850 870 840 840 850 840 840 840
4 890 810 810 820 800 770 760 740 750 760 910 920 890 860 880 720 840 850 850 780
5 890 840 780 810 760 810 790 810 820 850 870 870 810 740 810 940 950 800 810 870

ggplot(data = morley, aes(x = Expt, y = Speed, group = Expt)) +
geom_boxplot() +
geom_jitter() +
theme_minimal() +
labs(x = "Expt", y = "Speed (km/sec)")

## 26.17 不同喂食方式对小鸡体重的影响 I

ggplot(data = chickwts, aes(x = feed, y = weight, color = feed)) +
geom_boxplot() +
geom_jitter() +
theme_minimal()

## 26.18 不同喂食方式对小鸡体重的影响 II

ggplot(data = ChickWeight, aes(x = Time, y = weight, group = Chick, color = Diet)) +
geom_point() +
geom_line() +
facet_wrap(~Diet) +
theme_minimal()

ggplot(data = ChickWeight,
aes(x = Time, y = weight, group = Diet, colour = Diet)) +
facet_wrap(~Diet) +
geom_jitter() +
stat_summary(fun = "mean", geom = "line", colour = "black") +
theme_minimal() +
labs(
title = "Chick Weight over Time by Diet",
x = "Time (days)",
y = "Weight (grams)"
)

## 26.19 酶的酶联免疫吸附测定

ggplot(data = DNase, aes(x= conc,y= density, color = Run)) +
geom_point() +
theme_minimal()

## 26.20 婴儿的体重随年龄的变化情况

BirthWeight 数据集记录了婴儿的体重随年龄的变化情况，年龄以周为单位计，体重以克为单位计

# 带截距项和不带截距项
summary(l1 <- lm(birthw ~ sex + age), correlation = TRUE)
##
## Call:
## lm(formula = birthw ~ sex + age)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -257.49 -125.28  -58.44  169.00  303.98
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1610.28     786.08  -2.049   0.0532 .
## sexF         -163.04      72.81  -2.239   0.0361 *
## age           120.89      20.46   5.908 7.28e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 177.1 on 21 degrees of freedom
## Multiple R-squared:   0.64,  Adjusted R-squared:  0.6057
## F-statistic: 18.67 on 2 and 21 DF,  p-value: 2.194e-05
##
## Correlation of Coefficients:
##      (Intercept) sexF
## sexF  0.07
## age  -1.00       -0.12
anova(l1)
## Analysis of Variance Table
##
## Response: birthw
##           Df  Sum Sq Mean Sq F value    Pr(>F)
## sex        1   76163   76163  2.4279    0.1341
## age        1 1094940 1094940 34.9040 7.284e-06 ***
## Residuals 21  658771   31370
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 与带交互项的模型比较
summary(li <- lm(birthw ~ sex + sex:age), correlation = TRUE)
##
## Call:
## lm(formula = birthw ~ sex + sex:age)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -246.69 -138.11  -39.13  176.57  274.28
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1268.67    1114.64  -1.138 0.268492
## sexF         -872.99    1611.33  -0.542 0.593952
## sexM:age      111.98      29.05   3.855 0.000986 ***
## sexF:age      130.40      30.00   4.347 0.000313 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 180.6 on 20 degrees of freedom
## Multiple R-squared:  0.6435, Adjusted R-squared:   0.59
## F-statistic: 12.03 on 3 and 20 DF,  p-value: 0.000101
##
## Correlation of Coefficients:
##          (Intercept) sexF  sexM:age
## sexF     -0.69
## sexM:age -1.00        0.69
## sexF:age  0.00       -0.72  0.00
anova(li, l1)
## Analysis of Variance Table
##
## Model 1: birthw ~ sex + sex:age
## Model 2: birthw ~ sex + age
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     20 652425
## 2     21 658771 -1   -6346.2 0.1945 0.6639
# 类似，只是使用 glm 命令来拟合而已
summary(zi <- glm(birthw ~ sex + age, family = gaussian()))
##
## Call:
## glm(formula = birthw ~ sex + age, family = gaussian())
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -257.49  -125.28   -58.44   169.00   303.98
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1610.28     786.08  -2.049   0.0532 .
## sexF         -163.04      72.81  -2.239   0.0361 *
## age           120.89      20.46   5.908 7.28e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 31370.04)
##
##     Null deviance: 1829873  on 23  degrees of freedom
## Residual deviance:  658771  on 21  degrees of freedom
## AIC: 321.39
##
## Number of Fisher Scoring iterations: 2
anova(zi)
## Analysis of Deviance Table
##
##
## Response: birthw
##
## Terms added sequentially (first to last)
##
##
##      Df Deviance Resid. Df Resid. Dev
## NULL                    23    1829873
## sex   1    76163        22    1753711
## age   1  1094940        21     658771
# summary(z.o4 <- update(zi, subset = -4))
summary(zz <- update(zi, birthw ~ sex + age + sex:age))
##
## Call:
## glm(formula = birthw ~ sex + age + sex:age, family = gaussian())
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -246.69  -138.11   -39.13   176.57   274.28
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1268.67    1114.64  -1.138 0.268492
## sexF         -872.99    1611.33  -0.542 0.593952
## age           111.98      29.05   3.855 0.000986 ***
## sexF:age       18.42      41.76   0.441 0.663893
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 32621.23)
##
##     Null deviance: 1829873  on 23  degrees of freedom
## Residual deviance:  652425  on 20  degrees of freedom
## AIC: 323.16
##
## Number of Fisher Scoring iterations: 2
anova(zi, zz)
## Analysis of Deviance Table
##
## Model 1: birthw ~ sex + age
## Model 2: birthw ~ sex + age + sex:age
##   Resid. Df Resid. Dev Df Deviance
## 1        21     658771
## 2        20     652425  1   6346.2

## 26.21 火炬松树的生长情况

26.3 记录了 14 颗火炬树种子的生长情况

reshape(Loblolly, idvar = "Seed", timevar = "age",
v.names = "height", direction = "wide", sep = "") %>%
knitr::kable(.,
caption = "火炬松树的高度（英尺）随时间（年）的变化",
row.names = FALSE, col.names = gsub("(height)", "", names(.)),
align = "c"
)

Seed 3 5 10 15 20 25
301 4.51 10.89 28.72 41.74 52.70 60.92
303 4.55 10.92 29.07 42.83 53.88 63.39
305 4.79 11.37 30.21 44.40 55.82 64.10
307 3.91 9.48 25.66 39.07 50.78 59.07
309 4.81 11.20 28.66 41.66 53.31 63.05
311 3.88 9.40 25.99 39.55 51.46 59.64
315 4.32 10.43 27.16 40.85 51.33 60.07
319 4.57 10.57 27.90 41.13 52.43 60.69
321 3.77 9.03 25.45 38.98 49.76 60.28
323 4.33 10.79 28.97 42.44 53.17 61.62
325 4.38 10.48 27.93 40.20 50.06 58.49
327 4.12 9.92 26.54 37.82 48.43 56.81
329 3.93 9.34 26.08 37.79 48.31 56.43
331 3.46 9.05 25.85 39.15 49.12 59.49

26.5 火炬树种子基本决定了树的长势，不同种子预示最后的高度，并且在生长期也是很稳定地生长

p <- ggplot(data = Loblolly, aes(x = age, y = height, color = Seed)) +
geom_point() +
geom_line() +
theme_minimal() +
labs(x = "age (yr)", y = "height (ft)")
p

## 26.22 酶促反应的反应速率

Puromycin 酶促反应的反应速度，模型拟合 ?SSmicmen

ggplot(data = Puromycin, aes(x = conc, y = rate, color = state)) +
geom_point() +
geom_line() +
theme_minimal() +
labs(
x = "Substrate concentration (ppm)",
y = "Reaction velocity (counts/min/min)",
title = "Puromycin data and fitted Michaelis-Menten curves"
)

## 26.23 茶碱的药代动力学

ggplot(data = Theoph, aes(x = Time, y = conc, color = Subject)) +
geom_point() +
geom_line() +
facet_wrap(Wt ~ Dose, ncol = 3, labeller = "label_both") +
theme_minimal() +
labs(
x = "Time since drug administration (hr)",
y = "Theophylline concentration (mg/L)",
title = "Observed concentrations and fitted model"
)
Theoph %>%
transform(., wt_dose = paste(Wt, Dose, sep = "~")) %>%
ggplot(., aes(x = Time, y = conc, color = wt_dose)) +
geom_point() +
geom_line() +
theme_minimal() +
labs(
x = "Time since drug administration (hr)",
y = "Theophylline concentration (mg/L)",
title = "Observed concentrations and fitted model"
)
ggplot(data = Theoph, aes(x = Time, y = conc, color = Subject)) +
geom_point() +
geom_line() +
theme_minimal() +
labs(
x = "Time since drug administration (hr)",
y = "Theophylline concentration (mg/L)",
title = "Observed concentrations and fitted model"
)

## 26.24 本章总结

This is a bit like asking how should I tweak my sailboat so I can explore the ocean floor.

— Roger Koenker 71

## 26.25 运行环境

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
## [1] patchwork_1.1.1 gganimate_1.0.7 ggplot2_3.3.5   magrittr_2.0.1
##
## loaded via a namespace (and not attached):
##  [1] progress_1.2.2    tidyselect_1.1.1  xfun_0.29         bslib_0.3.1
##  [5] purrr_0.3.4       lattice_0.20-45   splines_4.1.2     colorspace_2.0-2
##  [9] vctrs_0.3.8       generics_0.1.1    viridisLite_0.4.0 htmltools_0.5.2
## [13] mgcv_1.8-38       yaml_2.2.1        utf8_1.2.2        rlang_0.4.12
## [17] jquerylib_0.1.4   pillar_1.6.4      glue_1.6.0        withr_2.4.3
## [21] DBI_1.1.2         tweenr_1.0.2      lifecycle_1.0.1   stringr_1.4.0
## [25] munsell_0.5.0     gtable_0.3.0      memoise_2.0.1     evaluate_0.14
## [29] labeling_0.4.2    knitr_1.37        fastmap_1.1.0     curl_4.3.2
## [33] fansi_0.5.0       gifski_1.4.3-1    highr_0.9         Rcpp_1.0.7
## [37] scales_1.1.1      cachem_1.0.6      jsonlite_1.7.2    sysfonts_0.8.5
## [41] farver_2.1.0      fs_1.5.2          hms_1.1.1         digest_0.6.29
## [45] stringi_1.7.6     bookdown_0.24     dplyr_1.0.7       grid_4.1.2
## [49] tools_4.1.2       sass_0.4.0        tibble_3.1.6      crayon_1.4.2
## [53] pkgconfig_2.0.3   downlit_0.4.0     Matrix_1.4-0      ellipsis_0.3.2
## [57] xml2_1.3.3        prettyunits_1.1.1 assertthat_0.2.1  rmarkdown_2.11
## [61] R6_2.5.1          nlme_3.1-153      compiler_4.1.2

### 参考文献

[62]
J. Fox and S. Weisberg, An R companion to applied regression, Third. Thousand Oaks CA: Sage, 2019.Available: https://socialsciences.mcmaster.ca/jfox/Books/Companion/