31.4 分层正态模型

Multilevel Models 多水平模型、Hierarchical Models 层次模型

31.4.1 schools 数据

# 模型编译
mod <- cmdstan_model(stan_file = "code/eight_schools.stan")

# 模型拟合
eight_schools_fit <- mod$sample(
  data = list( # 观测数据
    J = 8,
    y = c(28, 8, -3, 7, -1, 1, 18, 12),
    sigma = c(15, 10, 16, 11, 9, 11, 10, 18)
  ),
  iter_warmup = 1000, # 每条链预处理迭代次数
  iter_sampling = 2000, # 每条链总迭代次数
  chains = 4, # 马尔科夫链的数目
  parallel_chains = 1, # 指定 CPU 核心数,可以给每条链分配一个
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0, # 不显示采样的进度
  seed = 20190425 # 设置随机数种子,不要使用 set.seed() 函数
)
## Running MCMC with 4 sequential chains...
## 
## Chain 1 finished in 0.1 seconds.
## Chain 2 finished in 0.1 seconds.
## Chain 3 finished in 0.1 seconds.
## Chain 4 finished in 0.1 seconds.
## 
## All 4 chains finished successfully.
## Mean chain execution time: 0.1 seconds.
## Total execution time: 0.5 seconds.

模型拟合结果

eight_schools_fit$cmdstan_summary()
## Inference for Stan model: eight_schools_model
## 4 chains: each with iter=(2000,2000,2000,2000); warmup=(0,0,0,0); thin=(1,1,1,1); 8000 iterations saved.
## 
## Warmup took (0.023, 0.023, 0.026, 0.026) seconds, 0.098 seconds total
## Sampling took (0.070, 0.077, 0.074, 0.071) seconds, 0.29 seconds total
## 
##                     Mean     MCSE  StdDev     5%       50%   95%    N_Eff  N_Eff/s    R_hat
## 
## lp__            -4.0e+01    0.052     2.6    -44  -3.9e+01   -36     2589     8866      1.0
## accept_stat__    8.1e-01  7.2e-03    0.25   0.23      0.92   1.0  1.2e+03  4.0e+03  1.0e+00
## stepsize__       3.9e-01  1.2e-02   0.017   0.37      0.41  0.41  2.0e+00  6.9e+00  6.9e+12
## treedepth__      3.2e+00  9.5e-02    0.49    3.0       3.0   4.0  2.7e+01  9.2e+01  1.0e+00
## n_leapfrog__     1.0e+01  8.2e-01     4.2    7.0       7.0    15  2.6e+01  8.9e+01  1.0e+00
## divergent__      1.8e-03      nan   0.042   0.00      0.00  0.00      nan      nan      nan
## energy__         4.5e+01  6.7e-02     3.4     39        44    51  2.6e+03  8.9e+03  1.0e+00
## 
## mu               7.9e+00     0.12     5.2  -0.21   7.9e+00    16     1936     6631      1.0
## tau              6.6e+00     0.13     5.8   0.49   5.2e+00    18     2086     7145      1.0
## eta[1]           3.9e-01    0.011    0.94   -1.2   4.2e-01   1.9     7207    24682     1.00
## eta[2]           4.5e-03    0.011    0.88   -1.4   4.3e-03   1.4     6098    20884     1.00
## eta[3]          -2.1e-01    0.011    0.93   -1.7  -2.3e-01   1.4     6605    22619     1.00
## eta[4]          -3.3e-02    0.012    0.89   -1.5  -4.3e-02   1.5     5900    20206     1.00
## eta[5]          -3.6e-01    0.011    0.86   -1.7  -3.8e-01   1.1     6220    21303     1.00
## eta[6]          -2.2e-01    0.011    0.89   -1.7  -2.4e-01   1.3     6276    21492      1.0
## eta[7]           3.5e-01    0.011    0.90   -1.2   3.8e-01   1.8     6346    21733      1.0
## eta[8]           5.5e-02    0.012    0.94   -1.5   6.0e-02   1.6     6211    21269     1.00
## theta[1]         1.1e+01     0.12     8.3   0.24   1.0e+01    27     4952    16958     1.00
## theta[2]         7.9e+00    0.073     6.4   -2.6   8.0e+00    18     7581    25963     1.00
## theta[3]         6.2e+00     0.12     7.7   -7.1   6.6e+00    18     4029    13799      1.0
## theta[4]         7.6e+00    0.083     6.5   -3.0   7.7e+00    18     6221    21304      1.0
## theta[5]         5.0e+00    0.089     6.4   -6.5   5.6e+00    14     5146    17622      1.0
## theta[6]         6.1e+00    0.085     6.8   -5.7   6.5e+00    16     6363    21791     1.00
## theta[7]         1.1e+01    0.088     6.9   0.69   1.0e+01    23     6075    20803      1.0
## theta[8]         8.4e+00     0.11     7.8   -3.8   8.3e+00    22     4996    17109     1.00
## 
## Samples were drawn using hmc with nuts.
## For each parameter, N_Eff is a crude measure of effective sample size,
## and R_hat is the potential scale reduction factor on split chains (at 
## convergence, R_hat=1).

4 条马尔可夫链,19 个变量,2000 次迭代,轨迹数据如下

eight_schools_fit$draws()
## # A draws_array: 2000 iterations, 4 chains, and 19 variables
## , , variable = lp__
## 
##          chain
## iteration   1   2   3   4
##         1 -42 -41 -40 -34
##         2 -37 -40 -39 -38
##         3 -40 -43 -48 -39
##         4 -39 -39 -40 -38
##         5 -38 -39 -42 -41
## 
## , , variable = mu
## 
##          chain
## iteration    1    2       3    4
##         1  9.0  7.5 -0.4099  6.5
##         2  5.9  7.5  0.0021 13.8
##         3  6.2  6.9 12.1570  8.1
##         4 12.0 14.7 11.5424  8.5
##         5  5.8 12.8  2.3618 10.0
## 
## , , variable = tau
## 
##          chain
## iteration    1     2    3    4
##         1 3.20  0.88 3.40 13.0
##         2 9.70  7.84 6.39  8.7
##         3 0.38 10.47 0.85  3.9
##         4 7.19  1.73 2.49  1.4
##         5 4.06  1.26 0.44  1.5
## 
## , , variable = eta[1]
## 
##          chain
## iteration     1     2     3     4
##         1  1.10  0.66  1.54  0.81
##         2 -0.30  0.24  1.27  0.48
##         3 -0.31 -0.14 -0.46 -1.01
##         4  0.97 -0.96 -0.53  0.25
##         5  1.04 -0.92 -0.68  1.26
## 
## # ... with 1995 more iterations, and 15 more variables

提取参数 \(\mu\) 的四条迭代点列

eight_schools_fit$draws("mu")
## # A draws_array: 2000 iterations, 4 chains, and 1 variables
## , , variable = mu
## 
##          chain
## iteration    1    2       3    4
##         1  9.0  7.5 -0.4099  6.5
##         2  5.9  7.5  0.0021 13.8
##         3  6.2  6.9 12.1570  8.1
##         4 12.0 14.7 11.5424  8.5
##         5  5.8 12.8  2.3618 10.0
## 
## # ... with 1995 more iterations

eight_schools_fit 是一个 R6 对象,包含整个模型信息

class(eight_schools_fit)
## [1] "CmdStanMCMC" "CmdStanFit"  "R6"
str(eight_schools_fit)
## Classes 'CmdStanMCMC', 'CmdStanFit', 'R6' <CmdStanMCMC>
##   Inherits from: <CmdStanFit>
##   Public:
##     clone: function (deep = FALSE) 
##     cmdstan_diagnose: function (...) 
##     cmdstan_summary: function (...) 
##     data_file: function () 
##     draws: function (variables = NULL, inc_warmup = FALSE) 
##     init: function () 
##     initialize: function (runset) 
##     inv_metric: function (matrix = TRUE) 
##     latent_dynamics_files: function (include_failed = FALSE) 
##     lp: function () 
##     metadata: function () 
##     num_chains: function () 
##     num_procs: function () 
##     output: function (id = NULL) 
##     output_files: function (include_failed = FALSE) 
##     print: function (variables = NULL, ..., digits = 2, max_rows = 10) 
##     return_codes: function () 
##     runset: CmdStanRun, R6
##     sampler_diagnostics: function (inc_warmup = FALSE) 
##     save_data_file: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     save_latent_dynamics_files: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     save_object: function (file, ...) 
##     save_output_files: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     summary: function (variables = NULL, ...) 
##     time: function () 
##   Private:
##     draws_: -42.0125 -37.337 -40.2068 -38.9824 -38.4394 -38.7721 -35 ...
##     init_: NULL
##     inv_metric_: list
##     metadata_: list
##     read_csv_: function (variables = NULL, sampler_diagnostics = NULL) 
##     sampler_diagnostics_: NULL
##     warmup_draws_: NULL
##     warmup_sampler_diagnostics_: NULL

模型诊断:查看迭代点列的平稳性

mcmc_dens(eight_schools_fit$draws(c("mu")))

分层线性模型之生长曲线模型 (Gelfand et al. 1990)

31.4.2 rats 数据

贝叶斯分层图

# 数据准备
# modified code from https://github.com/stan-dev/example-models/tree/master/bugs_examples/vol1/rats
N <- 30
T <- 5
y <- structure(c(
  151, 145, 147, 155, 135, 159, 141, 159, 177, 134,
  160, 143, 154, 171, 163, 160, 142, 156, 157, 152, 154, 139, 146,
  157, 132, 160, 169, 157, 137, 153, 199, 199, 214, 200, 188, 210,
  189, 201, 236, 182, 208, 188, 200, 221, 216, 207, 187, 203, 212,
  203, 205, 190, 191, 211, 185, 207, 216, 205, 180, 200, 246, 249,
  263, 237, 230, 252, 231, 248, 285, 220, 261, 220, 244, 270, 242,
  248, 234, 243, 259, 246, 253, 225, 229, 250, 237, 257, 261, 248,
  219, 244, 283, 293, 312, 272, 280, 298, 275, 297, 350, 260, 313,
  273, 289, 326, 281, 288, 280, 283, 307, 286, 298, 267, 272, 285,
  286, 303, 295, 289, 258, 286, 320, 354, 328, 297, 323, 331, 305,
  338, 376, 296, 352, 314, 325, 358, 312, 324, 316, 317, 336, 321,
  334, 302, 302, 323, 331, 345, 333, 316, 291, 324
), .Dim = c(30, 5))
x <- c(8.0, 15.0, 22.0, 29.0, 36.0)
xbar <- 22.0


# 模型参数设置
chains <- 4
iter <- 1000

init <- rep(list(list(
  alpha = rep(250, 30), beta = rep(6, 30),
  alpha_c = 150, beta_c = 10,
  tausq_c = 1, tausq_alpha = 1,
  tausq_beta = 1
)), chains)
mod <- cmdstan_model(stan_file = "code/rats.stan")

rats_fit <- mod$sample(
  data = list(N = N, T = T, y = y, x = x, xbar = xbar),
  init = init,
  iter_warmup = 1000, # 每条链预处理迭代次数
  iter_sampling = 2000, # 每条链总迭代次数
  chains = chains, # 马尔科夫链的数目
  parallel_chains = 1, # 指定 CPU 核心数,可以给每条链分配一个
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0, # 不显示采样的进度
  seed = 20190425 # 设置随机数种子,不要使用 set.seed() 函数
)
## Running MCMC with 4 sequential chains...
## 
## Chain 1 finished in 0.6 seconds.
## Chain 2 finished in 0.6 seconds.
## Chain 3 finished in 0.7 seconds.
## Chain 4 finished in 0.7 seconds.
## 
## All 4 chains finished successfully.
## Mean chain execution time: 0.7 seconds.
## Total execution time: 2.8 seconds.

参考文献

Gelfand, Alan E., Susan E. Hills, Amy Racine-Poon, and Adrian F. M. Smith. 1990. “Illustration of Bayesian Inference in Normal Data Models Using Gibbs Sampling.” Journal of the American Statistical Association 85 (412): 972–85. https://doi.org/10.1080/01621459.1990.10474968.