31.4 分层正态模型

Multilevel Models 多水平模型、Hierarchical Models 层次模型

31.4.1 schools 数据

# 模型编译
mod <- cmdstan_model(stan_file = "code/eight_schools.stan")

# 模型拟合
eight_schools_fit <- mod$sample(
  data = list( # 观测数据
    J = 8,
    y = c(28, 8, -3, 7, -1, 1, 18, 12),
    sigma = c(15, 10, 16, 11, 9, 11, 10, 18)
  ),
  iter_warmup = 1000, # 每条链预处理迭代次数
  iter_sampling = 2000, # 每条链总迭代次数
  chains = 4, # 马尔科夫链的数目
  parallel_chains = 1, # 指定 CPU 核心数,可以给每条链分配一个
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0, # 不显示采样的进度
  seed = 20190425 # 设置随机数种子,不要使用 set.seed() 函数
)
## Running MCMC with 4 sequential chains...
## 
## Chain 1 finished in 0.1 seconds.
## Chain 2 finished in 0.1 seconds.
## Chain 3 finished in 0.1 seconds.
## Chain 4 finished in 0.1 seconds.
## 
## All 4 chains finished successfully.
## Mean chain execution time: 0.1 seconds.
## Total execution time: 0.7 seconds.

模型拟合结果

eight_schools_fit$cmdstan_summary()
## Inference for Stan model: eight_schools_model
## 4 chains: each with iter=(2000,2000,2000,2000); warmup=(0,0,0,0); thin=(1,1,1,1); 8000 iterations saved.
## 
## Warmup took (0.023, 0.026, 0.023, 0.026) seconds, 0.098 seconds total
## Sampling took (0.070, 0.083, 0.083, 0.086) seconds, 0.32 seconds total
## 
##                     Mean     MCSE  StdDev     5%       50%   95%    N_Eff  N_Eff/s    R_hat
## 
## lp__            -4.0e+01  5.4e-02     2.7    -44  -3.9e+01   -36     2447     7600     1.00
## accept_stat__       0.88  1.5e-02    0.20   0.40      0.96   1.0  1.8e+02  5.7e+02  1.0e+00
## stepsize__          0.34  3.2e-02   0.045   0.28      0.33  0.41  2.0e+00  6.2e+00  1.8e+13
## treedepth__          3.5  1.8e-01    0.54    3.0       4.0   4.0  8.4e+00  2.6e+01  1.1e+00
## n_leapfrog__          12  1.3e+00     4.0    7.0        15    15  9.9e+00  3.1e+01  1.1e+00
## divergent__         0.00      nan    0.00   0.00      0.00  0.00      nan      nan      nan
## energy__              45  7.2e-02     3.5     39        44    51  2.4e+03  7.5e+03  1.0e+00
## 
## mu               8.0e+00  8.1e-02     5.0  0.015   7.9e+00    17     3886    12068      1.0
## tau              6.6e+00  1.0e-01     5.6   0.48   5.3e+00    17     3064     9516      1.0
## eta[1]           3.9e-01  1.1e-02    0.95   -1.2   4.2e-01   1.9     7716    23962     1.00
## eta[2]          -4.0e-04  9.5e-03    0.88   -1.4   2.1e-04   1.4     8427    26170     1.00
## eta[3]          -2.0e-01  1.0e-02    0.94   -1.7  -2.0e-01   1.4     8319    25837      1.0
## eta[4]          -3.0e-02  9.7e-03    0.88   -1.5  -2.8e-02   1.4     8220    25528      1.0
## eta[5]          -3.7e-01  1.0e-02    0.88   -1.8  -3.9e-01   1.1     7355    22841      1.0
## eta[6]          -2.2e-01  9.8e-03    0.90   -1.7  -2.5e-01   1.3     8439    26208      1.0
## eta[7]           3.5e-01  1.0e-02    0.88   -1.1   3.7e-01   1.8     7255    22530      1.0
## eta[8]           5.4e-02  1.1e-02    0.93   -1.5   6.6e-02   1.6     7356    22844     1.00
## theta[1]         1.1e+01  1.1e-01     8.4   0.10   1.0e+01    27     5668    17602     1.00
## theta[2]         7.9e+00  6.8e-02     6.4   -2.5   7.9e+00    18     8940    27765      1.0
## theta[3]         6.2e+00  9.3e-02     7.7   -7.5   6.6e+00    18     6961    21617      1.0
## theta[4]         7.7e+00  7.1e-02     6.5   -3.0   7.7e+00    18     8515    26445      1.0
## theta[5]         5.0e+00  7.0e-02     6.4   -6.4   5.5e+00    14     8218    25520      1.0
## theta[6]         6.1e+00  7.3e-02     6.7   -5.7   6.5e+00    16     8504    26409     1.00
## theta[7]         1.1e+01  8.4e-02     6.8   0.92   1.0e+01    23     6537    20301     1.00
## theta[8]         8.5e+00  1.0e-01     7.8   -3.8   8.2e+00    22     5904    18334      1.0
## 
## Samples were drawn using hmc with nuts.
## For each parameter, N_Eff is a crude measure of effective sample size,
## and R_hat is the potential scale reduction factor on split chains (at 
## convergence, R_hat=1).

4 条马尔可夫链,19 个变量,2000 次迭代,轨迹数据如下

eight_schools_fit$draws()
## # A draws_array: 2000 iterations, 4 chains, and 19 variables
## , , variable = lp__
## 
##          chain
## iteration   1   2   3   4
##         1 -42 -39 -38 -40
##         2 -37 -40 -37 -42
##         3 -40 -38 -38 -43
##         4 -39 -39 -40 -43
##         5 -38 -45 -38 -40
## 
## , , variable = mu
## 
##          chain
## iteration    1      2    3    4
##         1  9.0 -3.696  8.9  1.9
##         2  5.9 13.895  1.5 13.5
##         3  6.2  0.013  4.2  1.5
##         4 12.0  2.365 10.6 15.5
##         5  5.8 -7.633  5.2 12.2
## 
## , , variable = tau
## 
##          chain
## iteration    1    2     3     4
##         1 3.20 10.3  7.93  4.11
##         2 9.70  1.9 10.70  0.65
##         3 0.38  6.3  4.31  1.18
##         4 7.19  9.2  0.43 11.03
##         5 4.06  5.6  3.36  8.74
## 
## , , variable = eta[1]
## 
##          chain
## iteration     1     2     3     4
##         1  1.10  1.72  1.23  1.10
##         2 -0.30 -0.98  1.19 -0.74
##         3 -0.31  1.28  0.34 -0.85
##         4  0.97  1.53 -0.86  2.02
##         5  1.04  0.75  1.46  1.24
## 
## # ... with 1995 more iterations, and 15 more variables

提取参数 \(\mu\) 的四条迭代点列

eight_schools_fit$draws("mu")
## # A draws_array: 2000 iterations, 4 chains, and 1 variables
## , , variable = mu
## 
##          chain
## iteration    1      2    3    4
##         1  9.0 -3.696  8.9  1.9
##         2  5.9 13.895  1.5 13.5
##         3  6.2  0.013  4.2  1.5
##         4 12.0  2.365 10.6 15.5
##         5  5.8 -7.633  5.2 12.2
## 
## # ... with 1995 more iterations

eight_schools_fit 是一个 R6 对象,包含整个模型信息

class(eight_schools_fit)
## [1] "CmdStanMCMC" "CmdStanFit"  "R6"
str(eight_schools_fit)
## Classes 'CmdStanMCMC', 'CmdStanFit', 'R6' <CmdStanMCMC>
##   Inherits from: <CmdStanFit>
##   Public:
##     clone: function (deep = FALSE) 
##     cmdstan_diagnose: function () 
##     cmdstan_summary: function (flags = NULL) 
##     data_file: function () 
##     draws: function (variables = NULL, inc_warmup = FALSE, format = getOption("cmdstanr_draws_format", 
##     init: function () 
##     initialize: function (runset) 
##     inv_metric: function (matrix = TRUE) 
##     latent_dynamics_files: function (include_failed = FALSE) 
##     loo: function (variables = "log_lik", r_eff = TRUE, ...) 
##     lp: function () 
##     metadata: function () 
##     num_chains: function () 
##     num_procs: function () 
##     output: function (id = NULL) 
##     output_files: function (include_failed = FALSE) 
##     print: function (variables = NULL, ..., digits = 2, max_rows = getOption("cmdstanr_max_rows", 
##     profile_files: function (include_failed = FALSE) 
##     profiles: function () 
##     return_codes: function () 
##     runset: CmdStanRun, R6
##     sampler_diagnostics: function (inc_warmup = FALSE, format = getOption("cmdstanr_draws_format", 
##     save_data_file: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     save_latent_dynamics_files: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     save_object: function (file, ...) 
##     save_output_files: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     save_profile_files: function (dir = ".", basename = NULL, timestamp = TRUE, random = TRUE) 
##     summary: function (variables = NULL, ...) 
##     time: function () 
##   Private:
##     draws_: -42.0125 -37.337 -40.2068 -38.9824 -38.4394 -38.7721 -35 ...
##     init_: NULL
##     inv_metric_: list
##     metadata_: list
##     read_csv_: function (variables = NULL, sampler_diagnostics = NULL, format = getOption("cmdstanr_draws_format", 
##     sampler_diagnostics_: 3 3 4 3 4 3 3 3 3 3 2 3 3 3 3 3 4 3 3 3 3 2 3 3 3 3 3 3  ...
##     warmup_draws_: NULL
##     warmup_sampler_diagnostics_: NULL

模型诊断:查看迭代点列的平稳性

mcmc_dens(eight_schools_fit$draws(c("mu")))

分层线性模型之生长曲线模型 (Gelfand et al. 1990)

31.4.2 rats 数据

贝叶斯分层图

# 数据准备
# modified code from https://github.com/stan-dev/example-models/tree/master/bugs_examples/vol1/rats
N <- 30
T <- 5
y <- structure(c(
  151, 145, 147, 155, 135, 159, 141, 159, 177, 134,
  160, 143, 154, 171, 163, 160, 142, 156, 157, 152, 154, 139, 146,
  157, 132, 160, 169, 157, 137, 153, 199, 199, 214, 200, 188, 210,
  189, 201, 236, 182, 208, 188, 200, 221, 216, 207, 187, 203, 212,
  203, 205, 190, 191, 211, 185, 207, 216, 205, 180, 200, 246, 249,
  263, 237, 230, 252, 231, 248, 285, 220, 261, 220, 244, 270, 242,
  248, 234, 243, 259, 246, 253, 225, 229, 250, 237, 257, 261, 248,
  219, 244, 283, 293, 312, 272, 280, 298, 275, 297, 350, 260, 313,
  273, 289, 326, 281, 288, 280, 283, 307, 286, 298, 267, 272, 285,
  286, 303, 295, 289, 258, 286, 320, 354, 328, 297, 323, 331, 305,
  338, 376, 296, 352, 314, 325, 358, 312, 324, 316, 317, 336, 321,
  334, 302, 302, 323, 331, 345, 333, 316, 291, 324
), .Dim = c(30, 5))
x <- c(8.0, 15.0, 22.0, 29.0, 36.0)
xbar <- 22.0


# 模型参数设置
chains <- 4
iter <- 1000

init <- rep(list(list(
  alpha = rep(250, 30), beta = rep(6, 30),
  alpha_c = 150, beta_c = 10,
  tausq_c = 1, tausq_alpha = 1,
  tausq_beta = 1
)), chains)
mod <- cmdstan_model(stan_file = "code/rats.stan")

rats_fit <- mod$sample(
  data = list(N = N, T = T, y = y, x = x, xbar = xbar),
  init = init,
  iter_warmup = 1000, # 每条链预处理迭代次数
  iter_sampling = 2000, # 每条链总迭代次数
  chains = chains, # 马尔科夫链的数目
  parallel_chains = 1, # 指定 CPU 核心数,可以给每条链分配一个
  show_messages = FALSE, # 不显示迭代的中间过程
  refresh = 0, # 不显示采样的进度
  seed = 20190425 # 设置随机数种子,不要使用 set.seed() 函数
)
## Running MCMC with 4 sequential chains...
## 
## Chain 1 finished in 0.6 seconds.
## Chain 2 finished in 0.7 seconds.
## Chain 3 finished in 0.7 seconds.
## Chain 4 finished in 0.6 seconds.
## 
## All 4 chains finished successfully.
## Mean chain execution time: 0.6 seconds.
## Total execution time: 2.7 seconds.

参考文献

Gelfand, Alan E., Susan E. Hills, Amy Racine-Poon, and Adrian F. M. Smith. 1990. “Illustration of Bayesian Inference in Normal Data Models Using Gibbs Sampling.” Journal of the American Statistical Association 85 (412): 972–85. https://doi.org/10.1080/01621459.1990.10474968.