diff --git a/.gitignore b/.gitignore index 9c69942e..95150d9c 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ loo.Rproj # produced vignettes vignettes/loo2-lfo_cache/* vignettes/loo2-non-factorizable_cache/* +vignettes/articles-online-only/data-for-vignettes/ +vignettes/articles-online-only/*.html vignettes/*.html vignettes/*.pdf inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index d64412da..775c5a02 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,6 +36,7 @@ Depends: R (>= 3.1.2) Imports: checkmate, + cli, matrixStats (>= 0.52), parallel, posterior (>= 1.7.0), diff --git a/NAMESPACE b/NAMESPACE index 3405d737..dc12a9df 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,15 +21,11 @@ S3method(as.psis_loo,psis_loo) S3method(as.psis_loo,psis_loo_ss) S3method(as.psis_loo_ss,psis_loo) S3method(as.psis_loo_ss,psis_loo_ss) -S3method(crps,matrix) -S3method(crps,numeric) S3method(dim,importance_sampling) S3method(dim,kfold) S3method(dim,loo) S3method(dim,psis_loo) S3method(dim,waic) -S3method(elpd,array) -S3method(elpd,matrix) S3method(importance_sampling,array) S3method(importance_sampling,default) S3method(importance_sampling,matrix) @@ -41,11 +37,9 @@ S3method(loo_approximate_posterior,array) S3method(loo_approximate_posterior,matrix) S3method(loo_compare,default) S3method(loo_compare,psis_loo_ss_list) -S3method(loo_crps,matrix) S3method(loo_model_weights,default) S3method(loo_moment_match,default) S3method(loo_predictive_metric,matrix) -S3method(loo_scrps,matrix) S3method(loo_subsample,"function") S3method(nobs,psis_loo_ss) S3method(plot,loo) @@ -57,8 +51,11 @@ S3method(print,compare.loo_ss) S3method(print,importance_sampling) S3method(print,importance_sampling_loo) S3method(print,kfold) +S3method(print,kfold_pred_measure) S3method(print,loo) +S3method(print,loo_pred_measure) S3method(print,pareto_k_table) +S3method(print,pred_measure) S3method(print,pseudobma_bb_weights) S3method(print,pseudobma_weights) S3method(print,psis) @@ -83,8 +80,6 @@ S3method(relative_eff,array) S3method(relative_eff,default) S3method(relative_eff,importance_sampling) S3method(relative_eff,matrix) -S3method(scrps,matrix) -S3method(scrps,numeric) S3method(sis,array) S3method(sis,default) S3method(sis,matrix) @@ -100,14 +95,19 @@ export(.compute_point_estimate) export(.ndraws) export(.thin_draws) export(E_loo) +export(acc) +export(bacc) +export(brier) export(compare) -export(crps) export(elpd) +export(elpd.array) +export(elpd.matrix) export(example_loglik_array) export(example_loglik_matrix) export(extract_log_lik) export(find_model_names) export(gpdfit) +export(insample_pred_measure) export(is.kfold) export(is.loo) export(is.psis) @@ -116,6 +116,7 @@ export(is.sis) export(is.tis) export(is.waic) export(kfold) +export(kfold_pred_measure) export(kfold_split_grouped) export(kfold_split_random) export(kfold_split_stratified) @@ -128,17 +129,19 @@ export(loo_approximate_posterior.array) export(loo_approximate_posterior.function) export(loo_approximate_posterior.matrix) export(loo_compare) -export(loo_crps) export(loo_i) export(loo_model_weights) export(loo_model_weights.default) export(loo_moment_match) export(loo_moment_match.default) +export(loo_pred_measure) export(loo_predictive_metric) -export(loo_scrps) export(loo_subsample) export(loo_subsample.function) +export(mae) export(mcse_loo) +export(mlpd) +export(mse) export(nlist) export(obs_idx) export(pareto_k_ids) @@ -146,15 +149,22 @@ export(pareto_k_influence_values) export(pareto_k_table) export(pareto_k_values) export(pointwise) +export(pred_measure) export(print_dims) export(pseudobma_weights) export(psis) export(psis_n_eff_values) export(psislw) +export(ptw_log_pred_density) +export(r2) export(relative_eff) -export(scrps) +export(rmse) +export(rps) export(sis) +export(srps) export(stacking_weights) +export(supported_measures_list) +export(test_pred_measure) export(tis) export(waic) export(waic.array) diff --git a/R/crps.R b/R/crps.R deleted file mode 100644 index ff1c9e25..00000000 --- a/R/crps.R +++ /dev/null @@ -1,218 +0,0 @@ -#' Continuously ranked probability score -#' -#' The `crps()` and `scrps()` functions and their `loo_*()` counterparts can be -#' used to compute the continuously ranked probability score (CRPS) and scaled -#' CRPS (SCRPS) (as defined by Bolin and Wallin, 2023). CRPS is a proper scoring rule, and -#' strictly proper when the first moment of the predictive distribution is -#' finite. Both can be expressed in terms of samples form the predictive -#' distribution. See, for example, a paper by Gneiting and Raftery (2007) -#' for a comprehensive discussion on CRPS. -#' -#' To compute (S)CRPS, the user needs to provide two sets of draws, `x` and -#' `x2`, from the predictive distribution. This is due to the fact that formulas -#' used to compute CRPS involve an expectation of the absolute difference of `x` -#' and `x2`, both having the same distribution. See the `permutations` argument, -#' as well as Gneiting and Raftery (2007) for details. -#' -#' @export -#' @param x A `S` by `N` matrix (draws by observations), or a vector of length -#' `S` when only single observation is provided in `y`. -#' @param x2 Independent draws from the same distribution as draws in `x`. -#' Should be of the identical dimension. -#' @param y A vector of observations or a single value. -#' @param permutations An integer, with default value of 1, specifying how many -#' times the expected value of |X - X'| (`|x - x2|`) is computed. The row -#' order of `x2` is shuffled as elements `x` and `x2` are typically drawn -#' given the same values of parameters. This happens, e.g., when one calls -#' `posterior_predict()` twice for a fitted \pkg{rstanarm} or \pkg{brms} -#' model. Generating more permutations is expected to decrease the variance of -#' the computed expected value. -#' @param ... Passed on to [E_loo()] in the `loo_*()` version of these -#' functions. -#' -#' @return A list containing two elements: `estimates` and `pointwise`. -#' The former reports estimator and standard error and latter the pointwise -#' values. Following Bolin & Wallin (2023), a larger value is better. -#' -#' @examples -#' \dontrun{ -#' # An example using rstanarm -#' library(rstanarm) -#' data("kidiq") -#' fit <- stan_glm(kid_score ~ mom_hs + mom_iq, data = kidiq) -#' ypred1 <- posterior_predict(fit) -#' ypred2 <- posterior_predict(fit) -#' crps(ypred1, ypred2, y = fit$y) -#' loo_crps(ypred1, ypred2, y = fit$y, log_lik = log_lik(fit)) -#' } -#' -#' @references -#' Bolin, D., & Wallin, J. (2023). Local scale invariance and robustness of -#' proper scoring rules. Statistical Science, 38(1):140-159. -#' -#' Gneiting, T., & Raftery, A. E. (2007). Strictly Proper Scoring Rules, -#' Prediction, and Estimation. Journal of the American Statistical Association, -#' 102(477), 359–378. -crps <- function(x, ...) { - UseMethod("crps") -} - - -#' @rdname crps -#' @export -scrps <- function(x, ...) { - UseMethod("scrps") -} - -#' @rdname crps -#' @export -loo_crps <- function(x, ...) { - UseMethod("loo_crps") -} - -#' @rdname crps -#' @export -loo_scrps <- function(x, ...) { - UseMethod("loo_scrps") -} - - -#' @rdname crps -#' @export -crps.matrix <- function(x, x2, y, ..., permutations = 1) { - validate_crps_input(x, x2, y) - repeats <- replicate(permutations, EXX_compute(x, x2), simplify = F) - EXX <- Reduce(`+`, repeats) / permutations - EXy <- colMeans(abs(sweep(x, 2, y))) - crps_output(.crps_fun(EXX, EXy)) -} - - -#' Method for a single data point -#' @rdname crps -#' @export -crps.numeric <- function(x, x2, y, ..., permutations = 1) { - stopifnot(length(x) == length(x2), - length(y) == 1) - crps.matrix(as.matrix(x), as.matrix(x2), y, permutations) -} - - -#' @rdname crps -#' @export -#' @param log_lik A log-likelihood matrix the same size as `x`. -#' @param r_eff An optional vector of relative effective sample size estimates -#' containing one element per observation. See [psis()] for details. -#' @param cores The number of cores to use for parallelization of `[psis()]`. -#' See [psis()] for details. -loo_crps.matrix <- - function(x, - x2, - y, - log_lik, - ..., - permutations = 1, - r_eff = 1, - cores = getOption("mc.cores", 1)) { - validate_crps_input(x, x2, y, log_lik) - repeats <- replicate(permutations, - EXX_loo_compute(x, x2, log_lik, r_eff = r_eff, ...), - simplify = F) - EXX <- Reduce(`+`, repeats) / permutations - psis_obj <- psis(-log_lik, r_eff = r_eff, cores = cores) - EXy <- E_loo(abs(sweep(x, 2, y)), psis_obj, log_ratios = -log_lik, ...)$value - crps_output(.crps_fun(EXX, EXy)) -} - - -#' @rdname crps -#' @export -scrps.matrix <- function(x, x2, y, ..., permutations = 1) { - validate_crps_input(x, x2, y) - repeats <- replicate(permutations, EXX_compute(x, x2), simplify = F) - EXX <- Reduce(`+`, repeats) / permutations - EXy <- colMeans(abs(sweep(x, 2, y))) - crps_output(.crps_fun(EXX, EXy, scale = TRUE)) -} - -#' @rdname crps -#' @export -scrps.numeric <- function(x, x2, y, ..., permutations = 1) { - stopifnot(length(x) == length(x2), - length(y) == 1) - scrps.matrix(as.matrix(x), as.matrix(x2), y, permutations) -} - - -#' @rdname crps -#' @export -loo_scrps.matrix <- - function( - x, - x2, - y, - log_lik, - ..., - permutations = 1, - r_eff = 1, - cores = getOption("mc.cores", 1)) { - validate_crps_input(x, x2, y, log_lik) - repeats <- replicate(permutations, - EXX_loo_compute(x, x2, log_lik, r_eff = r_eff, ...), - simplify = F) - EXX <- Reduce(`+`, repeats) / permutations - psis_obj <- psis(-log_lik, r_eff = r_eff, cores = cores) - EXy <- E_loo(abs(sweep(x, 2, y)), psis_obj, log_ratios = -log_lik, ...)$value - crps_output(.crps_fun(EXX, EXy, scale = TRUE)) -} - -# ------------ Internals ---------------- - - -EXX_compute <- function(x, x2) { - S <- nrow(x) - colMeans(abs(x - x2[sample(1:S),])) -} - - -EXX_loo_compute <- function(x, x2, log_lik, r_eff = 1, ...) { - S <- nrow(x) - shuffle <- sample (1:S) - x2 <- x2[shuffle,] - log_lik2 <- log_lik[shuffle,] - psis_obj_joint <- psis(-log_lik - log_lik2 , r_eff = r_eff) - E_loo(abs(x - x2), psis_obj_joint, log_ratios = -log_lik - log_lik2, ...)$value -} - - -#' Function to compute crps and scrps -#' @noRd -.crps_fun <- function(EXX, EXy, scale = FALSE) { - if (scale) return(-EXy/EXX - 0.5 * log(EXX)) - 0.5 * EXX - EXy -} - -#' Compute output data for crps functions -#' @noRd -crps_output <- function(crps_pw) { - n <- length(crps_pw) - out <- list() - out$estimates <- c(mean(crps_pw), sd(crps_pw) / sqrt(n)) - names(out$estimates) <- c('Estimate', 'SE') - out$pointwise <- crps_pw - out -} - -#' Validate input of CRPS functions -#' -#' Check that predictive draws and observed data are of compatible shape -#' @noRd -validate_crps_input <- function(x, x2, y, log_lik = NULL) { - stopifnot(is.numeric(x), - is.numeric(x2), - is.numeric(y), - identical(dim(x), dim(x2)), - ncol(x) == length(y), - ifelse(is.null(log_lik), TRUE, identical(dim(log_lik), dim(x))) - ) -} diff --git a/R/elpd.R b/R/elpd.R index 20724e71..a132ed71 100644 --- a/R/elpd.R +++ b/R/elpd.R @@ -4,7 +4,6 @@ #' pointwise predictive density for a new dataset or the log pointwise #' predictive density of the observed data (an overestimate of the elpd). #' -#' @export #' @param x A log-likelihood array or matrix. The **Methods (by class)** #' section, below, has detailed descriptions of how to specify the inputs for #' each method. @@ -19,8 +18,8 @@ #' #' @examples #' # Calculate the lpd of the observed data -#' LLarr <- example_loglik_array() -#' elpd(LLarr) +#' # LLarr <- example_loglik_array() +#' # elpd(LLarr) #' elpd <- function(x, ...) { UseMethod("elpd") diff --git a/R/measures.R b/R/measures.R new file mode 100644 index 00000000..9caab4d9 --- /dev/null +++ b/R/measures.R @@ -0,0 +1,821 @@ +## Density scores ------------------------------------------------------------- + +#' Pointwise log predictive density (`lppd_i`) +#' +#' Computes pointwise log predictive density contributions from a matrix of +#' log predictive densities/probabilities for posterior draws. When PSIS +#' log-weights are supplied, they are used to form a weighted log-sum-exp per +#' observation. +#' +#' @param ylp A numeric matrix of log predictive densities/probabilities with +#' dimensions draws x observations. +#' @param psis_log_weights Optional numeric matrix of normalized PSIS log +#' weights with the same dimensions as `ylp`. Each column must sum to 1 on +#' the probability scale. +#' +#' @returns A numeric vector of length `ncol(ylp)` with pointwise log +#' predictive density values. +#' +#' @examples +#' ylp <- matrix(log(c(0.2, 0.4, 0.3, 0.8)), nrow = 2) +#' ptw_log_pred_density(ylp) +#' +#' lw <- matrix(log(c(0.7, 0.3, 0.6, 0.4)), nrow = 2) +#' ptw_log_pred_density(ylp, lw) +#' @export +ptw_log_pred_density <- function(ylp, psis_log_weights = NULL) { + .validate_numeric_matrix(ylp, arg = "ylp") + n_draws <- dim(ylp)[1] + + if (is.null(psis_log_weights)) { + return(matrixStats::colLogSumExps(ylp) - log(n_draws)) + } + .validate_numeric_matrix( + psis_log_weights, + arg = "psis_log_weights", + nrow = n_draws, + ncol = ncol(ylp) + ) + + col_sums <- colSums(exp(psis_log_weights)) + if (!isTRUE(all.equal(col_sums, rep(1, ncol(psis_log_weights))))) { + cli::cli_abort(c( + "'psis_log_weights' must be normalized (column sums equal to 1).", + "i" = "Range of current column sums: [{min(col_sums)}, {max(col_sums)}]." + )) + } + + matrixStats::colLogSumExps(ylp + psis_log_weights) +} + +#' Expected log pointwise predictive density (`elpd`) +#' +#' Computes ELPD as the sum of pointwise log predictive density contributions +#' (lppd_i). +#' +#' @param ylp A numeric matrix of log predictive densities/probabilities with +#' dimensions draws x observations. +#' @param log_weights Optional numeric matrix of unnormalized log-weights +#' with dimensions draws x observations. +#' @param pointwise Optional numeric vector of precomputed pointwise +#' contributions. If provided, `ylp` and `log_weights` are ignored. +#' @param revert_sign Logical; if `TRUE`, multiply the estimate and pointwise +#' values by -1 before returning. +#' +#' @returns A named list with elements `estimate` (sum over lppd_i), `se` +#' (standard error across lppd_i), and `pointwise` (lppd_i). +#' +#' @examples +#' ylp <- matrix(log(c(0.2, 0.4, 0.3, 0.8)), nrow = 2) +#' elpd(ylp) +#' @export +elpd <- function( + ylp, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .validate_numeric_vector(pointwise, arg = "pointwise") + .inform_ignored_inputs( + pointwise, + ignored_args = list(ylp = ylp, log_weights = log_weights), + fun_name = "elpd" + ) + lppd_i <- pointwise + } else { + .validate_numeric_matrix(ylp, arg = "ylp") + if (!is.null(log_weights)) { + log_weights <- .normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(ylp), + n_obs = ncol(ylp) + ) + } + lppd_i <- ptw_log_pred_density(ylp, log_weights) + } + + if (length(lppd_i) == 1L) { + cli::cli_warn("Only one pointwise value supplied; standard error is set to 0.") + } + + res <- list( + estimate = sum(lppd_i), + se = if (length(lppd_i) == 1L) 0 else sqrt(length(lppd_i) * var(lppd_i)), + pointwise = lppd_i + ) + + .measure_output(res, revert_sign) +} + +#' Mean log pointwise predictive density (`mlpd`) +#' +#' Computes MLPD as the average of pointwise log predictive density (lppd_i) +#' values. Inputs follow the same conventions as [elpd()]. +#' +#' @param ylp A numeric matrix of log predictive densities/probabilities with +#' dimensions draws x observations. +#' @param log_weights Optional numeric matrix of unnormalized log-weights +#' with dimensions draws x observations. +#' @param pointwise Optional numeric vector of precomputed pointwise +#' contributions. If provided, `ylp` and `log_weights` are ignored. +#' @param revert_sign Logical; if `TRUE`, multiply the estimate and pointwise +#' values by -1 before returning. +#' +#' @returns A named list with elements `estimate` (mean over lppd_i), `se` +#' (standard error), and `pointwise` (lppd_i). +#' +#' @examples +#' ylp <- matrix(log(c(0.2, 0.4, 0.3, 0.8)), nrow = 2) +#' mlpd(ylp) +#' @export +mlpd <- function( + ylp, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .validate_numeric_vector(pointwise, arg = "pointwise") + .inform_ignored_inputs( + pointwise, + ignored_args = list(ylp = ylp, log_weights = log_weights), + fun_name = "mlpd" + ) + lppd_i <- pointwise + } else { + .validate_numeric_matrix(ylp, arg = "ylp") + if (!is.null(log_weights)) { + log_weights <- .normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(ylp), + n_obs = ncol(ylp) + ) + } + lppd_i <- ptw_log_pred_density(ylp, log_weights) + } + + n_obs <- length(lppd_i) + if (n_obs == 1L) { + cli::cli_warn("Only one pointwise value supplied; standard error is set to 0.") + } + + res <- list( + estimate = sum(lppd_i) / n_obs, + se = if (n_obs == 1L) 0 else sqrt(n_obs * var(lppd_i)) / n_obs, + pointwise = lppd_i + ) + .measure_output(res, revert_sign) +} + +#' Classification accuracy (`acc`) +#' +#' Computes pointwise and average classification accuracy for binary or +#' multiclass outcomes from posterior predictive class assignments. For binary +#' outcomes, each draw is thresholded at 0.5. For multiclass outcomes, each +#' draw is mapped to the most likely category via `which.max()`. +#' +#' @param y An integer vector of observed class labels. +#' @param mupred A numeric array of posterior predictive means. For binary +#' outcomes use a draws x observations matrix. For multiclass outcomes use a +#' draws x observations x categories array. +#' @param log_weights Optional numeric matrix of unnormalized log-weights +#' with dimensions draws x observations. +#' @param pointwise Optional numeric vector of precomputed pointwise accuracy +#' contributions. If provided, `y`, `mupred`, and `log_weights` are ignored. +#' @param revert_sign Logical; if `TRUE`, multiply the estimate and pointwise +#' values by -1 before returning. +#' +#' @returns A named list with elements `estimate` (mean accuracy), `se` +#' (standard error), and `pointwise` (pointwise accuracy). +#' +#' @examples +#' y <- c(1L, 0L, 1L) +#' mupred <- matrix(c(0.8, 0.3, 0.7, 0.6, 0.4, 0.9), nrow = 2) +#' loo::acc(y, mupred) +#' @export +acc <- function( + y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .inform_ignored_inputs( + pointwise, + ignored_args = list(y = y, mupred = mupred, log_weights = log_weights), + fun_name = "acc" + ) + acc_i <- pointwise + } else { + .validate_numeric_vector(y, arg = "y") + if (!is.numeric(mupred) || (length(dim(mupred)) != 2 && length(dim(mupred)) != 3)) { + cli::cli_abort("{.arg mupred} must be a numeric matrix or 3D numeric array.") + } + .validate_probs(mupred, arg = "mupred") + + if (!is.null(log_weights)) { + weights <- exp(.normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(mupred), + n_obs = dim(mupred)[2] + )) + } else { + weights <- rep(1 / nrow(mupred), nrow(mupred)) + } + + if (length(dim(mupred)) == 3) { + # Multiclass: (draws × obs × categories) > argmax over categories + ypred <- apply(mupred, c(1, 2), which.max) + } else { + .validate_numeric_matrix(mupred, arg = "mupred") + # Binary: (draws × obs) > threshold at 0.5 + ypred <- (mupred > 0.5) * 1L + } + acc_i <- colSums(sweep(ypred, 2, y, `==`) * weights) + } + + res <- list( + estimate = mean(acc_i), + se = sqrt(var(acc_i) / length(acc_i)), + pointwise = acc_i + ) + .measure_output(res, revert_sign) +} + +#' Balanced classification accuracy (`bacc`) +#' +#' Computes balanced accuracy by averaging class-specific mean accuracy, giving +#' each observed class equal weight regardless of class frequency. +#' +#' @inheritParams acc +#' +#' @returns A named list with elements `estimate` (balanced accuracy), `se` +#' (standard error based on balanced pointwise contributions), and +#' `pointwise` (pointwise balanced accuracy terms). +#' +#' @examples +#' y <- c(1L, 1L, 2L, 2L) +#' mupred <- array( +#' c(0.8, 0.2, 0.7, 0.3, 0.3, 0.7, 0.2, 0.8), +#' dim = c(1, 4, 2) +#' ) +#' bacc(y, mupred) +#' @export +bacc <- function( + y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .inform_ignored_inputs( + pointwise, + ignored_args = list(mupred = mupred, log_weights = log_weights), + fun_name = "bacc" + ) + acc_i <- pointwise + } else { + .validate_numeric_vector(y, arg = "y") + classes <- sort(unique(y)) + K <- length(classes) + if (K < 2) { + cli::cli_abort("{.fn bacc} requires at least two outcome classes.") + } + if (!is.numeric(mupred) || (length(dim(mupred)) != 2 && length(dim(mupred)) != 3)) { + cli::cli_abort("{.arg mupred} must be a numeric matrix or 3D numeric array.") + } + .validate_probs(mupred, arg = "mupred") + + if (!is.null(log_weights)) { + weights <- exp(.normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(mupred), + n_obs = dim(mupred)[2] + )) + } else { + weights <- rep(1 / nrow(mupred), nrow(mupred)) + } + if (length(dim(mupred)) == 3) { + # Multiclass: (draws × obs × categories) > argmax over categories + ypred <- apply(mupred, c(1, 2), which.max) + } else { + .validate_numeric_matrix(mupred, arg = "mupred") + # Binary: (draws × obs) > threshold at 0.5 + ypred <- (mupred > 0.5) * 1L + } + acc_i <- colSums(sweep(ypred, 2, y, `==`) * weights) + } + + acc_c <- vapply(classes, function(c) mean(acc_i[y == c]), numeric(1)) + n_c <- tabulate(match(y, classes)) + bacc_i <- acc_i / (K * n_c[match(y, classes)]) + + res <- list( + estimate = mean(acc_c), + se = sqrt(var(bacc_i) / length(bacc_i)), + pointwise = acc_i + ) + .measure_output(res, revert_sign) +} + +#' Brier score (`brier`) +#' +#' Computes the Brier score for binary outcomes as squared error between the +#' observed label and predicted event probability. +#' +#' @param y A numeric vector of binary outcomes coded as 0 or 1. +#' @param ypred A numeric matrix of posterior predictive probabilities with +#' dimensions draws x observations. +#' @param log_weights Optional numeric matrix of unnormalized log-weights +#' with dimensions draws x observations. +#' @param pointwise Optional numeric vector of precomputed pointwise Brier +#' scores. If provided, `y`, `ypred`, and `log_weights` are ignored. +#' @param revert_sign Logical; if `TRUE`, multiply the estimate and pointwise +#' values by -1 before returning. +#' +#' @returns A named list with elements `estimate` (mean Brier score), `se` +#' (standard error), and `pointwise` (pointwise brier score). +#' +#' @examples +#' y <- c(1, 0, 1) +#' ypred <- matrix(c(0.8, 0.2, 0.7, 0.9, 0.4, 0.6), nrow = 2) +#' brier(y, ypred) +#' @export +brier <- function( + y, ypred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .inform_ignored_inputs( + pointwise, + ignored_args = list(y = y, ypred = ypred, log_weights = log_weights), + fun_name = "brier" + ) + bs_i <- pointwise + } else { + .validate_numeric_vector(y, arg = "y") + if (any(y != 0 & y != 1)) { + cli::cli_abort(c( + "The brier score expects binary data 'y'.", + "i" = "Observed range: [{min(y)}, {max(y)}]", + "x" = "All elements of {.arg y} must be 0 or 1." + )) + } + .validate_numeric_matrix(ypred, arg = "ypred", ncol = length(y)) + .validate_probs(ypred, arg = "ypred") + if (!is.null(log_weights)) { + weights <- exp(.normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(ypred), + n_obs = ncol(ypred) + )) + prob_i <- colSums(ypred * weights) + } else { + prob_i <- colMeans(ypred) + } + bs_i <- (prob_i - y)^2 + } + + res <- list( + estimate = mean(bs_i), + se = sqrt(var(bs_i) / length(bs_i)), + pointwise = bs_i + ) + .measure_output(res, revert_sign) +} + +#' Mean absolute error (`mae`) +#' +#' Computes MAE between observed outcomes and posterior predictive point +#' predictions. Point predictions are obtained by averaging `mupred` draws, or +#' by PSIS-weighted averaging when `log_weights` is provided. +#' +#' @param y A numeric vector of observed outcomes. +#' @param mupred A numeric matrix of posterior expected predictions with +#' dimensions draws x observations. A length-`n` vector is also accepted. +#' @param log_weights Optional numeric matrix of unnormalized log-weights +#' with dimensions draws x observations. +#' @param pointwise Optional numeric vector of precomputed pointwise absolute +#' errors. If provided, `y`, `mupred`, and `log_weights` are ignored. +#' @param revert_sign Logical; if `TRUE`, multiply the estimate and pointwise +#' values by -1 before returning. +#' +#' @returns A named list with elements `estimate` (mean absolute error), `se` +#' (standard error), and `pointwise` (pointwise absolute errors). +#' +#' @examples +#' y <- c(1, 2, 3) +#' mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +#' mae(y, mupred) +#' @export +mae <- function( + y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .inform_ignored_inputs( + pointwise, + ignored_args = list(mupred = mupred, log_weights = log_weights), + fun_name = "mae" + ) + mae_i <- pointwise + } else { + .validate_numeric_vector(y, arg = "y") + if (!is.null(mupred) && !is.matrix(mupred)){ + .validate_numeric_vector(mupred, arg = "mupred", len = length(y)) + cli::cli_inform( + "Coercing {.arg mupred} from vector to 1 x n matrix for {.fn mae}." + ) + mupred <- matrix(mupred, nrow = 1, ncol = length(mupred)) + } + .validate_numeric_matrix(mupred, arg = "mupred", ncol = length(y)) + if (is.null(log_weights)) { + mae_i <- abs(y - colMeans(mupred)) + } else { + weights <- exp(.normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(mupred), + n_obs = ncol(mupred) + )) + mae_i <- abs(y - colSums(weights * mupred)) + } + } + + res <- list( + estimate = mean(mae_i), + se = sqrt(var(mae_i) / length(mae_i)), + pointwise = mae_i + ) + .measure_output(res, revert_sign) +} + +#' Mean squared error (`mse`) +#' +#' Computes MSE between observed outcomes and posterior predictive point +#' predictions. Point predictions are obtained by averaging `mupred` draws, or +#' by PSIS-weighted averaging when `log_weights` is provided. +#' +#' @inheritParams mae +#' +#' @returns A named list with elements `estimate` (mean squared error), `se` +#' (standard error), and `pointwise` (pointwise squared errors). +#' +#' @examples +#' y <- c(1, 2, 3) +#' mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +#' mse(y, mupred) +#' @export +mse <- function( + y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + if (!is.null(pointwise)) { + .inform_ignored_inputs( + pointwise, + ignored_args = list(mupred = mupred, log_weights = log_weights), + fun_name = "mse" + ) + sqe_i <- pointwise + } else { + .validate_numeric_vector(y, arg = "y") + if (!is.null(mupred) && !is.matrix(mupred)){ + .validate_numeric_vector(mupred, arg = "mupred", len = length(y)) + cli::cli_inform( + "Coercing {.arg mupred} from vector to 1 x n matrix for {.fn mse}." + ) + mupred <- matrix(mupred, nrow = 1, ncol = length(mupred)) + } + .validate_numeric_matrix(mupred, arg = "mupred", ncol = length(y)) + if (is.null(log_weights)) { + sqe_i <- (y - colMeans(mupred))^2 + } else { + weights <- exp(.normalize_and_validate_log_weights( + log_weights = log_weights, + n_draws = nrow(mupred), + n_obs = ncol(mupred) + )) + sqe_i <- (y - colSums(weights * mupred))^2 + } + } + + res <- list( + estimate = mean(sqe_i), + se = sqrt(var(sqe_i) / length(sqe_i)), + pointwise = sqe_i + ) + .measure_output(res, revert_sign) +} + +#' Root mean squared error (`rmse`) +#' +#' Computes RMSE as the square root of MSE and propagates uncertainty via a +#' first-order delta-method approximation. +#' +#' @inheritParams mae +#' +#' @returns A named list with elements `estimate` (root mean squared error), +#' `se` (standard error), and `pointwise` (pointwise squared errors). +#' +#' @examples +#' y <- c(1, 2, 3) +#' mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +#' rmse(y, mupred) +#' @export +rmse <- function( + y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + mse_res <- mse( + y = y, mupred = mupred, log_weights = log_weights, + pointwise = pointwise + ) + sqe_i <- mse_res$pointwise + rmse_est <- sqrt(mse_res$estimates[1]) + if (rmse_est == 0) { + rmse_se <- 0 + } else { + rmse_se <- mse_res$estimates[2] / (2 * rmse_est) + } + + res <- list( + estimate = rmse_est, + se = rmse_se, + pointwise = sqe_i + ) + .measure_output(res, revert_sign) +} + +#' Predictive R-squared (`r2`) +#' +#' Computes predictive R-squared as one minus the ratio of prediction MSE to +#' the empirical variance of `y`. The standard error is computed with a +#' first-order delta-method approximation. +#' +#' @inheritParams mae +#' +#' @returns A named list with elements `estimate` (predictive R2), `se` +#' (standard error), and `pointwise` (pointwise squared errors). +#' +#' @examples +#' y <- c(1, 2, 3) +#' mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +#' r2(y, mupred) +#' @export +r2 <- function( + y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE +) { + .validate_numeric_vector(y, arg = "y") + if (var(y) == 0) { + cli::cli_abort( + "{.fn r2} is undefined when {.arg y} has zero variance." + ) + } + + mse_res <- mse( + y = y, + mupred = mupred, + log_weights = log_weights, + pointwise = pointwise + ) + mse_hat <- mse_res$estimate + sqe_i <- mse_res$pointwise + n_obs <- length(sqe_i) + + mse_y_i <- (y - mean(y))^2 + mse_y_hat <- mean(mse_y_i) + + var_mse_hat <- mse_res$se^2 + cov_mse_msey <- stats::cov(sqe_i, mse_y_i) / n_obs + var_mse_y_hat <- var(mse_y_i) / n_obs + + t1 <- var_mse_hat + t2 <- -2 * (mse_hat / mse_y_hat) * cov_mse_msey + t3 <- (mse_hat^2 / mse_y_hat^2) * var_mse_y_hat + se_r2 <- sqrt(t1 + t2 + t3) * (1 / mse_y_hat) + + est_r2 <- 1 - mse_hat / mse_y_hat + + res <- list( + estimate = est_r2, + se = se_r2, + pointwise = sqe_i + ) + .measure_output(res, revert_sign) +} + +#' Ranked Probability Score (RPS, SRPS, CRPS, SCRPS) +#' +#' Computes proper scoring rules based on the ranked probability score family, +#' covering both discrete and continuous outcomes, with optional scaling. +#' The specific score computed depends on the type of `y` and `ypred` and the +#' value of `scaled`: +#' +#' | `y`/`ypred` type | `scaled = FALSE` | `scaled = TRUE` | +#' |------------------|-----------------|-----------------| +#' | Discrete | RPS | SRPS | +#' | Continuous | CRPS | SCRPS | +#' +#' **Scoring rules:** +#' +#' - **RPS** (Epstein, 1969): Compares predictive and observed cumulative +#' distributions for ordered discrete outcomes. +#' - **CRPS** (Matheson & Winkler, 1976; Gneiting & Raftery, 2007): Generalizes +#' RPS to continuous outcomes. Defined as +#' \deqn{\mathrm{CRPS}(X; y) = \frac{1}{2} E[|X - X'|] - E[|X - y|],} +#' where \eqn{X, X'} are independent draws from the predictive distribution. +#' - **SRPS/SCRPS** (Bolin & Wallin, 2023): Scaled variants that are invariant +#' to the scale of the predictive distribution. Defined as +#' \deqn{\mathrm{SCRPS}(X; y) = -\frac{E[|X - y|]}{E[|X - X'|]} - +#' \frac{1}{2} \log E[|X - X'|].} +#' +#' **Estimation:** +#' +#' Scores are estimated using the probability-weighted moment (PWM) estimator +#' (Taillardat et al., 2016; Zamo & Naveau, 2018), which requires only a single +#' set of predictive draws unlike permutation-based estimators, which require +#' two independent draw sets. The PWM estimator is unbiased and generally more +#' accurate than single-permutation estimators. The same estimator is used for +#' both discrete and continuous outcomes; see Hosking (1990, 1996) for +#' theoretical justification in the discrete case. +#' +#' If log-weights (`log_weights`) are provided (e.g., PSIS weights +#' for LOO cross-validation), a weighted PWM estimator is used instead, which +#' accounts for the importance weights when estimating expectations. +#' +#' **Sign convention:** +#' +#' Unscaled scores are returned as utilities (higher is better), consistent +#' with the sign convention of log score / ELPD. Scaled scores are negatively +#' oriented (lower is better). Use `revert_sign = TRUE` to obtain the +#' loss convention (lower is better) used in some references. +#' +#' @param y A numeric vector of \eqn{n} observed outcomes. May be integer-valued +#' (for RPS/SRPS) or continuous (for CRPS/SCRPS). +#' @param ypred A numeric matrix of posterior predictive draws with dimensions +#' \eqn{S \times n} (draws × observations). +#' @param log_weights Optional numeric matrix of log-importance- +#' weights with dimensions \eqn{S \times n}. When provided, a weighted PWM +#' estimator is used. Useful for LOO cross-validation via importance sampling +#' (e.g., PSIS-LOO). +#' @param scaled Logical; if `TRUE`, computes the scaled variant (SRPS for +#' discrete outcomes, SCRPS for continuous outcomes). Default is `FALSE`. +#' @param revert_sign Logical; if `TRUE`, multiplies scores by \eqn{-1} to +#' return the loss convention (lower is better) rather than the utility +#' convention. Default is `FALSE`. +#' +#' @return A named list with: +#' \describe{ +#' \item{`estimate`}{Mean score across all observations.} +#' \item{`se`}{Standard error of the mean score.} +#' \item{`pointwise`}{Numeric vector of length \eqn{n} with per-observation +#' scores.} +#' } +#' +#' @examples +#' # Discrete outcomes: RPS +#' y <- c(2L, 1L, 3L) +#' ypred <- matrix(c(2, 1, 2, 3, 1, 3), nrow = 2) +#' rps(y, ypred) +#' +#' # Discrete outcomes: SRPS (scaled) +#' rps(y, ypred, scaled = TRUE) +#' +#' # Continuous outcomes: CRPS +#' y_cont <- c(0.5, -1.2, 2.3) +#' ypred_cont <- matrix(rnorm(200), nrow = 100, ncol = 3) +#' rps(y_cont, ypred_cont) +#' +#' # With importance weights: LOO-CRPS +#' log_weights <- matrix(rnorm(200), nrow = 100, ncol = 3) +#' rps(y_cont, ypred_cont, log_weights = log_weights) +#' +#' @references +#' Bolin, D. and Wallin, J. (2023). Local scale invariance and robustness of +#' proper scoring rules. *Statistical Science*, 38(1):140–159. +#' +#' Epstein, E. S. (1969). A scoring system for probability forecasts of ranked +#' categories. *Journal of Applied Meteorology*, 8(6):985–987. +#' +#' Gneiting, T. and Raftery, A. E. (2007). Strictly proper scoring rules, +#' prediction, and estimation. *Journal of the American Statistical +#' Association*, 102(477):359–378. +#' +#' Hosking, J. R. M. (1990). L-moments: Analysis and estimation of +#' distributions using linear combinations of order statistics. *Journal of +#' the Royal Statistical Society Series B*, 52(1):105–124. +#' +#' Hosking, J. R. M. (1996). Some theoretical results concerning L-moments. +#' Research report RC 14492. IBM Thomas J. Watson Research Division. +#' +#' Matheson, J. E. and Winkler, R. L. (1976). Scoring rules for continuous +#' probability distributions. *Management Science*, 22(10):1087–1096. +#' +#' Taillardat, M., Mestre, O., Zamo, M., and Naveau, P. (2016). Calibrated +#' ensemble forecasts using quantile regression forests and ensemble model +#' output statistics. *Monthly Weather Review*, 144(6):2375–2393. +#' +#' Zamo, M. and Naveau, P. (2018). Estimation of the continuous ranked +#' probability score with limited information and applications to ensemble +#' weather forecasts. *Mathematical Geosciences*, 50:209–234. +#' +#' @export +rps <- function(y, ypred, log_weights = NULL, scaled = FALSE, revert_sign = FALSE) { + if (is.null(log_weights)) { + EXy <- colMeans(abs(sweep(ypred, 2, y))) + ypred_sorted <- apply(ypred, 2, sort) + S <- nrow(ypred) + EXX <- colMeans(ypred_sorted * ((1:S) * (4 / (S - 1)) - 2)) + if (scaled) { + # Scaled version by Bolin & Wallin (2023) + rps_i <- -EXy/EXX - 0.5 * log(EXX) + } else { + # Gneiting & Raftery (2007) + rps_i <- - 0.5 * EXX + EXy + } + } else { + S <- nrow(ypred) + n <- ncol(ypred) + w <- exp(.normalize_log_weights(log_weights)) + w_csum <- sapply(1:n, function(j) { + perm <- order(ypred[,j]) + result <- numeric(S) + result[perm] <- cumsum(w[perm, j]) + result + }) + + EXX <- 2 * colSums(ypred * w * (1 - (2*(1 - w_csum)) / (1 - w))) + EXy <- colSums(w * abs(sweep(ypred, 2, y))) + if (scaled) { + # Scaled version by Bolin & Wallin (2023) + rps_i <- -EXy/EXX - 0.5 * log(EXX) + } else { + # Gneiting & Raftery (2007) + rps_i <- EXy - 0.5 * EXX + } + } + res <- list( + estimate = mean(rps_i), + se = sqrt(var(rps_i) / length(rps_i)), + pointwise = rps_i + ) + .measure_output(res, revert_sign) +} + +#' Scaled Ranked Probability Score (SRPS, SCRPS) +#' +#' A convenience wrapper around [rps()] with `scaled = TRUE`. Computes the +#' scaled ranked probability score (SRPS) for discrete outcomes or the scaled +#' continuously ranked probability score (SCRPS) for continuous outcomes. +#' Scaling makes the score invariant to the spread of the predictive +#' distribution, which can be useful when comparing models across observations +#' with very different predictive uncertainties. +#' +#' See [rps()] for full details on arguments, estimation, and references. +#' +#' @inheritParams rps +#' +#' @inherit rps return +#' +#' @examples +#' y <- c(2L, 1L, 3L) +#' ypred <- matrix(c(2, 1, 2, 3, 1, 3), nrow = 2) +#' srps(y, ypred) +#' +#' @export +srps <- function(y, ypred, log_weights = NULL, revert_sign = FALSE) { + rps( + y = y, ypred = ypred, log_weights = log_weights, + scaled = TRUE, revert_sign = revert_sign + ) +} + +# measure overview ----------------------------- +# +# Built-in measures are registered in `.measure_spec`. Users can also pass +# custom functions to `measure` in the pred_measure family; those functions +# must set `attr(fun, "measure_name")` and return `estimate`, `se`, and +# `pointwise` (see `?insample_pred_measure`). + +# internal function to get the measure specification +# @noRd +# @param measure The measure used. +# @return The measure specification. +.measure_spec <- list( + elpd = elpd, + mlpd = mlpd, + mae = mae, + r2 = r2, + rmse = rmse, + mse = mse, + acc = acc, + bacc = bacc, + rps = rps, + srps = srps +) + +#' Supported predictive measure names +#' +#' A character vector of measure names that can be passed to the `measure` +#' argument of [insample_pred_measure()], [loo_pred_measure()], +#' [kfold_pred_measure()], [test_pred_measure()], and [pred_measure()]. +#' +#' @export +supported_measures_list <- names(.measure_spec) + +# internal function that produces output format for measures +.measure_output <- function(res, revert_sign) { + if (isTRUE(revert_sign)) { + res$estimate <- -res$estimate + res$pointwise <- -res$pointwise + } + out <- list() + out$estimates <- c(res$estimate, res$se) + names(out$estimates) <- c('Estimate', 'SE') + out$pointwise <- res$pointwise + out +} \ No newline at end of file diff --git a/R/pred_measure.R b/R/pred_measure.R new file mode 100644 index 00000000..44df8691 --- /dev/null +++ b/R/pred_measure.R @@ -0,0 +1,690 @@ +#' Shared parameters for predictive measure wrappers +#' +#' @description +#' Parameter definitions shared by the user-facing entry points and the +#' internal engine [pred_measure_engine()]. +#' +#' @param y Vector of observed values (`n`). Required for distributional and +#' point-prediction measures such as `crps`, `mae`, and `acc`. +#' @param ypred Matrix of posterior predictive draws (`S` draws × `n` +#' observations), typically from [brms::posterior_predict()]. Required for +#' distributional measures such as `crps`, `rps`, and `scrps`. +#' @param mupred Matrix of posterior expected values (`S` × `n`), typically from +#' [brms::posterior_epred()]. Required for point-prediction measures such as +#' `mae`, `rmse`, `r2`, and `acc`. +#' @param ylp Matrix of pointwise log predictive densities or probabilities +#' (`S` × `n`), typically from [brms::log_lik()]. Required for density-based +#' summaries (`elpd`, `mlpd`, `ic`). +#' @param ylp_test Matrix of pointwise log predictive densities for holdout +#' observations (`S` × `n_test`), typically from +#' `brms::log_lik(fit, newdata = test_data)`. Used with `ylp` (from the +#' training fit) in [test_pred_measure()] to score genuinely new data. +#' @param predperf An existing predictive measure object (class +#' `"pred_measure"`) to update. When supplied, base density summaries and +#' (for LOO) PSIS weights are reused instead of recomputed. +#' @param measure Additional measures beyond the base summaries (`elpd` and +#' `ic`, which are always included). Can be: +#' \itemize{ +#' \item A **character vector** of built-in names; see +#' [supported_measures_list()]. +#' \item A **function** with attribute `"measure_name"` for one custom measure. +#' \item A **list** mixing character scalars (built-in names) and named +#' functions (custom measures), e.g. `list("rps", my_metric = my_fun)`. +#' } +#' Custom functions are called with any of `y`, `ypred`, `mupred`, `ylp`, and +#' `log_weights` that appear in their formals, plus arguments from `control`. +#' They must return a list with `estimates` and `pointwise`. +#' @param measure_name For a single custom function, set +#' `attr(my_fun, "measure_name") <- "my_metric"` before passing `my_fun` to +#' `measure`. +#' @param group_ids Optional vector of group identifiers for grouped summaries +#' (reserved; not yet implemented). +#' @param loo A [loo::loo()] result, computed with +#' `save_psis = TRUE` so that PSIS weights are available for additional +#' measures. See [loo_pred_measure()]. +#' @param kfold A `kfold` object from [brms::kfold()]. Supplies ELPD summaries +#' and fold structure for [kfold_pred_measure()]; pass `y`, `ypred`, and/or +#' `mupred` when requesting additional measures. +#' @param psis_object A `psis` object with LOO importance weights. An +#' alternative to passing a full `loo` object; must be supplied together with +#' `ylp` when computing `elpd`. +#' @param save_psis Logical. If `TRUE`, store the `psis` object in the result +#' so that additional measures can be added later with [pred_measure()] without +#' recomputing PSIS weights. +#' @param control Named list of per-measure settings. Each name must match an +#' element of `measure`; the value is a list of arguments passed to that +#' measure's summary function (e.g. `list(new_measure = list(add_arg = 10))`). +#' @param source Character string indicating the evaluation mode: `"insample"`, +#' `"loo"`, `"kfold"`, or `"test"`. Set automatically by the wrapper +#' functions; required when calling [pred_measure_engine()] directly. +#' +#' @keywords internal +#' @name pred_measure_params +pred_measure_engine <- function( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + ylp_test = NULL, + measure = NULL, + predperf = NULL, + loo = NULL, + kfold = NULL, + group_ids = NULL, + psis_object = NULL, + save_psis = FALSE, + source = NULL, + control = list() +) { + # input validation --------------------------------------------------- + checkmate::assert_list(control, types = "list", names = "named") + measures <- .prepare_measures(measure, predperf, supported_measures_list) + + if (source == "loo") { + if (is.null(predperf)) { + if (!is.null(loo) && is.null(loo$psis_object)) { + cli::cli_abort(c( + "No `psis_object` found in `loo` object. Did you run loo(..., save_psis = 'TRUE')." + )) + } + if (is.null(ylp) && !is.null(psis_object)) { + cli::cli_abort(c( + "For computation of `elpd` it is required to pass `ylp` besides `psis_object`." + )) + } + } else { + if (is.null(psis_object) && !is.null(predperf$psis_object)) { + cli::cli_inform("Using psis_object for LOO CV from `predperf`") + psis_object <- predperf$psis_object + } + } + checkmate::assert_null(ylp_test) + } else if (source == "insample") { + checkmate::assert_null(ylp_test) + checkmate::assert_null(psis_object) + } else if (source == "test") { + checkmate::assert_null(psis_object) + } else { # kfold + checkmate::assert_null(psis_object) + } + + # core logic --------------------------------------------------- + if (source == "loo") { + psis_object <- .get_psis_object( + loo = loo, + predperf = predperf, + psis_object = psis_object, + ylp = ylp, + r_eff = 1 + ) + } + + base_measure <- .compute_base_measure( + ylp = ylp, + ylp_test = ylp_test, + loo = loo, + kfold = kfold, + predperf = predperf, + psis_object = psis_object, + source = source + ) + + estimates <- base_measure$estimates + pointwise <- base_measure$pointwise + + log_weights <- if (!is.null(psis_object)) psis_object$log_weights else NULL + + for (entry in measures) { + sel_measure <- .compute_measure( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + measure_entry = entry, + log_weights = log_weights, + control = control + ) + estimates <- .merge_matrix( + source = source, + mat = estimates, + name = entry$name, + values = .measure_estimate_se(sel_measure), + margin = 1 + ) + pointwise <- .merge_matrix( + source = source, + mat = pointwise, + name = entry$name, + values = sel_measure$pointwise, + margin = 2 + ) + } + + predperf_res <- .build_pred_measure( + estimates = estimates, + pointwise = pointwise, + diagnostics = base_measure$diagnostics, + psis_object = psis_object, + save_psis = save_psis + ) + + .add_attributes( + save_psis, + predperf_res, + y, + ypred, + mupred, + ylp, + ylp_test, + kfold, + loo, + predperf, + source + ) + } + +# internal helper functions --------------------------------------------------- + +#' Resolve or compute the PSIS object for LOO scoring +#' +#' @description +#' Selects a PSIS object from the available inputs, or computes one from `ylp` +#' when no precomputed weights are supplied (assuming `r_eff = 1`). +#' +#' Resolution order: +#' \enumerate{ +#' \item If both `psis_object` and `loo` are provided, return `psis_object` +#' after verifying it matches `loo$psis_object`. +#' \item Extract from `loo$psis_object` when `loo` is provided. +#' \item Use the supplied `psis_object`. +#' \item Reuse `predperf$psis_object` when accumulating measures. +#' \item Compute from `ylp` via [loo::psis()] on `-ylp` log ratios. +#' } +#' +#' @param ylp Matrix of pointwise log predictive densities (`S` × `n`). +#' @param loo Optional [loo::loo()] result containing a `psis_object`. +#' @param predperf Optional existing measure object with a stored `psis_object`. +#' @param psis_object Optional precomputed PSIS object. +#' @param r_eff Relative effective sample size passed to [loo::psis()]; +#' default `1`. +#' +#' @return A `psis` object with `log_weights` and `diagnostics`. +#' +#' @note See developer notes on computation of the `psis_object` for details. +#' @noRd +.get_psis_object <- function( + ylp, + loo, + predperf, + psis_object, + r_eff = 1 +) { + # psis_object + loo are both provided -> return psis_object + if (!is.null(psis_object) && !is.null(loo)) { + psis_equal_loo <- all(psis_object == loo$psis_object) + if (!psis_equal_loo) { + cli::cli_abort( + "Provided `psis_object` and `loo$psis_object` are not identical." + ) + } + return(psis_object) + # loo is provided + } else if (!is.null(loo)) { + return(loo$psis_object) + # psis_object is provided + } else if (!is.null(psis_object)) { + return(psis_object) + # predperf with psis_object is provided + } else if (!is.null(predperf$psis_object)) { + return(predperf$psis_object) + # ylp is provided + } else if (is.null(loo) && is.null(psis_object) && !is.null(ylp)) { + cli::cli_inform( + "Compute `psis_object` internally from `ylp` assuming `r_eff = 1`." + ) + log_ratios <- -1 * ylp + return(psis(log_ratios, r_eff = r_eff)) + # nothing is provided + } else { + cli::cli_abort(c( + "psis_object can not be computed, either of `psis_object`, `loo`, or", + "`ylp` needs to be provided." + )) + }} + +#' Compute a single predictive measure +#' +#' @description +#' Dispatches one requested predictive measure to the appropriate summary +#' function. The measure's `family` in `.measure_spec` determines which inputs +#' are passed through: +#' +#' \describe{ +#' \item{`metrics`}{`y` and `mupred` (e.g. MAE, RMSE, accuracy).} +#' \item{`rank_scores`}{`y` and `ypred` (e.g. RPS, CRPS).} +#' \item{`density_scores`}{`ylp` (e.g. ELPD, MLPD).} +#' } +#' +#' @param y Vector of observed values (n). +#' @param ypred Matrix of posterior predictive draws (S × n). +#' @param mupred Matrix of posterior point predictions (S × n). +#' @param ylp Matrix of pointwise log predictive densities (S × n). +#' @param measure_entry A normalized measure entry with elements +#' `name`, `type` (`"builtin"` or `"custom"`), and `key`. +#' @param log_weights Matrix of log-weights (S × n), as returned by +#' `.compute_log_weights()`. +#' @param control Named list of per-measure settings passed from +#' [pred_measure()]; the active slice is `control[[measure_entry$name]]`. +#' +#' @return A named list with three elements: +#' \describe{ +#' \item{`estimate`}{Scalar point estimate for the measure.} +#' \item{`se`}{Scalar standard error for the estimate.} +#' \item{`pointwise`}{Length-`n` vector of observation-level contributions.} +#' } +#' +#' Extract estimate and SE from a measure result +#' +#' Supports `estimate`/`se` (pred_measure functions) and `estimates` (CRPS). +#' +#' @noRd +.measure_estimate_se <- function(res) { + if (!is.null(res$estimates)) { + return(res$estimates) + } + c(res$estimate, res$se) +} + +#' @noRd +.compute_measure <- function( + y, + ypred, + mupred, + ylp, + measure_entry, + log_weights, + control = list() +) { + if (measure_entry$type == "builtin") { + measure_fun <- .measure_spec[[measure_entry$key]] + if (is.null(measure_fun)) { + cli::cli_abort("Unknown built-in measure {.val {measure_entry$key}}.") + } + } else { + measure_fun <- measure_entry$key + } + + measure_control <- control[[measure_entry$name]] + if (is.null(measure_control)) { + measure_control <- list() + } + + pool <- c( + list( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + log_weights = log_weights + ), + measure_control + ) + args <- pool[intersect(names(formals(measure_fun)), names(pool))] + res <- do.call(measure_fun, args) + if (measure_entry$type == "custom") { + n_obs <- .measure_n_obs(y, ypred, mupred, ylp) + res <- .validate_measure_result(res, measure_entry$name, n_obs = n_obs) + } + res +} + +#' Compute base density summaries for a predictive measure object +#' +#' @description +#' Forms the default block of log-density summaries (`elpd`, `ic`, and for LOO +#' `p_loo`) that underlie every [pred_measure()] result. Additional measures +#' requested via `measure` are merged into the returned matrices later. +#' +#' When `predperf` is provided (incremental update), returns `predperf` unchanged. +#' For `source = "kfold"` or `"loo"` with a precomputed object, extracts existing +#' summaries from `kfold` or `loo`. +#' +#' Otherwise ELPD is computed from: +#' \itemize{ +#' \item `insample`: `ylp` directly (in-sample log predictive density). +#' \item `loo`: `ylp` reweighted with PSIS log weights from `psis_object`. +#' \item `test`: `ylp_test` on holdout observations. +#' } +#' +#' `ic` is always \eqn{-2 \times} ELPD on the same scale as LOOIC. For +#' `source = "loo"`, `p_loo` (effective number of parameters) is also included, +#' computed by \code{.compute_effective_param()} as the difference between +#' in-sample and LOO log predictive density; see `p_loo` in [loo::loo()] and the +#' [CV-FAQ on p_loo](https://users.aalto.fi/~ave/CV-FAQ.html#p_loo). +#' +#' @param ylp Matrix of pointwise log predictive densities for training data +#' (`S` × `n`). +#' @param ylp_test Matrix of pointwise log predictive densities for holdout +#' data (`S` × `n_test`; `test` source only). +#' @param loo Optional [loo::loo()] result. When supplied for `source = "loo"`, +#' base summaries are taken from the `loo` object. +#' @param kfold Optional `kfold` object. When supplied for `source = "kfold"`, +#' base summaries are taken from the `kfold` object. +#' @param predperf Existing [pred_measure()] object. When not `NULL`, returned +#' unchanged so base summaries are not recomputed. +#' @param psis_object PSIS object with LOO log weights and diagnostics. +#' @param source Character string; one of `"insample"`, `"loo"`, `"kfold"`, or +#' `"test"`. +#' +#' @return A named list with: +#' \describe{ +#' \item{`estimates`}{Matrix with rows `elpd`, optionally `p_loo` (LOO only), +#' and `ic`, plus a source suffix when applicable (`_loo`, `_kfold`, +#' `_test`).} +#' \item{`pointwise`}{Matrix of observation-level contributions.} +#' \item{`diagnostics`}{From `psis_object$diagnostics` when LOO weights are +#' used; otherwise `NULL` or taken from the input `loo`/`kfold` object.} +#' } +#' +#' @noRd +.compute_base_measure <- function( + ylp, + ylp_test = ylp_test, + loo, + kfold, + predperf, + psis_object, + source +) { + if (!is.null(predperf)) { + return(predperf) + } + if (source == "kfold") { + if ("diagnostics" %in% names(kfold)) { + return(kfold[c("estimates", "pointwise", "diagnostics")]) + } else { + return(kfold[c("estimates", "pointwise")]) + } + } + if (source == "loo" && !is.null(loo)) { + return(loo[c("estimates", "pointwise", "diagnostics")]) + } + + elpd_res <- switch( + source, + insample = elpd(ylp = ylp), + loo = elpd(ylp = ylp, log_weights = psis_object$log_weights), + test = elpd(ylp_test) + ) + if (!is.null(elpd_res$estimates)) { + elpd_res <- list( + estimate = unname(elpd_res$estimates["Estimate"]), + se = unname(elpd_res$estimates["SE"]), + pointwise = elpd_res$pointwise + ) + } + suffix <- if (source == "insample") "" else paste0("_", source) + add_p_eff <- source == "loo" + est_mat <- rbind( + elpd = c(elpd_res$estimate, elpd_res$se), + ic = c(-2 * elpd_res$estimate, 2 * elpd_res$se) + ) + pw_mat <- cbind( + elpd = elpd_res$pointwise, + ic = -2 * elpd_res$pointwise + ) + if (add_p_eff) { + p_eff_res <- .compute_effective_param(ylp, elpd_res$pointwise) + est_mat <- rbind( + est_mat["elpd", , drop = FALSE], + p_eff = c(p_eff_res$estimate, p_eff_res$se), + est_mat["ic", , drop = FALSE] + ) + pw_mat <- cbind( + elpd = pw_mat[, "elpd"], + p_eff = p_eff_res$pointwise, + ic = pw_mat[, "ic"] + ) + } + + rownames(est_mat) <- paste0(rownames(est_mat), suffix) + colnames(est_mat) <- c("Estimate", "SE") + colnames(pw_mat) <- paste0(colnames(pw_mat), suffix) + + list( + estimates = est_mat, + pointwise = pw_mat, + diagnostics = psis_object$diagnostics + ) +} + +# compute the effective number of parameters +#' Compute effective number of parameters (`p_loo`) +#' +#' @description +#' Per-observation effective number of parameters as the difference between +#' the log posterior predictive density (`lpd`) and the cross-validated log +#' predictive density (`elpd`). Summed across observations, this matches +#' `p_loo` from the **loo** package: it describes how much harder it is to +#' predict held-out data than the data used to fit the model. +#' +#' @param ylp Matrix of pointwise log predictive densities (`S` × `n`). +#' @param elpd_cv_i Numeric vector of length `n` with LOO pointwise ELPD +#' contributions. +#' +#' @return A named list with `estimate` (total `p_loo`), `se`, and `pointwise` +#' (`p_loo` per observation). +#' +#' @references See the **loo** package glossary (`vignette("loo2", package = "loo")`) +#' and \url{https://users.aalto.fi/~ave/CV-FAQ.html#p_loo}. +#' +#' @noRd +.compute_effective_param <- function(ylp, elpd_cv_i) { + lpd_i <- matrixStats::colLogSumExps(ylp) - log(nrow(ylp)) + p_eff_i <- lpd_i - elpd_cv_i + + list( + estimate = sum(p_eff_i), + se = sqrt(ncol(ylp) * var(p_eff_i)), + pointwise = p_eff_i + ) +} + +#' Add or update a row or column in a summary matrix +#' +#' @description +#' Builds the `estimates` and `pointwise` matrices used in +#' [pred_measure()] results. When `margin = 1`, appends or updates a **row** +#' (for estimates). When `margin = 2`, appends or updates a **column** +#' (for pointwise). +#' +#' If `mat` is `NULL`, returns a new one-row or one-column matrix for `name`. +#' If `name` is already present along that margin, the update is skipped with a +#' warning. Otherwise the new slice is bound with [rbind()] or [cbind()]. +#' +#' @param source Character string; one of `"loo"`, `"insample"`, `"kfold"`, or +#' `"test"`. +#' @param mat Existing matrix, or `NULL` when adding the first measure. +#' @param name Character label for the row (`margin = 1`) or column +#' (`margin = 2`). +#' @param values Numeric vector to store. For `margin = 1`, length-2 vector +#' `(estimate, se)`; for `margin = 2`, length-`n` pointwise vector. +#' @param margin `1` to merge along rows (estimates table), `2` along columns +#' (pointwise table). +#' +#' @return Updated matrix with `name` as a row or column name. +#' +#' @noRd +.merge_matrix <- function(source, mat, name, values, margin) { + is_row <- margin == 1 + along <- if (is_row) rownames else colnames + bind_fn <- if (is_row) rbind else cbind + name_updated <- switch( + source, + kfold = paste0(name, "_kfold"), + loo = paste0(name, "_loo"), + test = paste0(name, "_test"), + insample = name + ) + + new_slice <- if (is_row) { + matrix(values, nrow = 1, dimnames = list(name_updated, c("Estimate", "SE"))) + } else { + matrix(values, ncol = 1, dimnames = list(NULL, name_updated)) + } + + if (is.null(mat)) return(new_slice) + # && margin == 1 condition ensures that warning is shown only once + if (name_updated %in% along(mat) && margin == 1) { + # TODO: Check whether this behavior is wanted + cli::cli_warn( + c( + "{.field {name_updated}} already present in results. Skipping the update." + ) + ) + return(mat) + } + bind_fn(mat, new_slice) +} + +#' Construct the S3 predictive measure result object +#' +#' @description +#' Wraps computed summaries into the list structure returned by the +#' [pred_measure()] pipeline. S3 classes and attributes are attached later by +#' \code{.add_attributes()}. +#' +#' When `save_psis = TRUE`, the `psis_object` is stored in the result; otherwise +#' that slot is omitted. When a `psis_object` is available, its `log_weights` are +#' copied to the result. +#' +#' @param estimates Matrix of overall estimates and standard errors (rows = +#' measures, columns = `Estimate` and `SE`). +#' @param pointwise Matrix of observation-level contributions (columns = +#' measures). +#' @param diagnostics Optional PSIS or other diagnostic information, or `NULL`. +#' @param psis_object PSIS object with LOO weights and diagnostics, or `NULL`. +#' @param save_psis Logical; if `TRUE`, include `psis_object` in the result. +#' +#' @return A list with elements `estimates`, `pointwise`, and optionally +#' `diagnostics`, `psis_object`, and `log_weights`. Class attributes are added +#' by \code{.add_attributes()}. +#' +#' @noRd +.build_pred_measure <- function( + estimates, + pointwise, + diagnostics = NULL, + psis_object, + save_psis +) { + output_list <- list( + estimates = estimates, + pointwise = pointwise + ) + if (!is.null(diagnostics)) { + output_list$diagnostics <- diagnostics + } + if (isTRUE(save_psis)) { + output_list$psis_object <- psis_object + } + if (!is.null(psis_object)) { + output_list$log_weights <- psis_object$log_weights + } + + structure(output_list) +} + +#' Attach S3 classes and metadata attributes to a result +#' +#' @description +#' Sets `class`, `source`, and `dims` attributes on a predictive measure object. +#' +#' When updating an existing result (`predperf` is not `NULL`), copies attributes +#' from `predperf` and refreshes `dims` from newly supplied input matrices. +#' When `save_psis = FALSE`, clears any stored `psis_object` from the prior +#' result. +#' +#' For new objects, copies relevant attributes from `loo` or `kfold` inputs +#' (e.g. `yhash`, `model_name`, fold structure) and assigns a source-specific +#' subclass (`"insample_pred_measure"`, `"loo_pred_measure"`, etc.). +#' +#' @param save_psis Logical; when `FALSE` and accumulating, clears stored +#' `psis_object` from the prior result. +#' @param predperf_res List returned by \code{.build_pred_measure()}. +#' @param y Vector of observed values; used indirectly via matrix `dims`. +#' @param ypred Matrix of posterior predictive draws; used to set `dims`. +#' @param mupred Matrix of posterior expected values; used to set `dims`. +#' @param ylp Matrix of pointwise log predictive densities; used to set `dims`. +#' @param ylp_test Matrix of holdout log predictive densities; sets `dims` for +#' `source = "test"`. +#' @param kfold Optional `kfold` object whose attributes are inherited. +#' @param loo Optional [loo::loo()] object whose attributes are inherited. +#' @param predperf Existing object when accumulating measures, or `NULL`. +#' @param source Character evaluation mode (`"insample"`, `"loo"`, `"kfold"`, +#' or `"test"`). +#' +#' @return The updated `predperf_res` with class and attributes set. +#' +#' @noRd +.add_attributes <- function(save_psis, predperf_res, y, ypred, mupred, ylp, ylp_test, kfold, loo, predperf, source) { + if (!is.null(predperf)) { + if (isFALSE(save_psis)) { + predperf$psis_object <- NULL + } + attributes(predperf_res) <- attributes(predperf) + + dims <- if (!is.null(ypred)) { + dim(ypred) + } else if (!is.null(mupred)) { + dim(mupred) + } else if (!is.null(ylp)) { + dim(ylp) + } + attr(predperf_res, "dims") <- dims + + return(predperf_res) + } + + predperf_res <- switch( + source, + kfold = .copy_attrs( + predperf_res, + kfold, + setdiff(names(attributes(kfold)), "names") + ), + loo = .copy_attrs( + predperf_res, + loo, + setdiff(names(attributes(loo)), "names") + ), + test = , # fall through (same as insample) + insample = predperf_res + ) + + if (source %in% c("insample", "test") || + (is.null(attr(predperf_res, "dims")) && !is.null(ylp))){ + # make attribute names consistent between pred_measure classes + if (source == "test") { + attr(predperf_res, "dims") <- attr(ylp_test, "dim") + } else { + attr(predperf_res, "dims") <- attr(ylp, "dim") + } + } + + attr(predperf_res, "class") <- c( + "pred_measure", + switch( + source, + loo = "loo_pred_measure", + insample = "insample_pred_measure", + kfold = "kfold_pred_measure", + test = "test_pred_measure", + "pred_measure" + ), + attr(predperf_res, "class") + ) + attr(predperf_res, "source") <- source + + return(predperf_res) +} \ No newline at end of file diff --git a/R/pred_measure_helpers.R b/R/pred_measure_helpers.R new file mode 100644 index 00000000..efb3fac8 --- /dev/null +++ b/R/pred_measure_helpers.R @@ -0,0 +1,420 @@ +#' Normalize the `measure` argument to an internal list +#' +#' @description +#' Converts `measure` (character, function, list, or `NULL`) into a list of +#' entries with elements `name`, `type` (`"builtin"` or `"custom"`), and `key` +#' (built-in name or function). +#' +#' @param measure User-supplied `measure` argument. +#' +#' @return A list of normalized measure entries, or an empty list when +#' `measure` is `NULL`. +#' +#' @noRd +.normalize_measure <- function(measure) { + if (is.null(measure)) { + return(list()) + } + + if (is.function(measure)) { + entries <- list(.measure_entry_custom(measure)) + .check_measure_entry_names(entries) + return(entries) + } + + if (is.character(measure)) { + entries <- lapply(measure, function(nm) { + list(name = nm, type = "builtin", key = nm) + }) + .check_measure_entry_names(entries) + return(entries) + } + + if (is.list(measure)) { + if (length(measure) == 0L) { + return(list()) + } + entries <- lapply(seq_along(measure), function(i) { + el <- measure[[i]] + nm <- names(measure)[i] + if (is.character(el) && length(el) == 1L) { + list(name = el, type = "builtin", key = el) + } else if (is.function(el)) { + if (is.null(nm) || !nzchar(nm)) { + cli::cli_abort(c( + "Each custom function in {.arg measure} must be named.", + "i" = "Use {.code measure = list(my_metric = my_fun)}." + )) + } + list(name = nm, type = "custom", key = el) + } else { + cli::cli_abort(c( + "Each element of {.arg measure} must be a character scalar (built-in", + "name) or a function (custom measure).", + "i" = "Element {i} has type {.cls {class(el)[1]}}." + )) + } + }) + .check_measure_entry_names(entries) + return(entries) + } + + cli::cli_abort(c( + "{.arg measure} must be a character vector, a function, a list, or", + "{.code NULL}.", + "i" = "Got an object of class {.cls {class(measure)[1]}}." + )) +} + +#' Build a custom measure entry from a function +#' +#' @param fun Function implementing a custom measure. +#' @noRd +.measure_entry_custom <- function(fun) { + name <- attr(fun, "measure_name", exact = TRUE) + if (is.null(name) || length(name) != 1L || !nzchar(name)) { + cli::cli_abort(c( + "A custom function passed to {.arg measure} must have attribute", + "{.code measure_name}.", + "i" = "Set {.code attr(my_fun, \"measure_name\") <- \"my_metric\"}." + )) + } + list(name = name, type = "custom", key = fun) +} + +#' Check duplicate and reserved measure names +#' +#' @param entries List of normalized measure entries from `.normalize_measure()`. +#' @noRd +.check_measure_entry_names <- function(entries) { + names <- vapply(entries, `[[`, "", "name") + dups <- names[duplicated(names)] + if (length(dups) > 0L) { + cli::cli_abort(c( + "Duplicate measure names in {.arg measure}: {.val {unique(dups)}}", + "i" = "Each measure may appear only once." + )) + } + invisible(NULL) +} + +#' Normalize, validate, and filter the `measure` argument +#' +#' @description +#' Converts `measure` via `.normalize_measure()`, validates built-in names and +#' custom functions, and drops entries already present in `predperf`. +#' +#' @param measure User-supplied `measure` argument (see `.normalize_measure()`). +#' @param predperf Existing pred_measure object used when accumulating measures. +#' @param supported_measures_list Character vector of allowed built-in names. +#' +#' @return A list of normalized measure entries ready for computation. +#' +#' @noRd +.prepare_measures <- function(measure, predperf, supported_measures_list) { + entries <- .normalize_measure(measure) + if (length(entries) == 0L) { + return(entries) + } + + is_builtin <- vapply(entries, function(e) e$type == "builtin", logical(1L)) + builtin_keys <- vapply(entries[is_builtin], function(e) e$key, character(1L)) + invalid <- setdiff(builtin_keys, supported_measures_list) + if (length(invalid) > 0L) { + cli::cli_abort(c( + "Invalid measure{?s}: {paste(shQuote(invalid), collapse = ', ')}", + "i" = "Built-in measures must be one of:", + " " = "{paste(shQuote(supported_measures_list), collapse = ', ')}" + )) + } + + is_custom <- !is_builtin + if (any(is_custom)) { + for (entry in entries[is_custom]) { + checkmate::assert_function(entry$key, .var.name = "measure") + } + } + + if (!is.null(predperf)) { + existing_measures <- rownames(predperf$estimates) + entry_names <- vapply(entries, function(e) e$name, character(1L)) + dups <- intersect(entry_names, existing_measures) + if (length(dups) > 0L) { + cli::cli_warn(c( + "!" = "{cli::qty(length(dups))} Measure{?s} {.val {dups}} {?is/are}", + "already present in {.arg predperf} and will be skipped." + )) + } + keep <- !vapply( + entries, + function(e) e$name %in% existing_measures, + logical(1L) + ) + entries <- entries[keep] + } + + entries +} + +#' Infer number of observations from measure inputs +#' +#' @noRd +.measure_n_obs <- function(y, ypred, mupred, ylp) { + if (!is.null(y)) { + return(length(y)) + } + if (!is.null(ypred)) { + return(ncol(ypred)) + } + if (!is.null(mupred)) { + return(if (length(dim(mupred)) == 3L) dim(mupred)[2L] else ncol(mupred)) + } + if (!is.null(ylp)) { + return(ncol(ylp)) + } + NULL +} + +#' Validate the return value of a custom measure function +#' +#' @param res Object returned by a custom measure function. +#' @param measure_name Label used in error messages. +#' @param n_obs Expected length of `pointwise`, or `NULL` to skip. +#' +#' @return `res`, invisibly. +#' +#' @noRd +.validate_measure_result <- function(res, measure_name, n_obs = NULL) { + if (!is.list(res)) { + cli::cli_abort(c( + "Custom measure {.val {measure_name}} must return a list.", + "i" = "Got an object of class {.cls {class(res)[1]}}." + )) + } + + if (!is.null(res$estimates)) { + if (!is.numeric(res$estimates) || length(res$estimates) != 2L) { + cli::cli_abort(c( + "{.field estimates} from custom measure {.val {measure_name}} must be", + "a numeric vector of length 2 (estimate and SE)." + )) + } + } else { + missing <- setdiff(c("estimate", "se", "pointwise"), names(res)) + if (length(missing) > 0L) { + cli::cli_abort(c( + "Custom measure {.val {measure_name}} must return a list with", + "{.field estimate}, {.field se}, and {.field pointwise}.", + "x" = "Missing: {.field {missing}}" + )) + } + if (!is.numeric(res$estimate) || length(res$estimate) != 1L) { + cli::cli_abort( + "{.field estimate} from custom measure {.val {measure_name}} must be a numeric scalar." + ) + } + if (!is.numeric(res$se) || length(res$se) != 1L) { + cli::cli_abort( + "{.field se} from custom measure {.val {measure_name}} must be a numeric scalar." + ) + } + } + + if (is.null(res$pointwise)) { + cli::cli_abort( + "Custom measure {.val {measure_name}} must return {.field pointwise}." + ) + } + if (!is.numeric(res$pointwise) || length(res$pointwise) < 1L) { + cli::cli_abort( + "{.field pointwise} from custom measure {.val {measure_name}} must be a numeric vector." + ) + } + if (!is.null(n_obs) && length(res$pointwise) != n_obs) { + cli::cli_abort(c( + "{.field pointwise} from custom measure {.val {measure_name}} must have", + "length {.val {n_obs}}, not {.val {length(res$pointwise)}}." + )) + } + + invisible(res) +} + +#' Validate a numeric matrix argument +#' +#' @description +#' Checks that `x` is a numeric matrix with at least one row and column. +#' Optionally enforces expected `nrow` and/or `ncol`. Aborts via +#' [cli::cli_abort()] when validation fails. +#' +#' @param x Object to validate. +#' @param arg Name of the argument (used in error messages). +#' @param nrow Expected number of rows, or `NULL` to skip this check. +#' @param ncol Expected number of columns, or `NULL` to skip this check. +#' +#' @return `NULL`, invisibly, on success. +#' +#' @noRd +.validate_numeric_matrix <- function(x, arg, nrow = NULL, ncol = NULL) { + if (!is.matrix(x) || !is.numeric(x)) { + cli::cli_abort("{.arg {arg}} must be a numeric matrix.") + } + if (!is.null(nrow) && nrow(x) != nrow) { + cli::cli_abort( + "{.arg {arg}} must have {.val {nrow}} row{?s}, not {.val {nrow(x)}}." + ) + } + if (!is.null(ncol) && ncol(x) != ncol) { + cli::cli_abort( + "{.arg {arg}} must have {.val {ncol}} column{?s}, not {.val {ncol(x)}}." + ) + } + if (nrow(x) < 1 || ncol(x) < 1) { + cli::cli_abort("{.arg {arg}} must have at least 1 row and 1 column.") + } +} + +#' Validate a numeric vector argument +#' +#' @description +#' Checks that `x` is a numeric atomic vector (not a matrix or array) with +#' length at least one. Optionally enforces an expected `len`. Aborts via +#' [cli::cli_abort()] when validation fails. +#' +#' @param x Object to validate. +#' @param arg Name of the argument (used in error messages). +#' @param len Expected length, or `NULL` to skip this check. +#' +#' @return `NULL`, invisibly, on success. +#' +#' @noRd +.validate_numeric_vector <- function(x, arg, len = NULL) { + if (!is.atomic(x) || !is.numeric(x) || is.matrix(x) || is.array(x)) { + cli::cli_abort("{.arg {arg}} must be a numeric vector.") + } + if (!is.null(len) && length(x) != len) { + cli::cli_abort( + "{.arg {arg}} must have length {.val {len}}, not {.val {length(x)}}." + ) + } + if (length(x) < 1) { + cli::cli_abort("{.arg {arg}} must not be empty.") + } +} + +#' Validate and normalize log weights +#' +#' @description +#' Validates that `log_weights` is a numeric matrix of size `n_draws` by +#' `n_obs`, then column-normalizes it via `.normalize_log_weights()`. +#' +#' @param log_weights Numeric matrix of log weights (`n_draws` \eqn{\times} +#' `n_obs`). +#' @param n_draws Expected number of rows (posterior draws). +#' @param n_obs Expected number of columns (observations). +#' +#' @return Numeric matrix of the same dimensions as `log_weights` with +#' column-normalized log weights. +#' +#' @noRd +.normalize_and_validate_log_weights <- function(log_weights, n_draws, n_obs) { + .validate_numeric_matrix( + log_weights, + arg = "log_weights", + nrow = n_draws, + ncol = n_obs + ) + .normalize_log_weights(log_weights) +} + +#' Inform about ignored inputs when pointwise is supplied +#' +#' @description +#' When `pointwise` is not `NULL`, emits an informative message via +#' [cli::cli_inform()] listing non-`NULL` entries in `ignored_args` that are +#' not used. +#' +#' @param pointwise Optional precomputed pointwise contributions. When +#' `NULL`, no message is emitted. +#' @param ignored_args Named list of arguments that may be ignored (e.g. +#' `ylp`, `log_weights`). +#' @param fun_name Name of the calling function (shown in the message). +#' +#' @return `NULL`, invisibly. +#' +#' @noRd +.inform_ignored_inputs <- function(pointwise, ignored_args, fun_name) { + if (is.null(pointwise)) { + return(invisible(NULL)) + } + supplied <- names(ignored_args)[vapply(ignored_args, Negate(is.null), logical(1))] + if (length(supplied) > 0L) { + cli::cli_inform( + "In {.fn {fun_name}}, {.arg pointwise} is provided; ignoring {.arg {supplied}}." + ) + } + invisible(NULL) +} + +#' Validate probability values +#' +#' @description +#' Checks that all elements of `x` lie in the closed interval \eqn{[0, 1]}. +#' Aborts via [cli::cli_abort()] when any value is out of range. +#' +#' @param x Numeric vector or matrix of probabilities. +#' @param arg Name of the argument (used in error messages). +#' +#' @return `NULL`, invisibly, on success. +#' +#' @noRd +.validate_probs <- function(x, arg) { + if (!all(x >= 0 | x <= 1)) { + cli::cli_abort("{.arg {arg}} must contain values in [0, 1].") + } +} + +#' Copy selected attributes between objects +#' +#' @description +#' Copies attributes named in `which` from `from` onto `to`, overwriting any +#' existing attributes with the same names. +#' +#' @param to Object receiving attributes. +#' @param from Object supplying attributes. +#' @param which Character vector of attribute names to copy. +#' +#' @return `to`, with updated attributes. +#' +#' @noRd +.copy_attrs <- function(to, from, which) { + for (nm in which) { + attr(to, nm) <- attr(from, nm) + } + to +} + +#' Normalize log weights +#' +#' @description +#' Normalizes a matrix of log weights column-wise so that the weights in each +#' column sum to one on the probability scale. Normalization is performed by +#' subtracting the log-sum-exp of each column from its elements, equivalent to +#' dividing each column's weights by their sum on the probability scale. +#' +#' @param log_weights Numeric matrix of log weights (`n_draws` \eqn{\times} +#' `n_obs`), where rows are draws and columns are observations. +#' +#' @return Numeric matrix of the same dimensions as `log_weights` with +#' column-normalized log weights. +#' +#' @noRd +.normalize_log_weights <- function(log_weights) { + sweep( + log_weights, + 2, + matrixStats::colLogSumExps(log_weights), + FUN = "-", + check.margin = FALSE + ) +} diff --git a/R/pred_measure_wrappers.R b/R/pred_measure_wrappers.R new file mode 100644 index 00000000..dd5096c2 --- /dev/null +++ b/R/pred_measure_wrappers.R @@ -0,0 +1,475 @@ +#' In-sample predictive performance measures +#' +#' @description +#' Compute predictive performance measures on the same data used to fit the +#' model. This is the simplest entry point when you want density scores +#' (`elpd`, `ic`) and optional distributional or point-prediction metrics in +#' one call. +#' +#' In-sample `elpd` sums the expected log pointwise predictive density (ELPD) +#' over the training observations. Because the model has already seen these +#' data, in-sample scores are **optimistically biased** for predicting future +#' or otherwise unseen observations. For out-of-sample performance, use +#' [loo_pred_measure()], [kfold_pred_measure()], or [test_pred_measure()] +#' instead; see Vehtari's +#' [Cross-validation FAQ](https://users.aalto.fi/~ave/CV-FAQ.html) (Section 4). +#' +#' @inheritParams pred_measure_params +#' +#' @return +#' An object of class `"insample_pred_measure"` and `"pred_measure"`: a list +#' with: +#' \describe{ +#' \item{`estimates`}{Matrix of summary estimates and standard errors (rows +#' are measures, columns are `Estimate` and `SE`). Base rows `elpd` and +#' `ic` are always present; `ic` equals \eqn{-2 \times} `elpd`.} +#' \item{`pointwise`}{Matrix of observation-level contributions (one column +#' per measure).} +#' } +#' +#' The attribute `source` is `"insample"`. Attribute `dims` gives posterior +#' draws × observations. Use [print()] for a readable summary table. +#' +#' @details +#' **Input requirements by measure.** Supply only the inputs each measure +#' needs: +#' +#' | Measure | `ylp` | `y` | `ypred` | `mupred` | +#' |:---|:---:|:---:|:---:|:---:| +#' | `elpd`, `mlpd`, `ic` | ✓ | | | | +#' | `crps`, `scrps`, `rps`, `srps` | | ✓ | ✓ | | +#' | `acc`, `bacc` | | ✓ | | ✓ | +#' | `mae`, `mse`, `rmse`, `r2` | | ✓ | | ✓ | +#' +#' Base measures (`elpd`, `ic`) are always computed when `ylp` is provided. +#' Pass additional built-in names via `measure`, or supply a custom function; +#' see [supported_measures_list] and `vignette("overview-measures")` for +#' definitions and orientation (higher vs lower is better). +#' +#' **Custom measures.** A function passed to `measure` must have attribute +#' `measure_name` and return `estimate`, `se`, and `pointwise`. Only arguments +#' declared in the function signature among `y`, `ypred`, `mupred`, `ylp`, and +#' `log_weights` are supplied automatically. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("brms", quietly = TRUE)) { +#' fit <- brms::brm( +#' Reaction ~ Days, data = lme4::sleepstudy, +#' refresh = 0, chains = 2, iter = 1000 +#' ) +#' insample_pred_measure( +#' ylp = brms::log_lik(fit), +#' y = fit$data$Reaction, +#' ypred = brms::posterior_predict(fit), +#' mupred = brms::posterior_epred(fit), +#' measure = c("rmse", "r2") +#' ) +#' } +#' } +#' \dontrun{ +#' # Custom measure (same contract as built-in point-prediction metrics) +#' my_abs_err <- function(y, mupred, log_weights = NULL) { +#' mu <- colMeans(mupred) +#' pw <- abs(y - mu) +#' list( +#' estimate = mean(pw), +#' se = sd(pw) / sqrt(length(pw)), +#' pointwise = pw +#' ) +#' } +#' attr(my_abs_err, "measure_name") <- "my_abs_err" +#' # insample_pred_measure(y = y, mupred = mupred, ylp = ylp, measure = my_abs_err) +#' } +#' +#' @seealso [pred_measure()] to add measures incrementally, +#' [loo_pred_measure()], [kfold_pred_measure()], [test_pred_measure()], +#' [supported_measures_list], +#' `vignette("pred_measure_tutorial")` +#' +#' @export +insample_pred_measure <- function( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + group_ids = NULL, + save_psis = FALSE, + control = list() +) { + pred_measure_engine( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + measure = measure, + predperf = NULL, + loo = NULL, + kfold = NULL, + group_ids = group_ids, + psis_object = NULL, + save_psis = save_psis, + source = "insample", + control = control + ) +} + +#' PSIS-LOO predictive performance measures +#' +#' @description +#' Estimate out-of-sample predictive performance with **PSIS-LOO** +#' (Pareto-smoothed importance sampling leave-one-out cross-validation). +#' PSIS-LOO approximates exact LOO-CV without refitting the model once per +#' observation: each held-out point is scored by reweighting the full-data +#' posterior draws. +#' +#' The primary summary is `elpd_loo`, the LOO estimate of expected log +#' pointwise predictive density (ELPD). Additional rows include `ic_loo` +#' (\eqn{-2 \times} `elpd_loo`) and `p_loo` (effective number of parameters, +#' the difference between in-sample and LOO log predictive density). See +#' [loo::loo()] and the +#' [Cross-validation FAQ](https://users.aalto.fi/~ave/CV-FAQ.html) for +#' interpretation. +#' +#' @inheritParams pred_measure_params +#' +#' @return +#' An object of class `"loo_pred_measure"`, `"pred_measure"`, and (when a +#' `loo` object is supplied) `"loo"`. In addition to `estimates` and +#' `pointwise`, the list may contain: +#' \describe{ +#' \item{`diagnostics`}{PSIS diagnostics, including Pareto \eqn{\hat{k}} in +#' `diagnostics$pareto_k`. Values above 0.7 suggest unreliable +#' LOO estimates for those observations.} +#' \item{`log_weights`}{Normalized log importance weights used for LOO +#' scoring.} +#' \item{`psis_object`}{Stored when `save_psis = TRUE`; needed to add +#' further measures with [pred_measure()] without recomputing weights.} +#' } +#' +#' Measure names carry a `_loo` suffix (e.g. `elpd_loo`, `crps_loo`). +#' +#' @details +#' **Three equivalent input patterns:** +#' +#' \describe{ +#' \item{Precomputed `loo` object}{`loo_pred_measure(loo = loo_fit, ...)`. +#' Run [loo::loo()] with `save_psis = TRUE`.} +#' \item{`ylp` + `psis_object`}{Pass both when you have already computed +#' PSIS weights separately.} +#' \item{`ylp` only}{PSIS weights are computed internally from `ylp`.} +#' } +#' +#' For distributional and point-prediction measures (`crps`, `r2`, etc.), +#' supply `y`, `ypred`, and/or `mupred` as for [insample_pred_measure()]. When +#' adding measures incrementally, call [pred_measure()] with `predperf` set to +#' an existing result; use `save_psis = TRUE` on the initial call so weights +#' are stored. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("brms", quietly = TRUE)) { +#' fit <- brms::brm( +#' Reaction ~ Days, data = lme4::sleepstudy, +#' refresh = 0, chains = 2, iter = 1000 +#' ) +#' loo_fit <- loo::loo(fit, save_psis = TRUE) +#' loo_pred_measure( +#' loo = loo_fit, +#' y = fit$data$Reaction, +#' ypred = brms::posterior_predict(fit), +#' measure = c("rmse", "r2") +#' ) +#' } +#' } +#' +#' @seealso [insample_pred_measure()], [pred_measure()], [loo::loo()], +#' [supported_measures_list], +#' `vignette("pred_measure_tutorial")` +#' +#' @export +loo_pred_measure <- function( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + loo = NULL, + group_ids = NULL, + psis_object = NULL, + save_psis = FALSE, + control = list() +) { + pred_measure_engine( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + ylp_test = NULL, + measure = measure, + predperf = NULL, + loo = loo, + kfold = NULL, + group_ids = group_ids, + psis_object = psis_object, + save_psis = save_psis, + source = "loo", + control = control + ) +} + +#' K-fold cross-validation predictive performance measures +#' +#' @description +#' Compute predictive performance measures under **k-fold cross-validation**. +#' K-fold CV holds out groups of observations, refits (or reuses stored fits), +#' and scores the held-out folds. +#' +#' Pass a `kfold` object from [brms::kfold()] (with `save_fits = TRUE` when +#' you need posterior predictions on held-out folds). Base density summaries +#' (`elpd_kfold`, `ic_kfold`, `p_kfold`) come from the `kfold` object; +#' additional measures require the same optional inputs as +#' [insample_pred_measure()]. +#' +#' @inheritParams pred_measure_params +#' +#' @return +#' An object of class `"kfold_pred_measure"` and `"pred_measure"`, inheriting +#' attributes from the `kfold` object (`K`, `folds`, `fold_type`, etc.). The +#' list contains `estimates` and `pointwise`; measure names carry a `_kfold` +#' suffix (e.g. `elpd_kfold`, `crps_kfold`). +#' +#' @details +#' For distributional measures on held-out folds, obtain posterior predictions +#' with `brms::kfold_predict()` and pass the resulting `yrep` matrices as +#' `ypred` and/or `mupred`. See the sleep-study workflow in +#' `vignette("pred_measure_tutorial")`. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("brms", quietly = TRUE)) { +#' fit <- brms::brm( +#' Reaction ~ Days, data = lme4::sleepstudy, +#' refresh = 0, chains = 2, iter = 1000 +#' ) +#' kf <- brms::kfold(fit, K = 5, save_fits = TRUE) +#' ypred_kf <- brms::kfold_predict(kf, method = "predict")$yrep +#' kfold_pred_measure( +#' y = fit$data$Reaction, +#' ypred = ypred_kf, +#' kfold = kf, +#' measure = "rmse" +#' ) +#' } +#' } +#' +#' @seealso [loo_pred_measure()], [insample_pred_measure()], [pred_measure()], +#' [brms::kfold()], [supported_measures_list], +#' `vignette("pred_measure_tutorial")` +#' +#' @export +kfold_pred_measure <- function( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + kfold = NULL, + group_ids = NULL, + control = list() +) { + pred_measure_engine( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + ylp_test = NULL, + measure = measure, + predperf = NULL, + loo = NULL, + kfold = kfold, + group_ids = group_ids, + psis_object = NULL, + save_psis = FALSE, + source = "kfold", + control = control + ) +} + +#' Holdout predictive performance measures +#' +#' @description +#' Score predictive performance on **genuinely new (holdout) data** that was +#' not used to fit the model. This mirrors the cross-validation goal of +#' assessing how well a model predicts unseen observations, but with an +#' explicit train/test split rather than LOO or k-fold reweighting. +#' +#' Supply `ylp` from the **training** fit and `ylp_test` from log predictive +#' densities evaluated on the holdout set (e.g. +#' `brms::log_lik(fit, newdata = test_data)`). Optional distributional and +#' point-prediction measures use observed and predicted values on the test set +#' only. +#' +#' @inheritParams pred_measure_params +#' +#' @return +#' An object of class `"test_pred_measure"` and `"pred_measure"` with +#' `estimates` and `pointwise`. Measure names carry a `_test` suffix (e.g. +#' `elpd_test`, `crps_test`). Attribute `dims` reflects the test-set size +#' (from `ylp_test`), not the training data. +#' +#' @details +#' When both `ylp` (training) and `ylp_test` (holdout) are provided, the +#' effective number of parameters (`p_test`) compares in-sample and holdout log +#' predictive density, analogous to `p_loo` for PSIS-LOO. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("brms", quietly = TRUE)) { +#' data <- lme4::sleepstudy +#' train <- data[1:150, ] +#' test <- data[151:nrow(data), ] +#' fit <- brms::brm( +#' Reaction ~ Days, data = train, +#' refresh = 0, chains = 2, iter = 1000 +#' ) +#' test_pred_measure( +#' y = test$Reaction, +#' ypred = brms::posterior_predict(fit, newdata = test), +#' mupred = brms::posterior_epred(fit, newdata = test), +#' ylp = brms::log_lik(fit), +#' ylp_test = brms::log_lik(fit, newdata = test), +#' measure = c("rmse", "r2") +#' ) +#' } +#' } +#' +#' @seealso [insample_pred_measure()], [loo_pred_measure()], +#' [kfold_pred_measure()], [pred_measure()], [supported_measures_list], +#' `vignette("pred_measure_tutorial")` +#' +#' @export +test_pred_measure <- function( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + ylp_test = NULL, + measure = NULL, + group_ids = NULL, + control = list() +) { + pred_measure_engine( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + ylp_test = ylp_test, + measure = measure, + predperf = NULL, + loo = NULL, + kfold = NULL, + group_ids = group_ids, + psis_object = NULL, + save_psis = FALSE, + source = "test", + control = control + ) +} + +#' Add predictive performance measures to an existing result +#' +#' @description +#' Extend a `"pred_measure"` object with additional measures **without +#' recomputing** what is already stored. Use this for interactive exploration +#' or when you first compute base density summaries and later add distributional +#' or point-prediction metrics. +#' +#' Pass the existing object as `predperf` and supply any inputs newly required +#' by the requested measures (see the input table in +#' [insample_pred_measure()]). The evaluation mode (`"insample"`, `"loo"`, +#' `"kfold"`, or `"test"`) is taken from `predperf`; LOO paths reuse stored +#' PSIS weights when available. +#' +#' @inheritParams pred_measure_params +#' +#' @return +#' An updated object of the same class as `predperf`, with new rows in +#' `estimates` and columns in `pointwise` for each requested measure. Base +#' summaries (`elpd`, `ic`, and LOO/k-fold complexity terms) are not +#' recomputed. +#' +#' @details +#' **Typical workflow:** +#' +#' \preformatted{ +#' result <- loo_pred_measure(loo = loo_fit, save_psis = TRUE) +#' pred_measure( +#' y = y, +#' mupred = mupred, +#' predperf = result, +#' measure = c("rmse", "r2") +#' ) +#' } +#' +#' When extending a LOO result, ensure the initial call used `save_psis = TRUE` +#' (or that `predperf` already contains a `psis_object`) so LOO weights are +#' available for additional measures. +#' +#' @examples +#' \donttest{ +#' if (requireNamespace("brms", quietly = TRUE)) { +#' fit <- brms::brm( +#' Reaction ~ Days, data = lme4::sleepstudy, +#' refresh = 0, chains = 2, iter = 1000 +#' ) +#' result <- insample_pred_measure( +#' ylp = brms::log_lik(fit), +#' y = fit$data$Reaction, +#' ypred = brms::posterior_predict(fit), +#' measure = "rmse" +#' ) +#' pred_measure( +#' y = fit$data$Reaction, +#' mupred = brms::posterior_epred(fit), +#' predperf = result, +#' measure = "r2" +#' ) +#' } +#' } +#' +#' @seealso [insample_pred_measure()], [loo_pred_measure()], +#' [kfold_pred_measure()], [test_pred_measure()], [supported_measures_list], +#' `vignette("pred_measure_tutorial", package = "pred_measures")` +#' +#' @export +pred_measure <- function( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + predperf, + group_ids = NULL, + psis_object = NULL, + save_psis = FALSE, + control = list() +) { + pred_measure_engine( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + ylp_test = NULL, + measure = measure, + predperf = predperf, + loo = NULL, + kfold = NULL, + group_ids = group_ids, + psis_object = psis_object, + save_psis = save_psis, + source = attr(predperf, "source"), + control = control + ) +} diff --git a/R/print.R b/R/print.R index 6a3f2139..808814a4 100644 --- a/R/print.R +++ b/R/print.R @@ -247,3 +247,107 @@ convert_old_object <- function(x, digits = 1, ...) { ses <- grepl("se", nms) list(estimates = data.frame(Estimate = uz[!ses], SE = uz[ses])) } + +# print.R: S3 print methods for predictive measure objects and source +# labeling helpers. + + +#' @export +print.pred_measure <- function(x, digits = 1, ...) { + # TODO: ppl should be able to choose what is printed + dims <- attr(x, "dims") + if (is.null(dims) && !is.null(x$log_weights)) { + dims <- dim(x$log_weights) + } + source <- .pred_measure_source_label(x) + + cat("\n") + if (!is.null(dims) && length(dims) == 2) { + cat( + sprintf( + "Computed from %s posterior draws and %s observations.\n", + dims[1], + dims[2] + ) + ) + } + cat(sprintf("Data source: %s\n\n", source)) + print( + format(round(as.data.frame(x$estimates), digits), nsmall = digits), + quote = FALSE + ) + invisible(x) +} + +#' @export +print.kfold_pred_measure <- function(x, digits = 1, ...) { + print.pred_measure(x, digits = digits, ...) + if (!is.null(x$metadata$fold_id)) { + cat(sprintf("Folds: %s\n", length(unique(x$metadata$fold_id)))) + } + invisible(x) +} + +#' @export +print.loo_pred_measure <- function(x, digits = 1, plot_k = FALSE, ...) { + print.pred_measure(x, digits = digits, ...) + cat("------\n") + pareto_k <- x$diagnostics$pareto_k + if (is.null(pareto_k)) { + cat("No Pareto-k diagnostics available.\n") + return(invisible(x)) + } + + n <- length(pareto_k) + labels <- c("good", "bad", "very bad") + ranges <- c("k <= 0.7", "0.7 < k <= 1", "k > 1") + bins <- cut( + pareto_k, + breaks = c(-Inf, 0.7, 1, Inf), + labels = labels, + include.lowest = TRUE + ) + counts <- tabulate(as.integer(bins[!is.na(bins)]), nbins = length(labels)) + + cat("Pareto k diagnostic values:\n") + for (i in seq_along(labels)) { + cat(sprintf( + " %s (%s): %d (%.1f%%)\n", + labels[i], + ranges[i], + counts[i], + 100 * counts[i] / n + )) + } + + if (plot_k) { + graphics::plot( + pareto_k, + ylab = "Pareto-k", + xlab = "Observation", + main = "Pareto-k diagnostics", + pch = 16 + ) + } + invisible(x) +} + +.pred_measure_source_label <- function(x) { + cls <- class(x) + if ("loo_pred_measure" %in% cls) { + return("loo") + } + if ("insample_pred_measure" %in% cls) { + return("in-sample") + } + if ("kfold_pred_measure" %in% cls) { + return("k-fold") + } + if ("test_pred_measure" %in% cls) { + return("test") + } + if (!is.null(x$metadata$source)) { + return(as.character(x$metadata$source)) + } + "unknown" +} diff --git a/developer-notes.md b/developer-notes.md new file mode 100644 index 00000000..8dffa755 --- /dev/null +++ b/developer-notes.md @@ -0,0 +1,126 @@ +# Developer Notes: `loo` Refactoring & `pred_measure` Feature + +> **Status:** In Progress +> **Branch:** `pred_measure` +> **Related PR / Issue:** `https://github.com/stan-dev/loo/pull/363` +> **Contributors:** @florence-bockting, @avehtari, @VisruthSK, @jgabry +> **Last updated:** 2026-06-03 + +These notes document the internal discussions and ongoing progress on the `loo` +refactoring and the addition of `pred_measure` features. They serve as a +transparent, shared reference for all developers involved, covering the current +status, open tasks, resolved items, and outstanding decisions. + +## Status at a Glance + +| Area | Status | +|---|---| +| Refactoring | In Progress | +| Scores & Metrics Implementation | In Progress | +| Documentation | In Progress | +| Grouping via `group_ids` | Not Started | + +## Open Decisions + +*These questions need to be resolved before or alongside implementation. +Discuss, record the agreed outcome, and update the status below.* + +### D1: Sign convention for pointwise estimates + ++ **Context:** Some measures have opposite orientations (e.g., `rps`: lower is +better; `srps`: higher is better). It may be useful to align measures of +different orientations so that comparisons are meaningful. ++ **Question:** Should we allow the sign of pointwise estimates to be reversed +via a user-facing argument? ++ **Options under consideration:** + - `lower_is_better = TRUE / FALSE` + - `orientation = "utility" / "loss"` + - `revert_sign = TRUE / FALSE` ++ **Decision:** *pending* + +### D2: Scaled `rps`: separate function vs. argument ++ **Context:** We consider to use `rps` as a unified score for continuous and +ordered categorical inputs, subsuming `crps`, `scrps`, `srps`, and `rps`. +As they can use the same underlying implementation (see implementation shared +by Aki; contact Aki or Flo) ++ **Question:** Do we need an explicit `srps` function, or is a `scale = TRUE / +FALSE` argument inside `rps` sufficient? ++ **Decision:** Yes. We need `srps` for easy user-input of measures in `pred_measure`. Example: +``` +pred_measure( + y = y, ypred = ypred, measure = c("rps", "srps"), ... +) +``` +Without having the scaled version as explicit form, we would have to create a custom measure with a different name otherwise we would have duplicate names. + +### D3: Handling of `r_eff` ++ **Context:** There is an open upstream discussion on `r_eff` handling in +`posterior` by Aki: +[stan-dev/posterior#446](https://github.com/stan-dev/posterior/issues/446). ++ **Question:** How should `r_eff` be handled in this package? ++ **Decision:** *pending* + + + +## Tasks + +### Refactoring + +- [ ] Remove the "old" `crps` implementation that requires two independent + `ypred` inputs; retain only the new ECDF-based implementation. +- [ ] Align the currently two existing `elpd` implementations +(old and new implementation). +- [ ] Move `ic` out of "base measures"; it should not be included + automatically but must be explicitly requested by the user. +- [ ] Provide an interface to `loo_compare` and verify consistency with it. +- [ ] Resolve handling of `r_eff` *(see Decision D3 above)*. + +### Implementation of Scores & Metrics + +- [ ] Consolidate `crps`, `rps`, `scrps`, and `srps` into a unified `rps` + function usable for continuous and ordered categorical inputs (including + scaled variants). + - [ ] Use the implementation shared by Aki with Flo. + - [ ] Verify consistency with Seth's derivations. + - [ ] Resolve the `srps` question *(see Decision D2 above)*. +- [ ] Address the `kfold_pred_measure()` limitation with families that return + 3D objects (e.g., categorical models produce an `S × N × K` array from + `brms::kfold_predict()`): + - [x] Open a corresponding issue in `brms`: [Issue#1889](https://github.com/paul-buerkner/brms/issues/1889) + + Created also a corresponding [PR#1890](https://github.com/paul-buerkner/brms/pull/1890) + - [ ] Until the `brms` issue is resolved: document which families are not + yet supported and return a `"not implemented"` error with a pointer to + the `brms` issue. + +### Documentation + +- [ ] Write detailed descriptions (including derivations where appropriate) + for individual measures. +- [ ] Write overview document with short descriptions of all measures > + [`vignettes/articles-online-only/pred_measure-formulas.Rmd`] +- [ ] Write overview document explaining the new `pred_measure` family > + [`vignettes/articles-online-only/pred_measure-family.Rmd`] +- [ ] Ensure overview documents are published as online-only articles. +- [ ] Extend glossary with key terminology (measure, metric, score, utility, + loss, ...) > [`R/loo-glossary.R`] + +### Grouping via `group_ids` + +- [ ] Work out the different grouping scenarios for `loo` and `kfold`. +- [ ] Add `group_ids` support to `kfold_pred_measure()`, + `loo_pred_measure()`, and `pred_measure()`. + + > **Blocker:** The implementation approach is still uncertain. The + > grouping scenarios must be fully understood before any implementation + > begins. + + + +## References & Resources + +| Resource | Link / Contact | +|---|---| +| Vignette: `kfold` and test data with `loo` | | +| Discussion: `r_eff` handling in `posterior` (Aki) | | +| `rps` derivation by Seth | Contact @florence-bockting or @avehtari | +| `rps` implementation by Aki | Contact @florence-bockting or @avehtari | \ No newline at end of file diff --git a/man/acc.Rd b/man/acc.Rd new file mode 100644 index 00000000..e64bab92 --- /dev/null +++ b/man/acc.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{acc} +\alias{acc} +\title{Classification accuracy (\code{acc})} +\usage{ +acc(y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{An integer vector of observed class labels.} + +\item{mupred}{A numeric array of posterior predictive means. For binary +outcomes use a draws x observations matrix. For multiclass outcomes use a +draws x observations x categories array.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise accuracy +contributions. If provided, \code{y}, \code{mupred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (mean accuracy), \code{se} +(standard error), and \code{pointwise} (pointwise accuracy). +} +\description{ +Computes pointwise and average classification accuracy for binary or +multiclass outcomes from posterior predictive class assignments. For binary +outcomes, each draw is thresholded at 0.5. For multiclass outcomes, each +draw is mapped to the most likely category via \code{which.max()}. +} +\examples{ +y <- c(1L, 0L, 1L) +mupred <- matrix(c(0.8, 0.3, 0.7, 0.6, 0.4, 0.9), nrow = 2) +loo::acc(y, mupred) +} diff --git a/man/bacc.Rd b/man/bacc.Rd new file mode 100644 index 00000000..115fd3df --- /dev/null +++ b/man/bacc.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{bacc} +\alias{bacc} +\title{Balanced classification accuracy (\code{bacc})} +\usage{ +bacc(y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{An integer vector of observed class labels.} + +\item{mupred}{A numeric array of posterior predictive means. For binary +outcomes use a draws x observations matrix. For multiclass outcomes use a +draws x observations x categories array.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise accuracy +contributions. If provided, \code{y}, \code{mupred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (balanced accuracy), \code{se} +(standard error based on balanced pointwise contributions), and +\code{pointwise} (pointwise balanced accuracy terms). +} +\description{ +Computes balanced accuracy by averaging class-specific mean accuracy, giving +each observed class equal weight regardless of class frequency. +} +\examples{ +y <- c(1L, 1L, 2L, 2L) +mupred <- array( + c(0.8, 0.2, 0.7, 0.3, 0.3, 0.7, 0.2, 0.8), + dim = c(1, 4, 2) +) +bacc(y, mupred) +} diff --git a/man/brier.Rd b/man/brier.Rd new file mode 100644 index 00000000..9b6a3004 --- /dev/null +++ b/man/brier.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{brier} +\alias{brier} +\title{Brier score (\code{brier})} +\usage{ +brier(y, ypred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of binary outcomes coded as 0 or 1.} + +\item{ypred}{A numeric matrix of posterior predictive probabilities with +dimensions draws x observations.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise Brier +scores. If provided, \code{y}, \code{ypred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (mean Brier score), \code{se} +(standard error), and \code{pointwise} (pointwise brier score). +} +\description{ +Computes the Brier score for binary outcomes as squared error between the +observed label and predicted event probability. +} +\examples{ +y <- c(1, 0, 1) +ypred <- matrix(c(0.8, 0.2, 0.7, 0.9, 0.4, 0.6), nrow = 2) +brier(y, ypred) +} diff --git a/man/crps.Rd b/man/crps.Rd deleted file mode 100644 index 72f848df..00000000 --- a/man/crps.Rd +++ /dev/null @@ -1,123 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/crps.R -\name{crps} -\alias{crps} -\alias{scrps} -\alias{loo_crps} -\alias{loo_scrps} -\alias{crps.matrix} -\alias{crps.numeric} -\alias{loo_crps.matrix} -\alias{scrps.matrix} -\alias{scrps.numeric} -\alias{loo_scrps.matrix} -\title{Continuously ranked probability score} -\usage{ -crps(x, ...) - -scrps(x, ...) - -loo_crps(x, ...) - -loo_scrps(x, ...) - -\method{crps}{matrix}(x, x2, y, ..., permutations = 1) - -\method{crps}{numeric}(x, x2, y, ..., permutations = 1) - -\method{loo_crps}{matrix}( - x, - x2, - y, - log_lik, - ..., - permutations = 1, - r_eff = 1, - cores = getOption("mc.cores", 1) -) - -\method{scrps}{matrix}(x, x2, y, ..., permutations = 1) - -\method{scrps}{numeric}(x, x2, y, ..., permutations = 1) - -\method{loo_scrps}{matrix}( - x, - x2, - y, - log_lik, - ..., - permutations = 1, - r_eff = 1, - cores = getOption("mc.cores", 1) -) -} -\arguments{ -\item{x}{A \code{S} by \code{N} matrix (draws by observations), or a vector of length -\code{S} when only single observation is provided in \code{y}.} - -\item{...}{Passed on to \code{\link[=E_loo]{E_loo()}} in the \verb{loo_*()} version of these -functions.} - -\item{x2}{Independent draws from the same distribution as draws in \code{x}. -Should be of the identical dimension.} - -\item{y}{A vector of observations or a single value.} - -\item{permutations}{An integer, with default value of 1, specifying how many -times the expected value of |X - X'| (\verb{|x - x2|}) is computed. The row -order of \code{x2} is shuffled as elements \code{x} and \code{x2} are typically drawn -given the same values of parameters. This happens, e.g., when one calls -\code{posterior_predict()} twice for a fitted \pkg{rstanarm} or \pkg{brms} -model. Generating more permutations is expected to decrease the variance of -the computed expected value.} - -\item{log_lik}{A log-likelihood matrix the same size as \code{x}.} - -\item{r_eff}{An optional vector of relative effective sample size estimates -containing one element per observation. See \code{\link[=psis]{psis()}} for details.} - -\item{cores}{The number of cores to use for parallelization of \verb{[psis()]}. -See \code{\link[=psis]{psis()}} for details.} -} -\value{ -A list containing two elements: \code{estimates} and \code{pointwise}. -The former reports estimator and standard error and latter the pointwise -values. Following Bolin & Wallin (2023), a larger value is better. -} -\description{ -The \code{crps()} and \code{scrps()} functions and their \verb{loo_*()} counterparts can be -used to compute the continuously ranked probability score (CRPS) and scaled -CRPS (SCRPS) (as defined by Bolin and Wallin, 2023). CRPS is a proper scoring rule, and -strictly proper when the first moment of the predictive distribution is -finite. Both can be expressed in terms of samples form the predictive -distribution. See, for example, a paper by Gneiting and Raftery (2007) -for a comprehensive discussion on CRPS. -} -\details{ -To compute (S)CRPS, the user needs to provide two sets of draws, \code{x} and -\code{x2}, from the predictive distribution. This is due to the fact that formulas -used to compute CRPS involve an expectation of the absolute difference of \code{x} -and \code{x2}, both having the same distribution. See the \code{permutations} argument, -as well as Gneiting and Raftery (2007) for details. -} -\examples{ -\dontrun{ -# An example using rstanarm -library(rstanarm) -data("kidiq") -fit <- stan_glm(kid_score ~ mom_hs + mom_iq, data = kidiq) -ypred1 <- posterior_predict(fit) -ypred2 <- posterior_predict(fit) -crps(ypred1, ypred2, y = fit$y) -loo_crps(ypred1, ypred2, y = fit$y, log_lik = log_lik(fit)) -} - -} -\references{ -Bolin, D., & Wallin, J. (2023). Local scale invariance and robustness of -proper scoring rules. Statistical Science, 38(1):140-159. - -Gneiting, T., & Raftery, A. E. (2007). Strictly Proper Scoring Rules, -Prediction, and Estimation. Journal of the American Statistical Association, -102(477), 359–378. -} diff --git a/man/elpd.Rd b/man/elpd.Rd index 35bde6eb..5ea3fc3e 100644 --- a/man/elpd.Rd +++ b/man/elpd.Rd @@ -1,49 +1,72 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/elpd.R +% Please edit documentation in R/elpd.R, R/measures.R \name{elpd} \alias{elpd} \alias{elpd.array} \alias{elpd.matrix} \title{Generic (expected) log-predictive density} \usage{ -elpd(x, ...) +elpd(ylp, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) -\method{elpd}{array}(x, ...) +elpd.array(x, ...) -\method{elpd}{matrix}(x, ...) +elpd.matrix(x, ...) + +elpd(ylp, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) } \arguments{ +\item{ylp}{A numeric matrix of log predictive densities/probabilities with +dimensions draws x observations.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise +contributions. If provided, \code{ylp} and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} + \item{x}{A log-likelihood array or matrix. The \strong{Methods (by class)} section, below, has detailed descriptions of how to specify the inputs for each method.} \item{...}{Currently ignored.} } +\value{ +A named list with elements \code{estimate} (sum over lppd_i), \code{se} +(standard error across lppd_i), and \code{pointwise} (lppd_i). +} \description{ The \code{elpd()} methods for arrays and matrices can compute the expected log pointwise predictive density for a new dataset or the log pointwise predictive density of the observed data (an overestimate of the elpd). + +Computes ELPD as the sum of pointwise log predictive density contributions +(lppd_i). } \details{ The \code{elpd()} function is an S3 generic and methods are provided for 3-D pointwise log-likelihood arrays and matrices. } -\section{Methods (by class)}{ +\section{Functions}{ \itemize{ -\item \code{elpd(array)}: An \eqn{I} by \eqn{C} by \eqn{N} array, where \eqn{I} +\item \code{elpd.array()}: An \eqn{I} by \eqn{C} by \eqn{N} array, where \eqn{I} is the number of MCMC iterations per chain, \eqn{C} is the number of chains, and \eqn{N} is the number of data points. -\item \code{elpd(matrix)}: An \eqn{S} by \eqn{N} matrix, where \eqn{S} is the size +\item \code{elpd.matrix()}: An \eqn{S} by \eqn{N} matrix, where \eqn{S} is the size of the posterior sample (with all chains merged) and \eqn{N} is the number of data points. }} \examples{ # Calculate the lpd of the observed data -LLarr <- example_loglik_array() -elpd(LLarr) +# LLarr <- example_loglik_array() +# elpd(LLarr) +ylp <- matrix(log(c(0.2, 0.4, 0.3, 0.8)), nrow = 2) +elpd(ylp) } \seealso{ The vignette \emph{Holdout validation and K-fold cross-validation of Stan diff --git a/man/insample_pred_measure.Rd b/man/insample_pred_measure.Rd new file mode 100644 index 00000000..455926be --- /dev/null +++ b/man/insample_pred_measure.Rd @@ -0,0 +1,144 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pred_measure_wrappers.R +\name{insample_pred_measure} +\alias{insample_pred_measure} +\title{In-sample predictive performance measures} +\usage{ +insample_pred_measure( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + group_ids = NULL, + save_psis = FALSE, + control = list() +) +} +\arguments{ +\item{y}{Vector of observed values (\code{n}). Required for distributional and +point-prediction measures such as \code{crps}, \code{mae}, and \code{acc}.} + +\item{ypred}{Matrix of posterior predictive draws (\code{S} draws × \code{n} +observations), typically from \code{\link[brms:posterior_predict.brmsfit]{brms::posterior_predict()}}. Required for +distributional measures such as \code{crps}, \code{rps}, and \code{scrps}.} + +\item{mupred}{Matrix of posterior expected values (\code{S} × \code{n}), typically from +\code{\link[brms:posterior_epred.brmsfit]{brms::posterior_epred()}}. Required for point-prediction measures such as +\code{mae}, \code{rmse}, \code{r2}, and \code{acc}.} + +\item{ylp}{Matrix of pointwise log predictive densities or probabilities +(\code{S} × \code{n}), typically from \code{\link[brms:log_lik.brmsfit]{brms::log_lik()}}. Required for density-based +summaries (\code{elpd}, \code{mlpd}, \code{ic}).} + +\item{measure}{Additional measures beyond the base summaries (\code{elpd} and +\code{ic}, which are always included). Can be: +\itemize{ +\item A \strong{character vector} of built-in names; see +\code{\link[=supported_measures_list]{supported_measures_list()}}. +\item A \strong{function} with attribute \code{"measure_name"} for one custom measure. +\item A \strong{list} mixing character scalars (built-in names) and named +functions (custom measures), e.g. \code{list("rps", my_metric = my_fun)}. +} +Custom functions are called with any of \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} that appear in their formals, plus arguments from \code{control}. +They must return a list with \code{estimates} and \code{pointwise}.} + +\item{group_ids}{Optional vector of group identifiers for grouped summaries +(reserved; not yet implemented).} + +\item{save_psis}{Logical. If \code{TRUE}, store the \code{psis} object in the result +so that additional measures can be added later with \code{\link[=pred_measure]{pred_measure()}} without +recomputing PSIS weights.} + +\item{control}{Named list of per-measure settings. Each name must match an +element of \code{measure}; the value is a list of arguments passed to that +measure's summary function (e.g. \code{list(new_measure = list(add_arg = 10))}).} +} +\value{ +An object of class \code{"insample_pred_measure"} and \code{"pred_measure"}: a list +with: +\describe{ +\item{\code{estimates}}{Matrix of summary estimates and standard errors (rows +are measures, columns are \code{Estimate} and \code{SE}). Base rows \code{elpd} and +\code{ic} are always present; \code{ic} equals \eqn{-2 \times} \code{elpd}.} +\item{\code{pointwise}}{Matrix of observation-level contributions (one column +per measure).} +} + +The attribute \code{source} is \code{"insample"}. Attribute \code{dims} gives posterior +draws × observations. Use \code{\link[=print]{print()}} for a readable summary table. +} +\description{ +Compute predictive performance measures on the same data used to fit the +model. This is the simplest entry point when you want density scores +(\code{elpd}, \code{ic}) and optional distributional or point-prediction metrics in +one call. + +In-sample \code{elpd} sums the expected log pointwise predictive density (ELPD) +over the training observations. Because the model has already seen these +data, in-sample scores are \strong{optimistically biased} for predicting future +or otherwise unseen observations. For out-of-sample performance, use +\code{\link[=loo_pred_measure]{loo_pred_measure()}}, \code{\link[=kfold_pred_measure]{kfold_pred_measure()}}, or \code{\link[=test_pred_measure]{test_pred_measure()}} +instead; see Vehtari's +\href{https://users.aalto.fi/~ave/CV-FAQ.html}{Cross-validation FAQ} (Section 4). +} +\details{ +\strong{Input requirements by measure.} Supply only the inputs each measure +needs:\tabular{lcccc}{ + Measure \tab \code{ylp} \tab \code{y} \tab \code{ypred} \tab \code{mupred} \cr + \code{elpd}, \code{mlpd}, \code{ic} \tab ✓ \tab \tab \tab \cr + \code{crps}, \code{scrps}, \code{rps}, \code{srps} \tab \tab ✓ \tab ✓ \tab \cr + \code{acc}, \code{bacc} \tab \tab ✓ \tab \tab ✓ \cr + \code{mae}, \code{mse}, \code{rmse}, \code{r2} \tab \tab ✓ \tab \tab ✓ \cr +} + + +Base measures (\code{elpd}, \code{ic}) are always computed when \code{ylp} is provided. +Pass additional built-in names via \code{measure}, or supply a custom function; +see \link{supported_measures_list} and \code{vignette("overview-measures")} for +definitions and orientation (higher vs lower is better). + +\strong{Custom measures.} A function passed to \code{measure} must have attribute +\code{measure_name} and return \code{estimate}, \code{se}, and \code{pointwise}. Only arguments +declared in the function signature among \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} are supplied automatically. +} +\examples{ +\donttest{ +if (requireNamespace("brms", quietly = TRUE)) { + fit <- brms::brm( + Reaction ~ Days, data = lme4::sleepstudy, + refresh = 0, chains = 2, iter = 1000 + ) + insample_pred_measure( + ylp = brms::log_lik(fit), + y = fit$data$Reaction, + ypred = brms::posterior_predict(fit), + mupred = brms::posterior_epred(fit), + measure = c("rmse", "r2") + ) +} +} +\dontrun{ +# Custom measure (same contract as built-in point-prediction metrics) +my_abs_err <- function(y, mupred, log_weights = NULL) { + mu <- colMeans(mupred) + pw <- abs(y - mu) + list( + estimate = mean(pw), + se = sd(pw) / sqrt(length(pw)), + pointwise = pw + ) +} +attr(my_abs_err, "measure_name") <- "my_abs_err" +# insample_pred_measure(y = y, mupred = mupred, ylp = ylp, measure = my_abs_err) +} + +} +\seealso{ +\code{\link[=pred_measure]{pred_measure()}} to add measures incrementally, +\code{\link[=loo_pred_measure]{loo_pred_measure()}}, \code{\link[=kfold_pred_measure]{kfold_pred_measure()}}, \code{\link[=test_pred_measure]{test_pred_measure()}}, +\link{supported_measures_list}, +\code{vignette("pred_measure_tutorial")} +} diff --git a/man/kfold_pred_measure.Rd b/man/kfold_pred_measure.Rd new file mode 100644 index 00000000..f44d415d --- /dev/null +++ b/man/kfold_pred_measure.Rd @@ -0,0 +1,104 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pred_measure_wrappers.R +\name{kfold_pred_measure} +\alias{kfold_pred_measure} +\title{K-fold cross-validation predictive performance measures} +\usage{ +kfold_pred_measure( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + kfold = NULL, + group_ids = NULL, + control = list() +) +} +\arguments{ +\item{y}{Vector of observed values (\code{n}). Required for distributional and +point-prediction measures such as \code{crps}, \code{mae}, and \code{acc}.} + +\item{ypred}{Matrix of posterior predictive draws (\code{S} draws × \code{n} +observations), typically from \code{\link[brms:posterior_predict.brmsfit]{brms::posterior_predict()}}. Required for +distributional measures such as \code{crps}, \code{rps}, and \code{scrps}.} + +\item{mupred}{Matrix of posterior expected values (\code{S} × \code{n}), typically from +\code{\link[brms:posterior_epred.brmsfit]{brms::posterior_epred()}}. Required for point-prediction measures such as +\code{mae}, \code{rmse}, \code{r2}, and \code{acc}.} + +\item{ylp}{Matrix of pointwise log predictive densities or probabilities +(\code{S} × \code{n}), typically from \code{\link[brms:log_lik.brmsfit]{brms::log_lik()}}. Required for density-based +summaries (\code{elpd}, \code{mlpd}, \code{ic}).} + +\item{measure}{Additional measures beyond the base summaries (\code{elpd} and +\code{ic}, which are always included). Can be: +\itemize{ +\item A \strong{character vector} of built-in names; see +\code{\link[=supported_measures_list]{supported_measures_list()}}. +\item A \strong{function} with attribute \code{"measure_name"} for one custom measure. +\item A \strong{list} mixing character scalars (built-in names) and named +functions (custom measures), e.g. \code{list("rps", my_metric = my_fun)}. +} +Custom functions are called with any of \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} that appear in their formals, plus arguments from \code{control}. +They must return a list with \code{estimates} and \code{pointwise}.} + +\item{kfold}{A \code{kfold} object from \code{\link[brms:kfold.brmsfit]{brms::kfold()}}. Supplies ELPD summaries +and fold structure for \code{\link[=kfold_pred_measure]{kfold_pred_measure()}}; pass \code{y}, \code{ypred}, and/or +\code{mupred} when requesting additional measures.} + +\item{group_ids}{Optional vector of group identifiers for grouped summaries +(reserved; not yet implemented).} + +\item{control}{Named list of per-measure settings. Each name must match an +element of \code{measure}; the value is a list of arguments passed to that +measure's summary function (e.g. \code{list(new_measure = list(add_arg = 10))}).} +} +\value{ +An object of class \code{"kfold_pred_measure"} and \code{"pred_measure"}, inheriting +attributes from the \code{kfold} object (\code{K}, \code{folds}, \code{fold_type}, etc.). The +list contains \code{estimates} and \code{pointwise}; measure names carry a \verb{_kfold} +suffix (e.g. \code{elpd_kfold}, \code{crps_kfold}). +} +\description{ +Compute predictive performance measures under \strong{k-fold cross-validation}. +K-fold CV holds out groups of observations, refits (or reuses stored fits), +and scores the held-out folds. + +Pass a \code{kfold} object from \code{\link[brms:kfold.brmsfit]{brms::kfold()}} (with \code{save_fits = TRUE} when +you need posterior predictions on held-out folds). Base density summaries +(\code{elpd_kfold}, \code{ic_kfold}, \code{p_kfold}) come from the \code{kfold} object; +additional measures require the same optional inputs as +\code{\link[=insample_pred_measure]{insample_pred_measure()}}. +} +\details{ +For distributional measures on held-out folds, obtain posterior predictions +with \code{brms::kfold_predict()} and pass the resulting \code{yrep} matrices as +\code{ypred} and/or \code{mupred}. See the sleep-study workflow in +\code{vignette("pred_measure_tutorial")}. +} +\examples{ +\donttest{ +if (requireNamespace("brms", quietly = TRUE)) { + fit <- brms::brm( + Reaction ~ Days, data = lme4::sleepstudy, + refresh = 0, chains = 2, iter = 1000 + ) + kf <- brms::kfold(fit, K = 5, save_fits = TRUE) + ypred_kf <- brms::kfold_predict(kf, method = "predict")$yrep + kfold_pred_measure( + y = fit$data$Reaction, + ypred = ypred_kf, + kfold = kf, + measure = "rmse" + ) +} +} + +} +\seealso{ +\code{\link[=loo_pred_measure]{loo_pred_measure()}}, \code{\link[=insample_pred_measure]{insample_pred_measure()}}, \code{\link[=pred_measure]{pred_measure()}}, +\code{\link[brms:kfold.brmsfit]{brms::kfold()}}, \link{supported_measures_list}, +\code{vignette("pred_measure_tutorial")} +} diff --git a/man/loo_pred_measure.Rd b/man/loo_pred_measure.Rd new file mode 100644 index 00000000..62086136 --- /dev/null +++ b/man/loo_pred_measure.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pred_measure_wrappers.R +\name{loo_pred_measure} +\alias{loo_pred_measure} +\title{PSIS-LOO predictive performance measures} +\usage{ +loo_pred_measure( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + loo = NULL, + group_ids = NULL, + psis_object = NULL, + save_psis = FALSE, + control = list() +) +} +\arguments{ +\item{y}{Vector of observed values (\code{n}). Required for distributional and +point-prediction measures such as \code{crps}, \code{mae}, and \code{acc}.} + +\item{ypred}{Matrix of posterior predictive draws (\code{S} draws × \code{n} +observations), typically from \code{\link[brms:posterior_predict.brmsfit]{brms::posterior_predict()}}. Required for +distributional measures such as \code{crps}, \code{rps}, and \code{scrps}.} + +\item{mupred}{Matrix of posterior expected values (\code{S} × \code{n}), typically from +\code{\link[brms:posterior_epred.brmsfit]{brms::posterior_epred()}}. Required for point-prediction measures such as +\code{mae}, \code{rmse}, \code{r2}, and \code{acc}.} + +\item{ylp}{Matrix of pointwise log predictive densities or probabilities +(\code{S} × \code{n}), typically from \code{\link[brms:log_lik.brmsfit]{brms::log_lik()}}. Required for density-based +summaries (\code{elpd}, \code{mlpd}, \code{ic}).} + +\item{measure}{Additional measures beyond the base summaries (\code{elpd} and +\code{ic}, which are always included). Can be: +\itemize{ +\item A \strong{character vector} of built-in names; see +\code{\link[=supported_measures_list]{supported_measures_list()}}. +\item A \strong{function} with attribute \code{"measure_name"} for one custom measure. +\item A \strong{list} mixing character scalars (built-in names) and named +functions (custom measures), e.g. \code{list("rps", my_metric = my_fun)}. +} +Custom functions are called with any of \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} that appear in their formals, plus arguments from \code{control}. +They must return a list with \code{estimates} and \code{pointwise}.} + +\item{loo}{A \code{\link[=loo]{loo()}} result, computed with +\code{save_psis = TRUE} so that PSIS weights are available for additional +measures. See \code{\link[=loo_pred_measure]{loo_pred_measure()}}.} + +\item{group_ids}{Optional vector of group identifiers for grouped summaries +(reserved; not yet implemented).} + +\item{psis_object}{A \code{psis} object with LOO importance weights. An +alternative to passing a full \code{loo} object; must be supplied together with +\code{ylp} when computing \code{elpd}.} + +\item{save_psis}{Logical. If \code{TRUE}, store the \code{psis} object in the result +so that additional measures can be added later with \code{\link[=pred_measure]{pred_measure()}} without +recomputing PSIS weights.} + +\item{control}{Named list of per-measure settings. Each name must match an +element of \code{measure}; the value is a list of arguments passed to that +measure's summary function (e.g. \code{list(new_measure = list(add_arg = 10))}).} +} +\value{ +An object of class \code{"loo_pred_measure"}, \code{"pred_measure"}, and (when a +\code{loo} object is supplied) \code{"loo"}. In addition to \code{estimates} and +\code{pointwise}, the list may contain: +\describe{ +\item{\code{diagnostics}}{PSIS diagnostics, including Pareto \eqn{\hat{k}} in +\code{diagnostics$pareto_k}. Values above 0.7 suggest unreliable +LOO estimates for those observations.} +\item{\code{log_weights}}{Normalized log importance weights used for LOO +scoring.} +\item{\code{psis_object}}{Stored when \code{save_psis = TRUE}; needed to add +further measures with \code{\link[=pred_measure]{pred_measure()}} without recomputing weights.} +} + +Measure names carry a \verb{_loo} suffix (e.g. \code{elpd_loo}, \code{crps_loo}). +} +\description{ +Estimate out-of-sample predictive performance with \strong{PSIS-LOO} +(Pareto-smoothed importance sampling leave-one-out cross-validation). +PSIS-LOO approximates exact LOO-CV without refitting the model once per +observation: each held-out point is scored by reweighting the full-data +posterior draws. + +The primary summary is \code{elpd_loo}, the LOO estimate of expected log +pointwise predictive density (ELPD). Additional rows include \code{ic_loo} +(\eqn{-2 \times} \code{elpd_loo}) and \code{p_loo} (effective number of parameters, +the difference between in-sample and LOO log predictive density). See +\code{\link[=loo]{loo()}} and the +\href{https://users.aalto.fi/~ave/CV-FAQ.html}{Cross-validation FAQ} for +interpretation. +} +\details{ +\strong{Three equivalent input patterns:} + +\describe{ +\item{Precomputed \code{loo} object}{\code{loo_pred_measure(loo = loo_fit, ...)}. +Run \code{\link[=loo]{loo()}} with \code{save_psis = TRUE}.} +\item{\code{ylp} + \code{psis_object}}{Pass both when you have already computed +PSIS weights separately.} +\item{\code{ylp} only}{PSIS weights are computed internally from \code{ylp}.} +} + +For distributional and point-prediction measures (\code{crps}, \code{r2}, etc.), +supply \code{y}, \code{ypred}, and/or \code{mupred} as for \code{\link[=insample_pred_measure]{insample_pred_measure()}}. When +adding measures incrementally, call \code{\link[=pred_measure]{pred_measure()}} with \code{predperf} set to +an existing result; use \code{save_psis = TRUE} on the initial call so weights +are stored. +} +\examples{ +\donttest{ +if (requireNamespace("brms", quietly = TRUE)) { + fit <- brms::brm( + Reaction ~ Days, data = lme4::sleepstudy, + refresh = 0, chains = 2, iter = 1000 + ) + loo_fit <- loo::loo(fit, save_psis = TRUE) + loo_pred_measure( + loo = loo_fit, + y = fit$data$Reaction, + ypred = brms::posterior_predict(fit), + measure = c("rmse", "r2") + ) +} +} + +} +\seealso{ +\code{\link[=insample_pred_measure]{insample_pred_measure()}}, \code{\link[=pred_measure]{pred_measure()}}, \code{\link[=loo]{loo()}}, +\link{supported_measures_list}, +\code{vignette("pred_measure_tutorial")} +} diff --git a/man/mae.Rd b/man/mae.Rd new file mode 100644 index 00000000..530ec762 --- /dev/null +++ b/man/mae.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{mae} +\alias{mae} +\title{Mean absolute error (\code{mae})} +\usage{ +mae(y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of observed outcomes.} + +\item{mupred}{A numeric matrix of posterior expected predictions with +dimensions draws x observations. A length-\code{n} vector is also accepted.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise absolute +errors. If provided, \code{y}, \code{mupred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (mean absolute error), \code{se} +(standard error), and \code{pointwise} (pointwise absolute errors). +} +\description{ +Computes MAE between observed outcomes and posterior predictive point +predictions. Point predictions are obtained by averaging \code{mupred} draws, or +by PSIS-weighted averaging when \code{log_weights} is provided. +} +\examples{ +y <- c(1, 2, 3) +mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +mae(y, mupred) +} diff --git a/man/mlpd.Rd b/man/mlpd.Rd new file mode 100644 index 00000000..99e21055 --- /dev/null +++ b/man/mlpd.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{mlpd} +\alias{mlpd} +\title{Mean log pointwise predictive density (\code{mlpd})} +\usage{ +mlpd(ylp, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{ylp}{A numeric matrix of log predictive densities/probabilities with +dimensions draws x observations.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise +contributions. If provided, \code{ylp} and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (mean over lppd_i), \code{se} +(standard error), and \code{pointwise} (lppd_i). +} +\description{ +Computes MLPD as the average of pointwise log predictive density (lppd_i) +values. Inputs follow the same conventions as \code{\link[=elpd]{elpd()}}. +} +\examples{ +ylp <- matrix(log(c(0.2, 0.4, 0.3, 0.8)), nrow = 2) +mlpd(ylp) +} diff --git a/man/mse.Rd b/man/mse.Rd new file mode 100644 index 00000000..9c501545 --- /dev/null +++ b/man/mse.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{mse} +\alias{mse} +\title{Mean squared error (\code{mse})} +\usage{ +mse(y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of observed outcomes.} + +\item{mupred}{A numeric matrix of posterior expected predictions with +dimensions draws x observations. A length-\code{n} vector is also accepted.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise absolute +errors. If provided, \code{y}, \code{mupred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (mean squared error), \code{se} +(standard error), and \code{pointwise} (pointwise squared errors). +} +\description{ +Computes MSE between observed outcomes and posterior predictive point +predictions. Point predictions are obtained by averaging \code{mupred} draws, or +by PSIS-weighted averaging when \code{log_weights} is provided. +} +\examples{ +y <- c(1, 2, 3) +mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +mse(y, mupred) +} diff --git a/man/pred_measure.Rd b/man/pred_measure.Rd new file mode 100644 index 00000000..e63b1d42 --- /dev/null +++ b/man/pred_measure.Rd @@ -0,0 +1,130 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pred_measure_wrappers.R +\name{pred_measure} +\alias{pred_measure} +\title{Add predictive performance measures to an existing result} +\usage{ +pred_measure( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + measure = NULL, + predperf, + group_ids = NULL, + psis_object = NULL, + save_psis = FALSE, + control = list() +) +} +\arguments{ +\item{y}{Vector of observed values (\code{n}). Required for distributional and +point-prediction measures such as \code{crps}, \code{mae}, and \code{acc}.} + +\item{ypred}{Matrix of posterior predictive draws (\code{S} draws × \code{n} +observations), typically from \code{\link[brms:posterior_predict.brmsfit]{brms::posterior_predict()}}. Required for +distributional measures such as \code{crps}, \code{rps}, and \code{scrps}.} + +\item{mupred}{Matrix of posterior expected values (\code{S} × \code{n}), typically from +\code{\link[brms:posterior_epred.brmsfit]{brms::posterior_epred()}}. Required for point-prediction measures such as +\code{mae}, \code{rmse}, \code{r2}, and \code{acc}.} + +\item{ylp}{Matrix of pointwise log predictive densities or probabilities +(\code{S} × \code{n}), typically from \code{\link[brms:log_lik.brmsfit]{brms::log_lik()}}. Required for density-based +summaries (\code{elpd}, \code{mlpd}, \code{ic}).} + +\item{measure}{Additional measures beyond the base summaries (\code{elpd} and +\code{ic}, which are always included). Can be: +\itemize{ +\item A \strong{character vector} of built-in names; see +\code{\link[=supported_measures_list]{supported_measures_list()}}. +\item A \strong{function} with attribute \code{"measure_name"} for one custom measure. +\item A \strong{list} mixing character scalars (built-in names) and named +functions (custom measures), e.g. \code{list("rps", my_metric = my_fun)}. +} +Custom functions are called with any of \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} that appear in their formals, plus arguments from \code{control}. +They must return a list with \code{estimates} and \code{pointwise}.} + +\item{predperf}{An existing predictive measure object (class +\code{"pred_measure"}) to update. When supplied, base density summaries and +(for LOO) PSIS weights are reused instead of recomputed.} + +\item{group_ids}{Optional vector of group identifiers for grouped summaries +(reserved; not yet implemented).} + +\item{psis_object}{A \code{psis} object with LOO importance weights. An +alternative to passing a full \code{loo} object; must be supplied together with +\code{ylp} when computing \code{elpd}.} + +\item{save_psis}{Logical. If \code{TRUE}, store the \code{psis} object in the result +so that additional measures can be added later with \code{\link[=pred_measure]{pred_measure()}} without +recomputing PSIS weights.} + +\item{control}{Named list of per-measure settings. Each name must match an +element of \code{measure}; the value is a list of arguments passed to that +measure's summary function (e.g. \code{list(new_measure = list(add_arg = 10))}).} +} +\value{ +An updated object of the same class as \code{predperf}, with new rows in +\code{estimates} and columns in \code{pointwise} for each requested measure. Base +summaries (\code{elpd}, \code{ic}, and LOO/k-fold complexity terms) are not +recomputed. +} +\description{ +Extend a \code{"pred_measure"} object with additional measures \strong{without +recomputing} what is already stored. Use this for interactive exploration +or when you first compute base density summaries and later add distributional +or point-prediction metrics. + +Pass the existing object as \code{predperf} and supply any inputs newly required +by the requested measures (see the input table in +\code{\link[=insample_pred_measure]{insample_pred_measure()}}). The evaluation mode (\code{"insample"}, \code{"loo"}, +\code{"kfold"}, or \code{"test"}) is taken from \code{predperf}; LOO paths reuse stored +PSIS weights when available. +} +\details{ +\strong{Typical workflow:} + +\preformatted{ +result <- loo_pred_measure(loo = loo_fit, save_psis = TRUE) +pred_measure( + y = y, + mupred = mupred, + predperf = result, + measure = c("rmse", "r2") +) +} + +When extending a LOO result, ensure the initial call used \code{save_psis = TRUE} +(or that \code{predperf} already contains a \code{psis_object}) so LOO weights are +available for additional measures. +} +\examples{ +\donttest{ +if (requireNamespace("brms", quietly = TRUE)) { + fit <- brms::brm( + Reaction ~ Days, data = lme4::sleepstudy, + refresh = 0, chains = 2, iter = 1000 + ) + result <- insample_pred_measure( + ylp = brms::log_lik(fit), + y = fit$data$Reaction, + ypred = brms::posterior_predict(fit), + measure = "rmse" + ) + pred_measure( + y = fit$data$Reaction, + mupred = brms::posterior_epred(fit), + predperf = result, + measure = "r2" + ) +} +} + +} +\seealso{ +\code{\link[=insample_pred_measure]{insample_pred_measure()}}, \code{\link[=loo_pred_measure]{loo_pred_measure()}}, +\code{\link[=kfold_pred_measure]{kfold_pred_measure()}}, \code{\link[=test_pred_measure]{test_pred_measure()}}, \link{supported_measures_list}, +\code{vignette("pred_measure_tutorial", package = "pred_measures")} +} diff --git a/man/pred_measure_params.Rd b/man/pred_measure_params.Rd new file mode 100644 index 00000000..dc0dbf98 --- /dev/null +++ b/man/pred_measure_params.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pred_measure.R +\name{pred_measure_params} +\alias{pred_measure_params} +\alias{pred_measure_engine} +\title{Shared parameters for predictive measure wrappers} +\usage{ +pred_measure_engine( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + ylp_test = NULL, + measure = NULL, + predperf = NULL, + loo = NULL, + kfold = NULL, + group_ids = NULL, + psis_object = NULL, + save_psis = FALSE, + source = NULL, + control = list() +) +} +\arguments{ +\item{y}{Vector of observed values (\code{n}). Required for distributional and +point-prediction measures such as \code{crps}, \code{mae}, and \code{acc}.} + +\item{ypred}{Matrix of posterior predictive draws (\code{S} draws × \code{n} +observations), typically from \code{\link[brms:posterior_predict.brmsfit]{brms::posterior_predict()}}. Required for +distributional measures such as \code{crps}, \code{rps}, and \code{scrps}.} + +\item{mupred}{Matrix of posterior expected values (\code{S} × \code{n}), typically from +\code{\link[brms:posterior_epred.brmsfit]{brms::posterior_epred()}}. Required for point-prediction measures such as +\code{mae}, \code{rmse}, \code{r2}, and \code{acc}.} + +\item{ylp}{Matrix of pointwise log predictive densities or probabilities +(\code{S} × \code{n}), typically from \code{\link[brms:log_lik.brmsfit]{brms::log_lik()}}. Required for density-based +summaries (\code{elpd}, \code{mlpd}, \code{ic}).} + +\item{ylp_test}{Matrix of pointwise log predictive densities for holdout +observations (\code{S} × \code{n_test}), typically from +\code{brms::log_lik(fit, newdata = test_data)}. Used with \code{ylp} (from the +training fit) in \code{\link[=test_pred_measure]{test_pred_measure()}} to score genuinely new data.} + +\item{measure}{Additional measures beyond the base summaries (\code{elpd} and +\code{ic}, which are always included). Can be: +\itemize{ +\item A \strong{character vector} of built-in names; see +\code{\link[=supported_measures_list]{supported_measures_list()}}. +\item A \strong{function} with attribute \code{"measure_name"} for one custom measure. +\item A \strong{list} mixing character scalars (built-in names) and named +functions (custom measures), e.g. \code{list("rps", my_metric = my_fun)}. +} +Custom functions are called with any of \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} that appear in their formals, plus arguments from \code{control}. +They must return a list with \code{estimates} and \code{pointwise}.} + +\item{predperf}{An existing predictive measure object (class +\code{"pred_measure"}) to update. When supplied, base density summaries and +(for LOO) PSIS weights are reused instead of recomputed.} + +\item{loo}{A \code{\link[=loo]{loo()}} result, computed with +\code{save_psis = TRUE} so that PSIS weights are available for additional +measures. See \code{\link[=loo_pred_measure]{loo_pred_measure()}}.} + +\item{kfold}{A \code{kfold} object from \code{\link[brms:kfold.brmsfit]{brms::kfold()}}. Supplies ELPD summaries +and fold structure for \code{\link[=kfold_pred_measure]{kfold_pred_measure()}}; pass \code{y}, \code{ypred}, and/or +\code{mupred} when requesting additional measures.} + +\item{group_ids}{Optional vector of group identifiers for grouped summaries +(reserved; not yet implemented).} + +\item{psis_object}{A \code{psis} object with LOO importance weights. An +alternative to passing a full \code{loo} object; must be supplied together with +\code{ylp} when computing \code{elpd}.} + +\item{save_psis}{Logical. If \code{TRUE}, store the \code{psis} object in the result +so that additional measures can be added later with \code{\link[=pred_measure]{pred_measure()}} without +recomputing PSIS weights.} + +\item{source}{Character string indicating the evaluation mode: \code{"insample"}, +\code{"loo"}, \code{"kfold"}, or \code{"test"}. Set automatically by the wrapper +functions; required when calling \code{\link[=pred_measure_engine]{pred_measure_engine()}} directly.} + +\item{control}{Named list of per-measure settings. Each name must match an +element of \code{measure}; the value is a list of arguments passed to that +measure's summary function (e.g. \code{list(new_measure = list(add_arg = 10))}).} + +\item{measure_name}{For a single custom function, set +\code{attr(my_fun, "measure_name") <- "my_metric"} before passing \code{my_fun} to +\code{measure}.} +} +\description{ +Parameter definitions shared by the user-facing entry points and the +internal engine \code{\link[=pred_measure_engine]{pred_measure_engine()}}. +} +\keyword{internal} diff --git a/man/ptw_log_pred_density.Rd b/man/ptw_log_pred_density.Rd new file mode 100644 index 00000000..4d6a925d --- /dev/null +++ b/man/ptw_log_pred_density.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{ptw_log_pred_density} +\alias{ptw_log_pred_density} +\title{Pointwise log predictive density (\code{lppd_i})} +\usage{ +ptw_log_pred_density(ylp, psis_log_weights = NULL) +} +\arguments{ +\item{ylp}{A numeric matrix of log predictive densities/probabilities with +dimensions draws x observations.} + +\item{psis_log_weights}{Optional numeric matrix of normalized PSIS log +weights with the same dimensions as \code{ylp}. Each column must sum to 1 on +the probability scale.} +} +\value{ +A numeric vector of length \code{ncol(ylp)} with pointwise log +predictive density values. +} +\description{ +Computes pointwise log predictive density contributions from a matrix of +log predictive densities/probabilities for posterior draws. When PSIS +log-weights are supplied, they are used to form a weighted log-sum-exp per +observation. +} +\examples{ +ylp <- matrix(log(c(0.2, 0.4, 0.3, 0.8)), nrow = 2) +ptw_log_pred_density(ylp) + +lw <- matrix(log(c(0.7, 0.3, 0.6, 0.4)), nrow = 2) +ptw_log_pred_density(ylp, lw) +} diff --git a/man/r2.Rd b/man/r2.Rd new file mode 100644 index 00000000..2967740d --- /dev/null +++ b/man/r2.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{r2} +\alias{r2} +\title{Predictive R-squared (\code{r2})} +\usage{ +r2(y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of observed outcomes.} + +\item{mupred}{A numeric matrix of posterior expected predictions with +dimensions draws x observations. A length-\code{n} vector is also accepted.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise absolute +errors. If provided, \code{y}, \code{mupred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (predictive R2), \code{se} +(standard error), and \code{pointwise} (pointwise squared errors). +} +\description{ +Computes predictive R-squared as one minus the ratio of prediction MSE to +the empirical variance of \code{y}. The standard error is computed with a +first-order delta-method approximation. +} +\examples{ +y <- c(1, 2, 3) +mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +r2(y, mupred) +} diff --git a/man/rmse.Rd b/man/rmse.Rd new file mode 100644 index 00000000..304cdef5 --- /dev/null +++ b/man/rmse.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{rmse} +\alias{rmse} +\title{Root mean squared error (\code{rmse})} +\usage{ +rmse(y, mupred, log_weights = NULL, pointwise = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of observed outcomes.} + +\item{mupred}{A numeric matrix of posterior expected predictions with +dimensions draws x observations. A length-\code{n} vector is also accepted.} + +\item{log_weights}{Optional numeric matrix of unnormalized log-weights +with dimensions draws x observations.} + +\item{pointwise}{Optional numeric vector of precomputed pointwise absolute +errors. If provided, \code{y}, \code{mupred}, and \code{log_weights} are ignored.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiply the estimate and pointwise +values by -1 before returning.} +} +\value{ +A named list with elements \code{estimate} (root mean squared error), +\code{se} (standard error), and \code{pointwise} (pointwise squared errors). +} +\description{ +Computes RMSE as the square root of MSE and propagates uncertainty via a +first-order delta-method approximation. +} +\examples{ +y <- c(1, 2, 3) +mupred <- matrix(c(0.9, 2.1, 2.8, 1.2, 1.9, 3.1), nrow = 2) +rmse(y, mupred) +} diff --git a/man/rps.Rd b/man/rps.Rd new file mode 100644 index 00000000..ca4b4964 --- /dev/null +++ b/man/rps.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{rps} +\alias{rps} +\title{Ranked Probability Score (RPS, SRPS, CRPS, SCRPS)} +\usage{ +rps(y, ypred, log_weights = NULL, scaled = FALSE, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of \eqn{n} observed outcomes. May be integer-valued +(for RPS/SRPS) or continuous (for CRPS/SCRPS).} + +\item{ypred}{A numeric matrix of posterior predictive draws with dimensions +\eqn{S \times n} (draws × observations).} + +\item{log_weights}{Optional numeric matrix of log-importance- +weights with dimensions \eqn{S \times n}. When provided, a weighted PWM +estimator is used. Useful for LOO cross-validation via importance sampling +(e.g., PSIS-LOO).} + +\item{scaled}{Logical; if \code{TRUE}, computes the scaled variant (SRPS for +discrete outcomes, SCRPS for continuous outcomes). Default is \code{FALSE}.} + +\item{revert_sign}{Logical; if \code{TRUE}, multiplies scores by \eqn{-1} to +return the loss convention (lower is better) rather than the utility +convention. Default is \code{FALSE}.} +} +\value{ +A named list with: +\describe{ +\item{\code{estimate}}{Mean score across all observations.} +\item{\code{se}}{Standard error of the mean score.} +\item{\code{pointwise}}{Numeric vector of length \eqn{n} with per-observation +scores.} +} +} +\description{ +Computes proper scoring rules based on the ranked probability score family, +covering both discrete and continuous outcomes, with optional scaling. +The specific score computed depends on the type of \code{y} and \code{ypred} and the +value of \code{scaled}: +} +\details{ +\tabular{lll}{ + \code{y}/\code{ypred} type \tab \code{scaled = FALSE} \tab \code{scaled = TRUE} \cr + Discrete \tab RPS \tab SRPS \cr + Continuous \tab CRPS \tab SCRPS \cr +} + + +\strong{Scoring rules:} +\itemize{ +\item \strong{RPS} (Epstein, 1969): Compares predictive and observed cumulative +distributions for ordered discrete outcomes. +\item \strong{CRPS} (Matheson & Winkler, 1976; Gneiting & Raftery, 2007): Generalizes +RPS to continuous outcomes. Defined as +\deqn{\mathrm{CRPS}(X; y) = \frac{1}{2} E[|X - X'|] - E[|X - y|],} +where \eqn{X, X'} are independent draws from the predictive distribution. +\item \strong{SRPS/SCRPS} (Bolin & Wallin, 2023): Scaled variants that are invariant +to the scale of the predictive distribution. Defined as +\deqn{\mathrm{SCRPS}(X; y) = -\frac{E[|X - y|]}{E[|X - X'|]} - + \frac{1}{2} \log E[|X - X'|].} +} + +\strong{Estimation:} + +Scores are estimated using the probability-weighted moment (PWM) estimator +(Taillardat et al., 2016; Zamo & Naveau, 2018), which requires only a single +set of predictive draws unlike permutation-based estimators, which require +two independent draw sets. The PWM estimator is unbiased and generally more +accurate than single-permutation estimators. The same estimator is used for +both discrete and continuous outcomes; see Hosking (1990, 1996) for +theoretical justification in the discrete case. + +If log-weights (\code{log_weights}) are provided (e.g., PSIS weights +for LOO cross-validation), a weighted PWM estimator is used instead, which +accounts for the importance weights when estimating expectations. + +\strong{Sign convention:} + +Unscaled scores are returned as utilities (higher is better), consistent +with the sign convention of log score / ELPD. Scaled scores are negatively +oriented (lower is better). Use \code{revert_sign = TRUE} to obtain the +loss convention (lower is better) used in some references. +} +\examples{ +# Discrete outcomes: RPS +y <- c(2L, 1L, 3L) +ypred <- matrix(c(2, 1, 2, 3, 1, 3), nrow = 2) +rps(y, ypred) + +# Discrete outcomes: SRPS (scaled) +rps(y, ypred, scaled = TRUE) + +# Continuous outcomes: CRPS +y_cont <- c(0.5, -1.2, 2.3) +ypred_cont <- matrix(rnorm(200), nrow = 100, ncol = 3) +rps(y_cont, ypred_cont) + +# With importance weights: LOO-CRPS +log_weights <- matrix(rnorm(200), nrow = 100, ncol = 3) +rps(y_cont, ypred_cont, log_weights = log_weights) + +} +\references{ +Bolin, D. and Wallin, J. (2023). Local scale invariance and robustness of +proper scoring rules. \emph{Statistical Science}, 38(1):140–159. + +Epstein, E. S. (1969). A scoring system for probability forecasts of ranked +categories. \emph{Journal of Applied Meteorology}, 8(6):985–987. + +Gneiting, T. and Raftery, A. E. (2007). Strictly proper scoring rules, +prediction, and estimation. \emph{Journal of the American Statistical +Association}, 102(477):359–378. + +Hosking, J. R. M. (1990). L-moments: Analysis and estimation of +distributions using linear combinations of order statistics. \emph{Journal of +the Royal Statistical Society Series B}, 52(1):105–124. + +Hosking, J. R. M. (1996). Some theoretical results concerning L-moments. +Research report RC 14492. IBM Thomas J. Watson Research Division. + +Matheson, J. E. and Winkler, R. L. (1976). Scoring rules for continuous +probability distributions. \emph{Management Science}, 22(10):1087–1096. + +Taillardat, M., Mestre, O., Zamo, M., and Naveau, P. (2016). Calibrated +ensemble forecasts using quantile regression forests and ensemble model +output statistics. \emph{Monthly Weather Review}, 144(6):2375–2393. + +Zamo, M. and Naveau, P. (2018). Estimation of the continuous ranked +probability score with limited information and applications to ensemble +weather forecasts. \emph{Mathematical Geosciences}, 50:209–234. +} diff --git a/man/srps.Rd b/man/srps.Rd new file mode 100644 index 00000000..a2b37d23 --- /dev/null +++ b/man/srps.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\name{srps} +\alias{srps} +\title{Scaled Ranked Probability Score (SRPS, SCRPS)} +\usage{ +srps(y, ypred, log_weights = NULL, revert_sign = FALSE) +} +\arguments{ +\item{y}{A numeric vector of \eqn{n} observed outcomes. May be integer-valued +(for RPS/SRPS) or continuous (for CRPS/SCRPS).} + +\item{ypred}{A numeric matrix of posterior predictive draws with dimensions +\eqn{S \times n} (draws × observations).} + +\item{log_weights}{Optional numeric matrix of log-importance- +weights with dimensions \eqn{S \times n}. When provided, a weighted PWM +estimator is used. Useful for LOO cross-validation via importance sampling +(e.g., PSIS-LOO).} + +\item{revert_sign}{Logical; if \code{TRUE}, multiplies scores by \eqn{-1} to +return the loss convention (lower is better) rather than the utility +convention. Default is \code{FALSE}.} +} +\value{ +A named list with: +\describe{ +\item{\code{estimate}}{Mean score across all observations.} +\item{\code{se}}{Standard error of the mean score.} +\item{\code{pointwise}}{Numeric vector of length \eqn{n} with per-observation +scores.} +} +} +\description{ +A convenience wrapper around \code{\link[=rps]{rps()}} with \code{scaled = TRUE}. Computes the +scaled ranked probability score (SRPS) for discrete outcomes or the scaled +continuously ranked probability score (SCRPS) for continuous outcomes. +Scaling makes the score invariant to the spread of the predictive +distribution, which can be useful when comparing models across observations +with very different predictive uncertainties. +} +\details{ +See \code{\link[=rps]{rps()}} for full details on arguments, estimation, and references. +} +\examples{ +y <- c(2L, 1L, 3L) +ypred <- matrix(c(2, 1, 2, 3, 1, 3), nrow = 2) +srps(y, ypred) + +} diff --git a/man/supported_measures_list.Rd b/man/supported_measures_list.Rd new file mode 100644 index 00000000..20b3a397 --- /dev/null +++ b/man/supported_measures_list.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures.R +\docType{data} +\name{supported_measures_list} +\alias{supported_measures_list} +\title{Supported predictive measure names} +\format{ +An object of class \code{character} of length 10. +} +\usage{ +supported_measures_list +} +\description{ +A character vector of measure names that can be passed to the \code{measure} +argument of \code{\link[=insample_pred_measure]{insample_pred_measure()}}, \code{\link[=loo_pred_measure]{loo_pred_measure()}}, +\code{\link[=kfold_pred_measure]{kfold_pred_measure()}}, \code{\link[=test_pred_measure]{test_pred_measure()}}, and \code{\link[=pred_measure]{pred_measure()}}. +} +\keyword{datasets} diff --git a/man/test_pred_measure.Rd b/man/test_pred_measure.Rd new file mode 100644 index 00000000..cb1ead6c --- /dev/null +++ b/man/test_pred_measure.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pred_measure_wrappers.R +\name{test_pred_measure} +\alias{test_pred_measure} +\title{Holdout predictive performance measures} +\usage{ +test_pred_measure( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = NULL, + ylp_test = NULL, + measure = NULL, + group_ids = NULL, + control = list() +) +} +\arguments{ +\item{y}{Vector of observed values (\code{n}). Required for distributional and +point-prediction measures such as \code{crps}, \code{mae}, and \code{acc}.} + +\item{ypred}{Matrix of posterior predictive draws (\code{S} draws × \code{n} +observations), typically from \code{\link[brms:posterior_predict.brmsfit]{brms::posterior_predict()}}. Required for +distributional measures such as \code{crps}, \code{rps}, and \code{scrps}.} + +\item{mupred}{Matrix of posterior expected values (\code{S} × \code{n}), typically from +\code{\link[brms:posterior_epred.brmsfit]{brms::posterior_epred()}}. Required for point-prediction measures such as +\code{mae}, \code{rmse}, \code{r2}, and \code{acc}.} + +\item{ylp}{Matrix of pointwise log predictive densities or probabilities +(\code{S} × \code{n}), typically from \code{\link[brms:log_lik.brmsfit]{brms::log_lik()}}. Required for density-based +summaries (\code{elpd}, \code{mlpd}, \code{ic}).} + +\item{ylp_test}{Matrix of pointwise log predictive densities for holdout +observations (\code{S} × \code{n_test}), typically from +\code{brms::log_lik(fit, newdata = test_data)}. Used with \code{ylp} (from the +training fit) in \code{\link[=test_pred_measure]{test_pred_measure()}} to score genuinely new data.} + +\item{measure}{Additional measures beyond the base summaries (\code{elpd} and +\code{ic}, which are always included). Can be: +\itemize{ +\item A \strong{character vector} of built-in names; see +\code{\link[=supported_measures_list]{supported_measures_list()}}. +\item A \strong{function} with attribute \code{"measure_name"} for one custom measure. +\item A \strong{list} mixing character scalars (built-in names) and named +functions (custom measures), e.g. \code{list("rps", my_metric = my_fun)}. +} +Custom functions are called with any of \code{y}, \code{ypred}, \code{mupred}, \code{ylp}, and +\code{log_weights} that appear in their formals, plus arguments from \code{control}. +They must return a list with \code{estimates} and \code{pointwise}.} + +\item{group_ids}{Optional vector of group identifiers for grouped summaries +(reserved; not yet implemented).} + +\item{control}{Named list of per-measure settings. Each name must match an +element of \code{measure}; the value is a list of arguments passed to that +measure's summary function (e.g. \code{list(new_measure = list(add_arg = 10))}).} +} +\value{ +An object of class \code{"test_pred_measure"} and \code{"pred_measure"} with +\code{estimates} and \code{pointwise}. Measure names carry a \verb{_test} suffix (e.g. +\code{elpd_test}, \code{crps_test}). Attribute \code{dims} reflects the test-set size +(from \code{ylp_test}), not the training data. +} +\description{ +Score predictive performance on \strong{genuinely new (holdout) data} that was +not used to fit the model. This mirrors the cross-validation goal of +assessing how well a model predicts unseen observations, but with an +explicit train/test split rather than LOO or k-fold reweighting. + +Supply \code{ylp} from the \strong{training} fit and \code{ylp_test} from log predictive +densities evaluated on the holdout set (e.g. +\code{brms::log_lik(fit, newdata = test_data)}). Optional distributional and +point-prediction measures use observed and predicted values on the test set +only. +} +\details{ +When both \code{ylp} (training) and \code{ylp_test} (holdout) are provided, the +effective number of parameters (\code{p_test}) compares in-sample and holdout log +predictive density, analogous to \code{p_loo} for PSIS-LOO. +} +\examples{ +\donttest{ +if (requireNamespace("brms", quietly = TRUE)) { + data <- lme4::sleepstudy + train <- data[1:150, ] + test <- data[151:nrow(data), ] + fit <- brms::brm( + Reaction ~ Days, data = train, + refresh = 0, chains = 2, iter = 1000 + ) + test_pred_measure( + y = test$Reaction, + ypred = brms::posterior_predict(fit, newdata = test), + mupred = brms::posterior_epred(fit, newdata = test), + ylp = brms::log_lik(fit), + ylp_test = brms::log_lik(fit, newdata = test), + measure = c("rmse", "r2") + ) +} +} + +} +\seealso{ +\code{\link[=insample_pred_measure]{insample_pred_measure()}}, \code{\link[=loo_pred_measure]{loo_pred_measure()}}, +\code{\link[=kfold_pred_measure]{kfold_pred_measure()}}, \code{\link[=pred_measure]{pred_measure()}}, \link{supported_measures_list}, +\code{vignette("pred_measure_tutorial")} +} diff --git a/tests/testthat/_snaps/loo_and_waic.md b/tests/testthat/_snaps/loo_and_waic.md index 018b6d98..c259bb67 100644 --- a/tests/testthat/_snaps/loo_and_waic.md +++ b/tests/testthat/_snaps/loo_and_waic.md @@ -1,54 +1,54 @@ # loo, waic and elpd results haven't changed - WAoAAAACAAQFAAACAwAAAAMTAAAACgAAAg4AAAAGwFTltnf8GWZACqFzqDCa8kBk5bZ3/Blm - QBEipbqh/pI/8m8Dok+RiEAhIqW6of6SAAAEAgAAAAEABAAJAAAAA2RpbQAAAA0AAAACAAAA + WAoAAAACAAQDAwACAwAAAAMTAAAACgAAAg4AAAAGwFTltnf8GWhACqFzqDCbAEBk5bZ3/Blo + QBEipbqh/pA/8m8Dok+Rg0AhIqW6of6QAAAEAgAAAAEABAAJAAAAA2RpbQAAAA0AAAACAAAA AwAAAAIAAAQCAAAAAQAEAAkAAAAIZGltbmFtZXMAAAATAAAAAgAAABAAAAADAAQACQAAAAhl bHBkX2xvbwAEAAkAAAAFcF9sb28ABAAJAAAABWxvb2ljAAAAEAAAAAIABAAJAAAACEVzdGlt YXRlAAQACQAAAAJTRQAAAP4AAAIOAAAAoMAC+fJnaLHwwAEP7abtCtDAArJiAQQUDsABa6HW - xx3swAC19bRGOqbAAOVsk57EhMAHXE6oz8X6wAhOMwo4HzjAAxWJEz7wYMAAvCuQLdoawAEz - +EIq/LDAARqnVc5fZMAAuMi8GPlmwAJBKzhjQ9zAAe5YKyVuTsADlXoqYwIGwBIRlu/H12/A + xx3swAC19bRGOqbAAOVsk57EhMAHXE6oz8X6wAhOMwo4HzjAAxWJEz7wXsAAvCuQLdoawAEz + +EIq/LDAARqnVc5fZMAAuMi8GPlmwAJBKzhjQ9zAAe5YKyVuUsADlXoqYwIGwBIRlu/H127A E/WagapcWsACpJaoNucYwBLYLgxdc6fAA7rSrp1oIMAEf9XEej+SwAa4oZCQGv7ABf5oQger - 6sADXWMppHVuwAEJEltQ9nLAAOaxVhD86sAB7SPP8uDQwAnDIjSfxULABAp5a+6r9sAFJHaV - 7SH8wAEph84QBEw/eQDDaV43ET9xp+BQBCmTP3qfwKytXZ0/ctyOXeTxzz9xqi9N9aR4P3IZ + 6sADXWMppHVuwAEJEltQ9nLAAOaxVhD86sAB7SPP8uDawAnDIjSfxULABAp5a+6r9sAFJHaV + 7SH8wAEph84QBFA/eQDDaV43ET9xp+BQBCmTP3qfwKytXZ0/ctyOXeTxzz9xqi9N9aR4P3IZ d1RCmrs/hDyfFGZAlj+GNSCYztY2P3e6iLhToUY/cX4fUwwY4j9yNONK++tcP3MW2+Teir8/ chrQtYH1VT94S/hkJOXvP4KGRiL5AqY/jR2tM+V2kj+xFwrjXj1kP6T55+1SS6g/h0J5DiSH Lz+l12L3soNYP36Eg9WZBwg/fE5degjr7j+CgjOtLdigP4MAE38XAjM/efuEjSOY2z938dyR QJQSP3UOviSS5E0/hJj7+xYimz+KmFtUHEseP3x0d1lSXvQ/fyjSq08iSD9x5Q1KokZ4P6BC 0qXNAwA/kYGquy8AAD+imuIOTvsAP5NUpOQ/eAA/kQtugOBEAD+RMeghdl4AP7Jn3QxtCsA/ - tE9QFCi4gD+bSeGjWlIAP5FVoLo7XwA/kbVn03S8AD+Ui/4kS28AP5F+F0mQ2wA/mP14d7y0 - AD+pRBELXxKAP7xVzQoMgYA/6p0tx9H/CD/kuDZpjnkQP7DPsKVRKQA/5Y1g8Rh9sD+m1FBE + tE9QFCi4gD+bSeGjWlEAP5FVoLo7XwA/kbVn03S8AD+Ui/4kS28AP5F+F0mQ2wA/mP14d7y0 + AD+pRBELXxOAP7xVzQoMgYA/6p0tx9H/AD/kuDZpjnkQP7DPsKVRKQA/5Y1g8Rh9sD+m1FBE ZMYAP6LmZaqjsIA/rvRkyt/1gD+wN1KCt4TAP6FZe4TNQ4A/lxkATIN0AD+UCq5AO8gAP6ku - qlvkC4A/u837aXIIAD+joD1aCLQAP6afX0AzCAA/khQ6CvIGAEAS+fJnaLHwQBEP7abtCtBA + qlvkDgA/u837aXIIAD+joD1aCLQAP6afX0AzCAA/khQ6CvIIAEAS+fJnaLHwQBEP7abtCtBA ErJiAQQUDkARa6HWxx3sQBC19bRGOqZAEOVsk57EhEAXXE6oz8X6QBhOMwo4HzhAExWJEz7w - YEAQvCuQLdoaQBEz+EIq/LBAERqnVc5fZEAQuMi8GPlmQBJBKzhjQ9xAEe5YKyVuTkATlXoq - YwIGQCIRlu/H129AI/WagapcWkASpJaoNucYQCLYLgxdc6dAE7rSrp1oIEAUf9XEej+SQBa4 - oZCQGv5AFf5oQger6kATXWMppHVuQBEJEltQ9nJAEOaxVhD86kAR7SPP8uDQQBnDIjSfxUJA - FAp5a+6r9kAVJHaV7SH8QBEph84QBEw/qQv9LeIWyb+uWWyqUA9QP7GREVrajU6/qkhYWDHd - rL+9tsdoV7uMv7fBKJv14eU/t7YcNUPKsT/aTE3zYuljP6G3mVtQiba/vp1jR9XXbb+gLW87 - Hw2mv7FTEHhMvSa/sxFLk3EUuz+5A6XuSljYP9idj04idy4/0bL+IIz1RT/e15NGvsbPP9VS - jdQVlsE/2LoPMwLTfD/U+wQrF7IQP6aKY3Lc//Q/zQFaAA69qT+4Rrgot4/8P8jynnS82Ey/ - nP2U9Wg1Vj+9sDYbFcUYP4hK+zwAbv4/1HzUh2soqj/D6wM3SdjbP7XPnkuMip4/xNfwCK9V - Jj+1oylZVifyAAAEAgAAAf8AAAANAAAAAgAAACAAAAAFAAAEAgAAAv8AAAATAAAAAgAAAP4A + XkAQvCuQLdoaQBEz+EIq/LBAERqnVc5fZEAQuMi8GPlmQBJBKzhjQ9xAEe5YKyVuUkATlXoq + YwIGQCIRlu/H125AI/WagapcWkASpJaoNucYQCLYLgxdc6dAE7rSrp1oIEAUf9XEej+SQBa4 + oZCQGv5AFf5oQger6kATXWMppHVuQBEJEltQ9nJAEOaxVhD86kAR7SPP8uDaQBnDIjSfxUJA + FAp5a+6r9kAVJHaV7SH8QBEph84QBFA/qQv9LeIWWL+uWWyqUBB0P7GREVrajQO/qkhYWDHd + p7+9tsdoV7yPv7fBKJv14XY/t7YcNUPKlD/aTE3zYulDP6G3mVtQig2/vp1jR9XX0b+gLW87 + Hw2jv7FTEHhMvT2/sxFLk3EU8D+5A6XuSlkgP9idj04idtI/0bL+IIz1NT/e15NGvsZcP9VS + jdQVlrw/2LoPMwLTbj/U+wQrF7IVP6aKY3LdABo/zQFaAA69lT+4Rrgot4+MP8jynnS82GS/ + nP2U9Wg06j+9sDYbFcUWP4hK+zwAbhI/1HzUh2soVD/D6wM3SdkcP7XPnkuMis0/xNfwCK9U + 2T+1oylZVifvAAAEAgAAAf8AAAANAAAAAgAAACAAAAAFAAAEAgAAAv8AAAATAAAAAgAAAP4A AAAQAAAABQAEAAkAAAAIZWxwZF9sb28ABAAJAAAADW1jc2VfZWxwZF9sb28ABAAJAAAABXBf bG9vAAQACQAAAAVsb29pYwAEAAkAAAASaW5mbHVlbmNlX3BhcmV0b19rAAAA/gAAAhMAAAAD - AAAADgAAACA/qQv9LeIWyb+uWWyqUA9QP7GREVrajU6/qkhYWDHdrL+9tsdoV7uMv7fBKJv1 - 4eU/t7YcNUPKsT/aTE3zYuljP6G3mVtQiba/vp1jR9XXbb+gLW87Hw2mv7FTEHhMvSa/sxFL - k3EUuz+5A6XuSljYP9idj04idy4/0bL+IIz1RT/e15NGvsbPP9VSjdQVlsE/2LoPMwLTfD/U - +wQrF7IQP6aKY3Lc//Q/zQFaAA69qT+4Rrgot4/8P8jynnS82Ey/nP2U9Wg1Vj+9sDYbFcUY - P4hK+zwAbv4/1HzUh2soqj/D6wM3SdjbP7XPnkuMip4/xNfwCK9VJj+1oylZVifyAAAADgAA - ACBAjGtR4vOUSECNSIXMZdptQI1OzHDwC35AjCl3nEfMhECMV+BQfP7MQItWfnwINa1Aipoa - +DhXG0CJ4jnKlWa8QIp1d1J1ZG1AjUahmmkFkkCL5UfgNu4DQI2abt7PHcNAi6rgACII5UCH - F8533XLIQIiixsC750JAiRWGP5WHBUBkAZfpVM1eQHdzaroEDUlAhbbxJuuvP0B2G3GedgHa + AAAADgAAACA/qQv9LeIWWL+uWWyqUBB0P7GREVrajQO/qkhYWDHdp7+9tsdoV7yPv7fBKJv1 + 4XY/t7YcNUPKlD/aTE3zYulDP6G3mVtQig2/vp1jR9XX0b+gLW87Hw2jv7FTEHhMvT2/sxFL + k3EU8D+5A6XuSlkgP9idj04idtI/0bL+IIz1NT/e15NGvsZcP9VSjdQVlrw/2LoPMwLTbj/U + +wQrF7IVP6aKY3LdABo/zQFaAA69lT+4Rrgot4+MP8jynnS82GS/nP2U9Wg06j+9sDYbFcUW + P4hK+zwAbhI/1HzUh2soVD/D6wM3SdkcP7XPnkuMis0/xNfwCK9U2T+1oylZVifvAAAADgAA + ACBAjGtR4vOUSECNSIXMZdptQI1OzHDwC35AjCl3nEfMgkCMV+BQfP7EQItWfnwINa1Aipoa + +DhXG0CJ4jnKlWa8QIp1d1J1ZHBAjUahmmkFkkCL5UfgNu4DQI2abt7PHcNAi6rgACII5UCH + F8533XLHQIiixsC750RAiRWGP5WHBUBkAZfpVM1YQHdzaroEDU5AhbbxJuuvP0B2G3GedgHa QIurDxL70Z1AimK5GPqZQ0CKb2AkeCpMQIqX6UZQkVlAjPSaSuWUT0CGV1Ue/HaIQIgmSdq8 - SDtAhKKy2922NECIf2FmaiksQIrvm5gPU99AioGA+OUNHUCNhdwLjHhJAAAADgAAACA/7iIj - iprtpD/uhgpFxKUzP+9C2+HteTo/7WeuRVdLxz/thpx9PZyZP+x8NVYA5gc/7bgj+feDuD/t - VJY/ijqGP+vk6tOVjqc/7oFMRnLCAz/tFUhNscONP+71S5Zdfro/7NXCRLX3pz/oSat+OPKu - P+rzPc6+kK4/7rBYQ1wZPj/h+KYWRg06P/BP+OJ7Frg/6GyNUDLmvD/v8/5XaLpuP+3Om11x + SDtAhKKy2922MkCIf2FmaiksQIrvm5gPU99AioGA+OUNHUCNhdwLjHhJAAAADgAAACA/7iIj + iprtpD/uhgpFxKUzP+9C2+HteTo/7WeuRVdLxz/thpx9PZyRP+x8NVYA5gc/7bgj+feDuD/t + VJY/ijqGP+vk6tOVjqo/7oFMRnLCAz/tFUhNscONP+71S5Zdfro/7NXCRLX3pz/oSat+OPKu + P+rzPc6+kK4/7rBYQ1wZPj/h+KYWRg06P/BP+OJ7Frs/6GyNUDLmvD/v8/5XaLpuP+3Om11x ZMc/7CaGH4VipT/tF1NNe4n9P+1mF6TWIdc/7s96p0MNpT/ndID3UIhJP+lAQASVPCI/5qSP - laOkhD/s9t8Xe5GFP+zJwNIA1oY/7It81CYJlj/uynC4minDAAAEAgAAAAEABAAJAAAABW5h + laOkgj/s9t8Xe5GFP+zJwNIA1oY/7It81CYJlj/uynC4minDAAAEAgAAAAEABAAJAAAABW5h bWVzAAAAEAAAAAMABAAJAAAACHBhcmV0b19rAAQACQAAAAVuX2VmZgAEAAkAAAAFcl9lZmYA - AAD+AAAA/gAAAA4AAAABwFTltnf8GWYAAAAOAAAAAUAKoXOoMJryAAAADgAAAAFAZOW2d/wZ - ZgAAAA4AAAABQBEipbqh/pIAAAAOAAAAAT/ybwOiT5GIAAAADgAAAAFAISKluqH+kgAABAIA + AAD+AAAA/gAAAA4AAAABwFTltnf8GWgAAAAOAAAAAUAKoXOoMJsAAAAADgAAAAFAZOW2d/wZ + aAAAAA4AAAABQBEipbqh/pAAAAAOAAAAAT/ybwOiT5GDAAAADgAAAAFAISKluqH+kAAABAIA AAP/AAAAEAAAAAoABAAJAAAACWVzdGltYXRlcwAEAAkAAAAJcG9pbnR3aXNlAAQACQAAAAtk aWFnbm9zdGljcwAEAAkAAAALcHNpc19vYmplY3QABAAJAAAACGVscGRfbG9vAAQACQAAAAVw X2xvbwAEAAkAAAAFbG9vaWMABAAJAAAAC3NlX2VscGRfbG9vAAQACQAAAAhzZV9wX2xvbwAE @@ -58,7 +58,7 @@ --- - WAoAAAACAAQFAAACAwAAAAMTAAAACAAAAg4AAAAGwFTh8N3JQlhACijAYdW5U0Bk4fDdyUJY + WAoAAAACAAQDAwACAwAAAAMTAAAACAAAAg4AAAAGwFTh8N3JQlhACijAYdW5U0Bk4fDdyUJY QBEIPbMRcF8/8f1l7HLzXkAhCD2zEXBfAAAEAgAAAAEABAAJAAAAA2RpbQAAAA0AAAACAAAA AwAAAAIAAAQCAAAAAQAEAAkAAAAIZGltbmFtZXMAAAATAAAAAgAAABAAAAADAAQACQAAAAll bHBkX3dhaWMABAAJAAAABnBfd2FpYwAEAAkAAAAEd2FpYwAAABAAAAACAAQACQAAAAhFc3Rp @@ -88,7 +88,7 @@ --- - WAoAAAACAAQFAAACAwAAAAMTAAAAAgAAAg4AAAAEwFQQqtq6lJBAZBCq2rqUkEAJ4d/NRDUI + WAoAAAACAAQDAwACAwAAAAMTAAAAAgAAAg4AAAAEwFQQqtq6lJBAZBCq2rqUkEAJ4d/NRDUI QBnh381ENQgAAAQCAAAAAQAEAAkAAAADZGltAAAADQAAAAIAAAACAAAAAgAABAIAAAABAAQA CQAAAAhkaW1uYW1lcwAAABMAAAACAAAAEAAAAAIABAAJAAAABGVscGQABAAJAAAAAmljAAAA EAAAAAIABAAJAAAACEVzdGltYXRlAAQACQAAAAJTRQAAAP4AAAIOAAAAQMACuOcc0X3kwADs diff --git a/tests/testthat/_snaps/print.md b/tests/testthat/_snaps/print.md new file mode 100644 index 00000000..2c3cc8f5 --- /dev/null +++ b/tests/testthat/_snaps/print.md @@ -0,0 +1,288 @@ +# loo_pred_measure print snapshots + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + mae_loo 24.6 2.3 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + mae_loo 24.6 2.3 + rps_loo 22.0 2.3 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + mae_loo 24.6 2.3 + rps_loo 22.0 2.3 + crps_loo 21.3 2.2 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + mae_loo 24.6 2.3 + rps_loo 22.0 2.3 + crps_loo 21.3 2.2 + srps_loo -5.6 0.6 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + mae_loo 24.6 2.3 + rps_loo 22.0 2.3 + crps_loo 21.3 2.2 + srps_loo -5.6 0.6 + scrps_loo -4.5 0.3 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + mse_loo 1991.3 413.8 + mae_loo 24.6 2.3 + rps_loo 22.0 2.3 + crps_loo 21.3 2.2 + srps_loo -5.6 0.6 + scrps_loo -4.5 0.3 + mlpd_loo -20.8 2.6 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + r2_loo 0.2 0.8 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + crps_loo 21.3 2.2 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + crps_loo 21.3 2.2 + mse_loo 1991.3 413.8 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + crps_loo 21.3 2.2 + mse_loo 1991.3 413.8 + scrps_loo -4.5 0.3 + +--- + + + Computed from 400 posterior draws and 262 observations. + Data source: loo + + Estimate SE + elpd_loo -5440.7 690.2 + p_loo 225.4 45.5 + looic 10881.4 1380.3 + mlpd_loo -20.8 2.6 + mae_loo 24.6 2.3 + srps_loo -5.6 0.6 + r2_loo 0.2 0.8 + rmse_loo 44.6 4.6 + crps_loo 21.3 2.2 + mse_loo 1991.3 413.8 + scrps_loo -4.5 0.3 + rps_loo 22.0 2.3 + diff --git a/tests/testthat/data-for-tests/generate-data.R b/tests/testthat/data-for-tests/generate-data.R new file mode 100644 index 00000000..16fd3e0b --- /dev/null +++ b/tests/testthat/data-for-tests/generate-data.R @@ -0,0 +1,242 @@ +# this file includes a data-generation pipeline for test data specifically +# used in the testthat tests for pred_measure* files and for +# vignettes in articles-online-only +library(rstanarm) +SEED <- 42 + +postprocess_res <- function(model, fit, chains = 2, draws = 200) { + ypred <- brms::posterior_predict(fit) + mupred <- brms::posterior_epred(fit) + ylp <- log_lik(fit) + log_ratios <- -1 * ylp + r_eff <- relative_eff(exp(-log_ratios), chain_id = rep(1:chains, each = draws)) + psis_object <- psis(log_ratios, r_eff = r_eff, cores = 2) + kfold2 <- brms::kfold(fit, save_fits = FALSE) + if (model %in% c("roaches", "binary", "binomial", "sleep")) { + kfold <- brms::kfold(fit, save_fits = TRUE) + mupred_kfold <- brms::kfold_predict(kfold, method = "fitted")$yrep + ypred_kfold <- brms::kfold_predict(kfold, method = "predict")$yrep + loo <- brms::loo(fit, save_psis = TRUE) + mupred_loo <- loo::E_loo(ypred, psis_object, type = "mean")$value + predperf <- insample_pred_measure(y = fit$data$y, mupred = mupred, + measure = "r2", ylp = ylp) + } + + if (model == "roaches") { + list( + y = fit$data$y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + log_weights = psis_object$log_weights, + kfold = kfold2, + loo = loo, + predperf = predperf + ) + } else if (model == "binomial") { + list( + y = fit$data$y, + ypred = ypred, + log_weights = psis_object$log_weights, + ypred_kfold = ypred_kfold + ) + } else if (model == "binary") { + list( + y = fit$data$y, + ypred = ypred, + log_weights = psis_object$log_weights, + ypred_kfold = ypred_kfold + ) + } else if (model == "categorical") { + list( + y = fit$data$y, + mupred = mupred, + log_weights = psis_object$log_weights + ) + } else if (model == "sleep") { + list( + y = fit$data$y, + ypred = ypred, + log_weights = psis_object$log_weights, + ypred_kfold = ypred_kfold, + mupred = mupred, + mupred_loo = mupred_loo, + mupred_kfold = mupred_kfold + ) + } +} + +get_binary_res <- function() { + set.seed(SEED) + df_binary <- data.frame(y = rbinom(50, 1, 0.3)) + + fit_binary <- brms::brm(formula = "y ~ 1", + data = df_binary, + family = bernoulli, + chains = 2, + iter = 400, + seed = SEED, + refresh = 0 + ) + list( + fit = fit_binary, + res = postprocess_res("binary", fit_binary) + ) +} + +get_roaches_res <- function() { + data(roaches, package = "rstanarm") + roaches$sqrt_roach1 <- sqrt(roaches$roach1) + + fit_roaches <- brm( + y ~ sqrt_roach1 + treatment + senior + offset(log(exposure2)), + data = roaches, + family = poisson, + prior = prior(normal(0, 1), class = b), + chains = 2, + iter = 400, + refresh = 0, + seed = SEED + ) + list( + fit = fit_roaches, + res = postprocess_res("roaches", fit_roaches) + ) +} + +get_sleep_test_train_res <- function() { + # specifically for testing test_pred_measure + data("sleepstudy") + conditions <- make_conditions(sleepstudy, "Subject", incl_vars = FALSE) + sleepstudy <- sleepstudy |> + filter(Days >= 2) |> + mutate( + Days = Days - 2, + y = Reaction + ) + + test_subjects <- sample(unique(sleepstudy$Subject), size = 5) + train_data <- sleepstudy |> + dplyr::filter(!Subject %in% test_subjects) + test_data <- sleepstudy |> + dplyr::filter(Subject %in% test_subjects) + + prior_lin_base <- prior(normal(200, 100), class = b, coef = "Intercept") + + prior(normal(0, 20), class = b, coef = "Days") + + prior(exponential(0.02), class = sigma) + + fit_sleep_train <- brm( + y ~ 0 + Intercept + Days, + data = train_data, + family = gaussian(), + prior = prior_lin_base, + chains = 2, + iter = 400, + seed = SEED + ) + + list( + fit = fit_sleep_train, + res = list( + y_test = test_data$y, + ypred_test = brms::posterior_predict(fit_sleep_train, newdata = test_data), + mupred_test = brms::posterior_epred(fit_sleep_train, newdata = test_data), + ylp_test = brms::log_lik(fit_sleep_train, newdata = test_data), + ylp_train = brms::log_lik(fit_sleep_train) + ) + ) +} + +get_sleep_res <- function() { + data("sleepstudy") + conditions <- make_conditions(sleepstudy, "Subject", incl_vars = FALSE) + sleepstudy <- sleepstudy |> + dplyr::filter(Days >= 2) |> + dplyr::mutate(Days = Days - 2, y = Reaction) + + prior_lin_base <- prior(normal(200, 100), class = b, coef = "Intercept") + + prior(normal(0, 20), class = b, coef = "Days") + + prior(exponential(0.02), class = sigma) + + fit_sleepstudy <- brm( + y ~ 0 + Intercept + Days, + data = sleepstudy, + family = gaussian(), + prior = prior_lin_base, + chains = 2, + iter = 400, + refresh = 0, + seed = SEED + ) + list( + fit = fit_sleepstudy, + res = postprocess_res("sleep", fit_sleepstudy) + ) +} + +get_penguins_res <- function() { + data("penguins", package = "palmerpenguins") + penguins <- subset(penguins, complete.cases(penguins)) + penguins$y <- penguins$species + + fit <- brm( + y ~ bill_length_mm + bill_depth_mm, + data = penguins, + family = categorical(), + chains = 2, + iter = 400, + cores = 2, + seed = SEED + ) + list( + fit = fit, + res = postprocess_res("categorical", fit) + ) +} + +get_binomial_res <- function() { + set.seed(SEED) + df <- data.frame( + y = rbinom(50, 10, 0.3), + n = 10 + ) + + fit <- brms::brm(formula = "y | trials(n) ~ 1", + data = df, + family = binomial, + chains = 2, + iter = 400, + seed = SEED, + refresh = 0 + ) + list( + fit = fit, + res = postprocess_res("binomial", fit) + ) +} + +full_roaches <- get_roaches_res() +full_binary <- get_binary_res() +full_penguins <- get_penguins_res() +full_binomial <- get_binomial_res() +full_sleep <- get_sleep_res() +full_sleep_test <- get_sleep_test_train_res() + +# for tests -------------------------------------------------- + +test_path <- "tests/testthat/data-for-tests/" +saveRDS(full_roaches$res, paste0(test_path, "res_roaches.Rds")) +saveRDS(full_binary$res, paste0(test_path, "res_binary.Rds")) +saveRDS(full_penguins$res, paste0(test_path, "res_penguins.Rds")) +saveRDS(full_binomial$res, paste0(test_path, "res_binomial.Rds")) +saveRDS(full_sleep$res, paste0(test_path, "res_sleep.Rds")) +saveRDS(full_sleep_test$res, paste0(test_path, "res_sleep_test.Rds")) + +# for vignettes -------------------------------------------------- + +vignette_path <- "vignettes/articles-online-only/data-for-vignettes/" +saveRDS(full_roaches$fit, paste0(vignette_path, "fit_roaches.Rds")) +saveRDS(full_binary$fit, paste0(vignette_path, "fit_binary.Rds")) +saveRDS(full_penguins$fit, paste0(vignette_path, "fit_penguins.Rds")) +saveRDS(full_binomial$fit, paste0(vignette_path, "fit_binomial.Rds")) +saveRDS(full_sleep$fit, paste0(vignette_path, "fit_sleep.Rds")) \ No newline at end of file diff --git a/tests/testthat/data-for-tests/res_binary.Rds b/tests/testthat/data-for-tests/res_binary.Rds new file mode 100644 index 00000000..0f55a5ed Binary files /dev/null and b/tests/testthat/data-for-tests/res_binary.Rds differ diff --git a/tests/testthat/data-for-tests/res_binomial.Rds b/tests/testthat/data-for-tests/res_binomial.Rds new file mode 100644 index 00000000..d90c6c81 Binary files /dev/null and b/tests/testthat/data-for-tests/res_binomial.Rds differ diff --git a/tests/testthat/data-for-tests/res_penguins.Rds b/tests/testthat/data-for-tests/res_penguins.Rds new file mode 100644 index 00000000..d0b8e299 Binary files /dev/null and b/tests/testthat/data-for-tests/res_penguins.Rds differ diff --git a/tests/testthat/data-for-tests/res_roaches.Rds b/tests/testthat/data-for-tests/res_roaches.Rds new file mode 100644 index 00000000..869fab16 Binary files /dev/null and b/tests/testthat/data-for-tests/res_roaches.Rds differ diff --git a/tests/testthat/data-for-tests/res_sleep.Rds b/tests/testthat/data-for-tests/res_sleep.Rds new file mode 100644 index 00000000..b987ee69 Binary files /dev/null and b/tests/testthat/data-for-tests/res_sleep.Rds differ diff --git a/tests/testthat/data-for-tests/res_sleep_test.Rds b/tests/testthat/data-for-tests/res_sleep_test.Rds new file mode 100644 index 00000000..55fd89a5 Binary files /dev/null and b/tests/testthat/data-for-tests/res_sleep_test.Rds differ diff --git a/tests/testthat/test_crps.R b/tests/testthat/test_crps.R deleted file mode 100644 index ce523476..00000000 --- a/tests/testthat/test_crps.R +++ /dev/null @@ -1,58 +0,0 @@ -set.seed(123456789) -n <- 10 -S <- 100 -y <- rnorm(n) -x1 <- matrix(rnorm(n * S), nrow = S) -x2 <- matrix(rnorm(n * S), nrow = S) -ll <- matrix(rnorm(n * S) * 0.1 - 1, nrow = S) - -with_seed <- function(seed, code) { - code <- substitute(code) - orig.seed <- .Random.seed - on.exit(.Random.seed <<- orig.seed) - set.seed(seed) - eval.parent(code) -} - -test_that("crps computation is correct", { - expect_equal(.crps_fun(2.0, 1.0), 0.0) - expect_equal(.crps_fun(1.0, 2.0), -1.5) - expect_equal(.crps_fun(pi, pi^2), 0.5 * pi - pi^2) - - expect_equal(.crps_fun(1.0, 0.0, scale = TRUE), 0.0) - expect_equal(.crps_fun(1.0, 2.0, scale = TRUE), -2.0) - expect_equal(.crps_fun(pi, pi^2, scale = TRUE), -pi^2/pi - 0.5 * log(pi)) -}) - -test_that("crps matches snapshots", { - expect_snapshot_value(with_seed(1, crps(x1, x2, y)), style = "serialize") - expect_snapshot_value(with_seed(1, scrps(x1, x2, y)), style = "serialize") - expect_snapshot_value(with_seed(1, loo_crps(x1, x2, y, ll)), style = "serialize") - expect_snapshot_value(with_seed(1, loo_scrps(x1, x2, y, ll)), style = "serialize") -}) - -test_that("input validation throws correct errors", { - expect_error(validate_crps_input(as.character(x1), x2, y), - "is.numeric(x) is not TRUE", - fixed = TRUE) - expect_error(validate_crps_input(x1, as.character(x2), y), - "is.numeric(x2) is not TRUE", - fixed = TRUE) - expect_error(validate_crps_input(x1, x2, c('a', 'b')), - "is.numeric(y) is not TRUE", - fixed = TRUE) - expect_error(validate_crps_input(x1, t(x2), y), - "identical(dim(x), dim(x2)) is not TRUE", - fixed = TRUE) - expect_error(validate_crps_input(x1, x2, c(1, 2)), - "ncol(x) == length(y) is not TRUE", - fixed = TRUE) - expect_error(validate_crps_input(x1, x2, y, t(ll)), - "ifelse(is.null(log_lik), TRUE, identical(dim(log_lik), dim(x))) is not TRUE", - fixed = TRUE) -}) - -test_that("methods for single data point don't error", { - expect_silent(crps(x1[,1], x2[,1], y[1])) - expect_silent(scrps(x1[,1], x2[,1], y[1])) -}) diff --git a/tests/testthat/test_loo_and_waic.R b/tests/testthat/test_loo_and_waic.R index 5cdc823e..232f74c6 100644 --- a/tests/testthat/test_loo_and_waic.R +++ b/tests/testthat/test_loo_and_waic.R @@ -10,7 +10,7 @@ r_eff_mat <- relative_eff(exp(LLmat), chain_id = chain_id) loo1 <- suppressWarnings(loo(LLarr, r_eff = r_eff_arr)) waic1 <- suppressWarnings(waic(LLarr)) -elpd1 <- suppressWarnings(elpd(LLarr)) +elpd1 <- suppressWarnings(elpd(llarray_to_matrix(LLarr))) test_that("using loo.cores is deprecated", { options(mc.cores = NULL) @@ -135,8 +135,8 @@ test_that("waic.array and waic.matrix give same result", { }) test_that("elpd.array and elpd.matrix give same result", { - elpd2 <- suppressWarnings(elpd(LLmat)) - expect_identical(elpd1, elpd2) + elpd <- suppressWarnings(elpd(LLmat)) + expect_identical(elpd1, elpd) }) test_that("loo, waic, and elpd error with vector input", { diff --git a/tests/testthat/test_measures.R b/tests/testthat/test_measures.R new file mode 100644 index 00000000..36406dd3 --- /dev/null +++ b/tests/testthat/test_measures.R @@ -0,0 +1,307 @@ +# load test data -------------------------------------- +res_roaches <- readRDS("data-for-tests/res_roaches.Rds") +res_sleep <- readRDS("data-for-tests/res_sleep.Rds") +res_binom <- readRDS("data-for-tests/res_binomial.Rds") +res_binary <- readRDS("data-for-tests/res_binary.Rds") +res_cat <- readRDS("data-for-tests/res_penguins.Rds") + +# ptw_log_pred_density ------------------------ +testthat::test_that("ptw_log_pred_density() works as expected", { + res <- ptw_log_pred_density(ylp = res_roaches$ylp, psis_log_weights = NULL) + + expect_equal(length(res), dim(res_roaches$ylp)[2]) + expect_equal(res, matrixStats::colLogSumExps(res_roaches$ylp) - log(dim(res_roaches$ylp)[1])) +}) + +testthat::test_that("ptw_log_pred_density() with psis_log_weights works as expected", { + norm_log_weights <- .normalize_log_weights(res_roaches$log_weights) + res <- ptw_log_pred_density( + ylp = res_roaches$ylp, + psis_log_weights = norm_log_weights + ) + + expect_equal(length(res), dim(res_roaches$ylp)[2]) + expect_equal(res, matrixStats::colLogSumExps(res_roaches$ylp + norm_log_weights) + ) +}) + +testthat::test_that("ptw_log_pred_density() returns error when weights are not normalized", { + expect_error( + ptw_log_pred_density( + ylp = res_roaches$ylp, + psis_log_weights = res_roaches$log_weights + ), + regexp = "Range of current column sums" + ) +}) + +# elpd() ----------------------------------- + +testthat::test_that("elpd() works as expected", { + res <- elpd(ylp = res_roaches$ylp, log_weights = NULL) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates[1]), 1) + expect_equal(length(res$estimates[2]), 1) + expect_equal(length(res$pointwise), dim(res_roaches$ylp)[2]) +}) + +testthat::test_that("elpd() with unnormalized log-weights works as expected", { + log_weights <- res_roaches$log_weights + res <- elpd(ylp = res_roaches$ylp, log_weights = log_weights) + + expect_false(all(.normalize_log_weights(log_weights) == log_weights)) + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates[1]), 1) + expect_equal(length(res$estimates[2]), 1) + expect_equal(length(res$pointwise), dim(res_roaches$ylp)[2]) +}) + +testthat::test_that("elpd() with normalized log-weights works as expected", { + res <- elpd(ylp = res_roaches$ylp, log_weights = res_roaches$predperf_loo$log_weights) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates[1]), 1) + expect_equal(length(res$estimates[2]), 1) + expect_equal(length(res$pointwise), dim(res_roaches$ylp)[2]) +}) + +# mlpd() ----------------------------------- + +testthat::test_that("mlpd() works as expected", { + res <- mlpd(ylp = res_roaches$ylp, log_weights = NULL) + res_elpd <- elpd(ylp = res_roaches$ylp, log_weights = NULL) + n_obs <- dim(res_roaches$ylp)[2] + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates[1]), 1) + expect_equal(length(res$estimates[2]), 1) + expect_equal(length(res$pointwise), n_obs) + expect_equal(res$estimates, res_elpd$estimates / n_obs) + expect_equal(res$pointwise, res_elpd$pointwise) +}) + +# rps() ------------------------------------- + +testthat::test_that("rps() with ordered categorial data works as expected", { + res <- rps( + y = res_binom$y, + ypred = res_binom$ypred + ) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_binom$y)) +}) + +testthat::test_that("rps() scaled version with categorical data works as expected", { + res <- rps( + y = res_binom$y, + ypred = res_binom$ypred, + scaled = TRUE + ) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_binom$y)) + expect_true(all(res$pointwise < 0)) +}) + +testthat::test_that("rps() for categorical data with log-weights works as expected", { + res <- rps( + y = res_binom$y, + ypred = res_binom$ypred, + log_weights = res_binom$log_weights + ) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_binary$y)) + expect_true(all(res$pointwise >= 0)) +}) + +testthat::test_that("rps() with continuous data works as expected", { + res <- rps(res_sleep$y, res_sleep$ypred) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_sleep$y)) +}) + +testthat::test_that("rps() with continuous data and log-weights works as expected", { + res <- rps( + res_sleep$y, + res_sleep$ypred, + log_weights = res_sleep$log_weights + ) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_sleep$y)) +}) + +testthat::test_that("rps() with continuous data and scaled version works as expected", { + res <- rps(res_sleep$y, res_sleep$ypred, scaled = TRUE) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_sleep$y)) + expect_true(all(res$pointwise < 0)) +}) + + +# brier() --------------------------------------- + +testthat::test_that("brier() works as expected", { + res_brier <- brier(y = res_binary$y, ypred = res_binary$ypred, log_weights = NULL) + + expect_equal(names(res_brier), c("estimates", "pointwise")) + expect_equal(length(res_brier$estimates), 2) + expect_equal(length(res_brier$pointwise), length(res_binary$y)) + expect_true(all(res_brier$pointwise >= 0 & res_brier$pointwise <= 1)) +}) + +testthat::test_that("brier() with log-weights works as expected", { + res_brier <- brier( + y = res_binary$y, + ypred = res_binary$ypred, + log_weights = res_binary$log_weights + ) + + expect_equal(names(res_brier), c("estimates", "pointwise")) + expect_equal(length(res_brier$estimates), 2) + expect_equal(length(res_brier$pointwise), length(res_binary$y)) + expect_true(all(res_brier$pointwise >= 0 & res_brier$pointwise <= 1)) + + res_brier2 <- brier( + y = res_binary$y, + ypred = res_binary$ypred, + log_weights = .normalize_log_weights(res_binary$log_weights) + ) + expect_equal(res_brier2$pointwise, res_brier$pointwise) +}) + +testthat::test_that("brier() errors when y not binary", { + expect_error( + brier( + y = res_binom$y, + ypred = res_binary$ypred, + log_weights = res_binary$log_weights + ), + regexp = "The brier score expects binary data 'y'." + ) +}) + +# mae() ------------------------------------------------ +testthat::test_that("mae() works as expected", { + res <- mae(y = res_roaches$y, mupred = res_roaches$mupred, + log_weights = NULL) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_roaches$y)) +}) + +testthat::test_that("mae() with log_weights works as expected", { + res <- mae(y = res_roaches$y, mupred = res_roaches$mupred, + log_weights = res_roaches$loo1$psis_object$log_weights) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_roaches$y)) +}) + +# rmse() / mse() ----------------------------------------- +testthat::test_that("mse() and rmse() work as expected", { + res_mse <- mse(y = res_roaches$y, mupred = res_roaches$mupred, + log_weights = NULL) + + res_rmse <- rmse(y = res_roaches$y, mupred = res_roaches$mupred, + log_weights = NULL) + + expect_equal(names(res_mse), c("estimates", "pointwise")) + expect_equal(length(res_mse$estimates), 2) + expect_equal(length(res_mse$pointwise), length(res_roaches$y)) + + expect_equal(sqrt(abs(res_mse$estimates[1])), res_rmse$estimates[1]) + expect_equal(res_mse$estimates[2]/(2*sqrt(abs(res_mse$estimates[1]))), res_rmse$estimates[2]) +}) + +testthat::test_that("rmse() works with se=0", { + mupred0 <- t(replicate(4000, res_roaches$y)) + + res_rmse <- rmse(y = res_roaches$y, mupred = mupred0, + log_weights = NULL) + + expect_equal(names(res_rmse), c("estimates", "pointwise")) + expect_equal(length(res_rmse$estimates), 2) + expect_equal(length(res_rmse$pointwise), length(res_roaches$y)) + expect_equal(unname(res_rmse$estimates[2]), 0) +}) + +# r2() --------------------------------------------------- +testthat::test_that("r2() works as expected", { + res <- r2(y = res_roaches$y, mupred = res_roaches$mupred, log_weights = NULL) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_roaches$y)) + expect_true(all(res$estimates[1] >= 0 & res$estimates[1] <= 1)) +}) + +testthat::test_that("r2() with log_weights works as expected", { + res <- r2(y = res_roaches$y, mupred = res_roaches$mupred, + log_weights = res_roaches$loo1$psis_object$log_weights) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_roaches$y)) + expect_true(all(res$estimates[1] >= 0 & res$estimates[1] <= 1)) +}) + +# acc() / bacc() ----------------------------------------------------- +testthat::skip_if_not_installed("brms") + +testthat::test_that("acc() works as expected", { + res <- acc(y = as.integer(res_cat$y), mupred = res_cat$mupred, log_weights = NULL) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_cat$y)) + expect_true(all(res$pointwise >= 0 & res$pointwise <= 1)) +}) + +testthat::test_that("acc() with log-weights works as expected", { + res <- acc( + y = as.integer(res_cat$y), + mupred = res_cat$mupred, + log_weights = res_cat$loo$psis_object$log_weights + ) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_cat$y)) + expect_true(!all(res$pointwise < 0 | res$pointwise > 1)) +}) + +testthat::test_that("bacc() works as expected", { + res <- bacc(y = as.integer(res_cat$y), mupred = res_cat$mupred, log_weights = NULL) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_cat$y)) + expect_true(!all(res$pointwise < 0 | res$pointwise > 1)) +}) + +testthat::test_that("bacc() with log-weights works as expected", { + res <- bacc( + y = as.integer(res_cat$y), + mupred = res_cat$mupred, + log_weights = res_cat$loo$psis_object$log_weights + ) + + expect_equal(names(res), c("estimates", "pointwise")) + expect_equal(length(res$estimates), 2) + expect_equal(length(res$pointwise), length(res_cat$y)) + expect_true(!all(res$pointwise < 0 | res$pointwise > 1)) +}) diff --git a/tests/testthat/test_pred_measure.R b/tests/testthat/test_pred_measure.R new file mode 100644 index 00000000..ed971fdf --- /dev/null +++ b/tests/testthat/test_pred_measure.R @@ -0,0 +1,382 @@ +# load data ----------------------------- +res <- readRDS("data-for-tests/res_roaches.Rds") + +y <- res$y +ypred <- res$ypred +mupred <- res$mupred +ylp <- res$ylp +kfold1 <- res$kfold +loo1 <- res$loo +predperf <- res$predperf + +# unit tests ---------------------- +## .compute_measure() -------------------- + +.builtin_entry <- function(name) { + list(name = name, type = "builtin", key = name) +} + +test_that(".compute_measure() with r2 works as expected", { + measure_res <- .compute_measure( + y = y, + ypred = NULL, + mupred = mupred, + ylp = ylp, + measure_entry = .builtin_entry("r2"), + log_weights = NULL + ) + + expect_equal(names(measure_res), c("estimates", "pointwise")) +}) + +test_that(".compute_measure() with crps works as expected", { + measure_res <- .compute_measure( + y = y, + ypred = ypred, + mupred = NULL, + ylp = ylp, + measure_entry = .builtin_entry("crps"), + log_weights = NULL + ) + + expect_equal(names(measure_res), c("estimates", "pointwise")) + expect_equal(names(measure_res$estimates), c("Estimate", "SE")) +}) + +test_that(".compute_measure() with elpd works as expected", { + measure_res <- .compute_measure( + y = NULL, + ypred = NULL, + mupred = NULL, + ylp = ylp, + measure_entry = .builtin_entry("elpd"), + log_weights = NULL + ) + + expect_equal(names(measure_res), c("estimates", "pointwise")) +}) + +test_that(".compute_measure() fails if insufficient input is provided", { + expect_error( + .compute_measure( + y = y, + ypred = ypred, + mupred = NULL, + ylp = ylp, + measure_entry = .builtin_entry("r2"), + log_weights = NULL + ), + regexp = "`mupred` must be a numeric matrix." + ) +}) + +## .compute_base_measure() ------------------------- + +test_that(".compute_base_measure() returns predperf if already existent", { + res <- .compute_base_measure( + ylp = NULL, + ylp_test = NULL, + loo = NULL, + kfold = NULL, + predperf = predperf, + psis_object = NULL, + source = "insample" + ) + expect_equal(res, predperf) +}) + +test_that(".compute_base_measure() errors if missing input", { + expect_error( + .compute_base_measure( + ylp = NULL, + ylp_test = NULL, + loo = NULL, + kfold = NULL, + predperf = NULL, + psis_object = NULL, + source = "insample" + ), + regexp = "`ylp` must be a numeric matrix." + ) +}) + +test_that(".compute_base_measure() computes elpd as expected", { + base_measure <- .compute_base_measure( + ylp = ylp, + ylp_test = NULL, + loo = NULL, + kfold = NULL, + predperf = NULL, + psis_object = NULL, + source = "insample" + ) + + expect_equal(names(base_measure), c("estimates", "pointwise", "diagnostics")) + expect_null(base_measure$diagnostics) + expect_equal(rownames(base_measure$estimates), c("elpd", "ic")) + expect_equal(colnames(base_measure$estimates), c("Estimate", "SE")) + expect_equal(dimnames(base_measure$pointwise)[[2]], c("elpd", "ic")) +}) + +test_that(".compute_base_measure() computes elpd_loo as expected", { + base_measure <- .compute_base_measure( + ylp = ylp, + ylp_test = NULL, + loo = NULL, + kfold = NULL, + predperf = NULL, + psis_object = loo1$psis_object, + source = "loo" + ) + + expect_equal( + rownames(base_measure$estimates), + c("elpd_loo", "p_eff_loo", "ic_loo") + ) + expect_equal( + dimnames(base_measure$pointwise)[[2]], + c("elpd_loo", "p_eff_loo", "ic_loo") + ) +}) + +test_that(".compute_base_measure() computes elpd_kfold as expected", { + base_measure <- .compute_base_measure( + ylp = ylp, + ylp_test = NULL, + loo = NULL, + kfold = kfold1, + predperf = NULL, + psis_object = NULL, + source = "kfold" + ) + + expect_equal( + rownames(base_measure$estimates), + c("elpd_kfold", "p_kfold", "kfoldic") + ) + expect_equal( + dimnames(base_measure$pointwise)[[2]], + c("elpd_kfold", "p_kfold", "kfoldic") + ) +}) + +## .merge_matrix() --------------------------- + +test_that(".merge_matrix() works as expected", { + mat <- matrix(c(1, 2, 3, 4), ncol = 2, nrow = 2) + val <- c(5, 6) + expected_name <- "test" + + res <- .merge_matrix( + source = "insample", mat = mat, name = expected_name, + values = val, margin = 1 + ) + + expect_equal(rownames(res)[[3]], "test") + expect_equal(dim(res), c(3, 2)) + expect_equal(colnames(res), c("Estimate", "SE")) + + res <- .merge_matrix( + source = "insample", mat = mat, name = expected_name, + values = val, margin = 2 + ) + + expect_equal(dim(res), c(2, 3)) + expect_equal(colnames(res)[3], "test") +}) + +test_that(".merge_matrix() with mat = NULL works as expected", { + res <- .merge_matrix( + source = "insample", mat = NULL, name = "test", + values = c(1, 2), margin = 1 + ) + + expect_equal(rownames(res), "test") + expect_equal(colnames(res), c("Estimate", "SE")) + expect_equal(dim(res), c(1, 2)) +}) + +test_that(".merge_matrix() with loo showes correct names", { + res <- .merge_matrix( + source = "loo", mat = NULL, name = "test", + values = c(1, 2), margin = 1 + ) + + expect_equal(rownames(res), "test_loo") + expect_equal(colnames(res), c("Estimate", "SE")) + expect_equal(dim(res), c(1, 2)) +}) + +test_that(".merge_matrix() with kfold showes correct names", { + res <- .merge_matrix( + source = "kfold", mat = NULL, name = "test", + values = c(1, 2), margin = 1 + ) + + expect_equal(rownames(res), "test_kfold") + expect_equal(colnames(res), c("Estimate", "SE")) + expect_equal(dim(res), c(1, 2)) +}) + +test_that(".merge_matrix() with duplicate naming is skipped", { + mat <- matrix(c(1, 2, 3, 4), ncol = 2, nrow = 2) + rownames(mat) <- c("elpd", "mse") + + res <- .merge_matrix( + source = "insample", mat = mat, name = "rmse", + values = c(5, 6), margin = 1 + ) + + expect_equal(c("elpd", "mse", "rmse"), rownames(res)) + + expect_warning( + .merge_matrix( + source = "insample", mat = mat, name = "elpd", + values = c(5, 6), margin = 1 + ), + regexp = "already present in results. Skipping the update." + ) +}) + +# integration tests ------------------------------ +## loo_pred_measure() / pred_measure() / kfold_pred_measure() --------- + +test_that("pred_measure() updates loo results as expected", { + predperf_loo <- loo_pred_measure( + loo = loo1, + y = y, + mupred = mupred, + ylp = ylp, + measure = c("r2", "mse"), + save_psis = TRUE + ) + + updated_predperf <- pred_measure( + y = y, + mupred = mupred, + predperf = predperf_loo, + measure = "mae" + ) + + expect_equal( + rownames(updated_predperf$estimates), + c("elpd_loo", "p_loo", "looic", "r2_loo", "mse_loo", "mae_loo") + ) + expect_equal(dim(updated_predperf$estimates), c(6, 2)) +}) + +test_that("pred_measure() provides warning for duplicate measure", { + predperf_loo <- loo_pred_measure( + loo = loo1, + y = y, + mupred = mupred, + ylp = ylp, + measure = "r2", + save_psis = TRUE + ) + + expect_warning( + pred_measure( + y = y, + mupred = mupred, + predperf = predperf_loo, + measure = "r2" + ), + regexp = "already present in results. Skipping the update." + ) + + expect_error( + loo_pred_measure( + y = y, + mupred = mupred, + ylp = ylp, + loo = loo1, + measure = c("mse", "r2", "r2") + ), + regexp = "Duplicate measure" + ) +}) + +test_that("loo_pred_measure() computes expected measures", { + predperf1 <- loo_pred_measure( + loo = loo1, + y = y, + mupred = mupred, + ylp = ylp, + measure = c("r2", "mse") + ) + + expect_equal( + rownames(predperf1$estimates), + c("elpd_loo", "p_loo", "looic", "r2_loo", "mse_loo") + ) + expect_equal(dim(predperf1$estimates), c(5, 2)) +}) + +test_that("kfold_pred_measure() works with rps as expected", { + res <- kfold_pred_measure( + y = y, + ypred = ypred, + mupred = mupred, + ylp = ylp, + measure = c("rps", "srps"), + kfold = kfold1, + control = list( + rps = list(size = max(ypred)), + srps = list(size = max(ypred)) + ) + ) + + expect_equal( + rownames(res$estimates), + c("elpd_kfold", "p_kfold", "kfoldic", "rps_kfold", "srps_kfold") + ) +}) + +# pred_measure() with custom function ------------------------------ +test_that("insample_pred_measure() accepts a custom measure function", { + set.seed(42) + S <- 4L + n <- 8L + y <- rnorm(n) + mupred <- matrix(rnorm(S * n), nrow = S, ncol = n) + ylp <- matrix(rnorm(S * n), nrow = S, ncol = n) + + custom_rmse <- function(y, mupred, log_weights = NULL) { + rmse(y, mupred, log_weights = log_weights) + } + attr(custom_rmse, "measure_name") <- "custom_rmse" + + res <- insample_pred_measure( + y = y, + mupred = mupred, + ylp = ylp, + measure = custom_rmse + ) + + expect_true("custom_rmse" %in% rownames(res$estimates)) + expect_true("custom_rmse" %in% colnames(res$pointwise)) +}) + +test_that("insample_pred_measure() accepts mixed built-in and custom measures", { + set.seed(1) + S <- 4L + n <- 8L + y <- rnorm(n) + mupred <- matrix(rnorm(S * n), nrow = S, ncol = n) + ylp <- matrix(rnorm(S * n), nrow = S, ncol = n) + + custom_rmse <- function(y, mupred, log_weights = NULL) { + rmse(y, mupred, log_weights = log_weights) + } + attr(custom_rmse, "measure_name") <- "custom_rmse" + + res <- insample_pred_measure( + y = y, + mupred = mupred, + ylp = ylp, + measure = list("r2", custom_rmse = custom_rmse) + ) + + expect_true(all(c("r2", "custom_rmse") %in% rownames(res$estimates))) +}) \ No newline at end of file diff --git a/tests/testthat/test_pred_measures_helpers.R b/tests/testthat/test_pred_measures_helpers.R new file mode 100644 index 00000000..c7c54817 --- /dev/null +++ b/tests/testthat/test_pred_measures_helpers.R @@ -0,0 +1,113 @@ +# load data ----------------------------- +res <- readRDS("data-for-tests/res_roaches.Rds") + +supported_measures_list <- getFromNamespace("supported_measures_list", "loo") + +# helpers for tests ------------------------------------------------ +.builtin_entry <- function(name) { + list(name = name, type = "builtin", key = name) +} + +# .normalize_measure() ---------------------------------------------- + +test_that(".normalize_measure() handles NULL and character input", { + expect_equal(.normalize_measure(NULL), list()) + entries <- .normalize_measure(c("mse", "rps")) + expect_length(entries, 2) + expect_equal(entries[[1]], .builtin_entry("mse")) +}) + +test_that(".normalize_measure() handles a custom function", { + f <- function(y, mupred) list(estimate = 1, se = 0, pointwise = y) + attr(f, "measure_name") <- "custom_mae" + entries <- .normalize_measure(f) + expect_length(entries, 1) + expect_equal(entries[[1]]$type, "custom") + expect_equal(entries[[1]]$name, "custom_mae") +}) + +test_that(".normalize_measure() handles a mixed list", { + f <- function(y, mupred) list(estimate = 1, se = 0, pointwise = y) + attr(f, "measure_name") <- "custom_mae" + entries <- .normalize_measure(list("r2", custom_mae = f)) + expect_length(entries, 2) + expect_equal(entries[[1]]$name, "r2") + expect_equal(entries[[2]]$name, "custom_mae") +}) + +test_that(".normalize_measure() errors on duplicate names", { + expect_error( + .normalize_measure(c("mse", "mse")), + regexp = "Duplicate measure" + ) +}) + +test_that(".normalize_measure() errors on unnamed list function", { + f <- function(y, mupred) list(estimate = 1, se = 0, pointwise = y) + expect_error( + .normalize_measure(list(f)), + regexp = "must be named" + ) +}) + +# .prepare_measures() ----------------------------------------------- + +test_that(".prepare_measures() errors on invalid built-in names", { + expect_error( + .prepare_measures("pps", res$predperf, supported_measures_list), + regexp = "Invalid measure" + ) +}) + +test_that(".prepare_measures() filters measures already in predperf", { + expect_warning( + .prepare_measures(c("mse", "elpd"), + predperf = res$predperf, supported_measures_list), + regexp = "already present in" + ) + + expect_warning( + .prepare_measures(c("mse", "elpd", "r2"), + predperf = res$predperf, supported_measures_list), + regexp = "already present in" + ) + + entries <- .prepare_measures( + c("mse", "rps"), + predperf = res$predperf, + supported_measures_list + ) + expect_equal(vapply(entries, `[[`, "", "name"), c("mse", "rps")) + + entries <- .prepare_measures( + c("mse"), + predperf = res$predperf, + supported_measures_list + ) + expect_equal(vapply(entries, `[[`, "", "name"), "mse") +}) + +# .validate_measure_result() ---------------------------------------- + +test_that(".validate_measure_result() accepts standard and CRPS-style output", { + res_std <- list(estimate = 1, se = 0.1, pointwise = c(1, 2)) + expect_invisible(.validate_measure_result(res_std, "m", n_obs = 2)) + + res_crps <- list(estimates = c(1, 0.1), pointwise = c(1, 2)) + expect_invisible(.validate_measure_result(res_crps, "m", n_obs = 2)) +}) + +test_that(".validate_measure_result() errors on invalid output", { + expect_error( + .validate_measure_result(list(estimate = 1), "m"), + regexp = "Missing" + ) + expect_error( + .validate_measure_result( + list(estimate = 1, se = 0.1, pointwise = c(1, 2, 3)), + "m", + n_obs = 2 + ), + regexp = "length 2, not 3" + ) +}) diff --git a/tests/testthat/test_print.R b/tests/testthat/test_print.R new file mode 100644 index 00000000..234bb4ae --- /dev/null +++ b/tests/testthat/test_print.R @@ -0,0 +1,84 @@ +# load data +temp <- readRDS("data-for-tests/res_roaches.Rds") + +measure_specs <- function(temp) { + rank_size <- max(temp$ypred) + rank_control <- list( + rps = list(size = rank_size), + srps = list(size = rank_size) + ) + list( + list(measure = "r2", args = list(y = temp$y, mupred = temp$mupred)), + list(measure = "rmse", args = list(y = temp$y, mupred = temp$mupred)), + list(measure = "mse", args = list(y = temp$y, mupred = temp$mupred)), + list(measure = "mae", args = list(y = temp$y, mupred = temp$mupred)), + list( + measure = "rps", + args = list(y = temp$y, ypred = temp$ypred), + control = rank_control + ), + list(measure = "crps", args = list(y = temp$y, ypred = temp$ypred)), + list( + measure = "srps", + args = list(y = temp$y, ypred = temp$ypred), + control = rank_control + ), + list(measure = "scrps", args = list(y = temp$y, ypred = temp$ypred)), + list(measure = "mlpd", args = list(ylp = temp$ylp)) + ) +} + +run_measure_snapshots <- function(loo_start, measures) { + loo_iter <- loo_start + for (measure in measures) { + loo_prev <- loo_iter + call_args <- c( + measure$args, + list( + measure = measure$measure, + loo = loo_iter, + save_psis = TRUE + ) + ) + if (!is.null(measure$control)) { + call_args$control <- measure$control + } + loo_iter <- do.call(loo_pred_measure, call_args) + common_rows <- intersect( + rownames(loo_prev$estimates), + rownames(loo_iter$estimates) + ) + common_cols <- intersect( + colnames(loo_prev$pointwise), + colnames(loo_iter$pointwise) + ) + expect_equal( + loo_iter$estimates[common_rows, , drop = FALSE], + loo_prev$estimates[common_rows, , drop = FALSE], + info = measure$measure + ) + expect_equal( + loo_iter$pointwise[, common_cols, drop = FALSE], + loo_prev$pointwise[, common_cols, drop = FALSE], + info = measure$measure + ) + expect_snapshot_output(print(loo_iter)) + } + loo_iter +} + +test_that("loo_pred_measure print snapshots", { + loo_ordered <- run_measure_snapshots(temp$loo, measure_specs(temp)) + loo_shuffled <- run_measure_snapshots( + temp$loo, + with(set.seed(0), sample(measure_specs(temp))) + ) + expect_setequal( + rownames(loo_ordered$estimates), + rownames(loo_shuffled$estimates) + ) + expect_equal( + loo_ordered$estimates[rownames(loo_shuffled$estimates), , drop = FALSE], + loo_shuffled$estimates + ) +}) diff --git a/vignettes/articles-online-only/pred_measure-family.Rmd b/vignettes/articles-online-only/pred_measure-family.Rmd new file mode 100644 index 00000000..20659746 --- /dev/null +++ b/vignettes/articles-online-only/pred_measure-family.Rmd @@ -0,0 +1,465 @@ +--- +title: "Computing predictive performance: The `pred_measure`-family" +author: "Florence Bockting" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 3 +params: + EVAL: TRUE +vignette: > + %\VignetteIndexEntry{Computing predictive performance: The `pred_measure`-family} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + message = FALSE, + warning = FALSE, + eval = params$EVAL, + fig.width = 7, + fig.height = 4, + fig.align = "center" +) + +devtools::load_all(".") +# library(brms) +devtools::load_all("/u/21/wa.bocktif1/unix/GitHub/brms") +library(dplyr) +library(ggplot2) +``` + +## Introduction + +After fitting a Bayesian model, you often want predictive performance measures +that are comparable across evaluation strategies: in-sample, PSIS-LOO, +k-fold, and holdout data. The **pred_measures** package provides a unified +workflow through four entry points: + +- `insample_pred_measure()` for in-sample evaluation +- `loo_pred_measure()` for PSIS-LOO cross-validation +- `kfold_pred_measure()` for k-fold cross-validation +- `test_pred_measure()` for holdout / new data + +Each entry point can compute any supported measure in a single call when you +supply the required inputs and pass the `measure` argument. +For the argument `measure` you can either specify a vector with predefined measure names whereby supported measures +are: + +- **Density scores:** `elpd`, `mlpd` +- **Distributional scores:** `rps`, `srps` +- **Point metrics:** `mae`, `mse`, `rmse`, `r2`, `acc`, `bacc` + +Base measures (`elpd`, `ic`) are always included automatically; pass any +additional names from the list above through the `measure` argument. You can +also obtain a list of supported measures via `supported_measures_list`: + +```{r supported-measures} +supported_measures_list +``` + +When you want to compute a measure that is not yet supported, you can pass a custom function to the `measure` argument (see [Passing a custom function for a measure](#passing-a-custom-function-for-a-measure) below). In this case, the argument `measure` should be a list, whereby you can pass both predefined measure names and custom functions (e.g., `list("rps", "my_fun" = my_fun)`). The custom function must accept the same required arguments as the built-in version of that measure (see [Input requirements by measure](#input-requirements-by-measure) below), and you can pass additional arguments through `control`. + +For measure definitions and formulas, see the companion vignette +"overview-measures" [TODO]. + +Besides computing all desired measures in one call, `pred_measure()` lets you +**add** measures to an existing result without recomputing what is already +stored (see [Adding measures incrementally](#adding-measures-incrementally-optional) below). + +**Prerequisites.** + +This tutorial assumes a fitted **brms** model and familiarity +with `brms::log_lik()`, `brms::posterior_predict()`, and +`brms::posterior_epred()`. + +**What you will learn.** + +1. Compute all desired measures in one call to an entry point. +2. Optionally accumulate measures incrementally with `pred_measure()`. +3. Repeat the workflow under LOO, k-fold, and out-of-sample evaluation. +4. Compare results across evaluation strategies. + +```{r load-data, include=FALSE} +fit_sleep <- readRDS("data-for-vignettes/fit_sleep.Rds") +fit_binary <- readRDS("data-for-vignettes/fit_binary.Rds") +fit_roaches <- readRDS("data-for-vignettes/fit_roaches.Rds") +fit_penguins <- readRDS("data-for-vignettes/fit_penguins.Rds") +res_sleep_test <- readRDS("../../tests/testthat/data-for-tests/res_sleep_test.Rds") +``` + +## Quick start + +The sleep-study model from **lme4** predicts reaction time (ms) from days of +sleep deprivation. The shortest path from a fitted model to printed measures is +a single call to an entry point: + +```{r quick-start} +sleep_measure <- insample_pred_measure( + ylp = brms::log_lik(fit_sleep), + y = fit_sleep$data$y, + ypred = brms::posterior_predict(fit_sleep), + measure = c("rps", "srps") +) + +print(sleep_measure) +``` + +Base measures (`elpd`, `ic`) are always computed; pass additional measure names +in `measure`. The rest of this tutorial explains how inputs map to measures and +how to switch evaluation modes. + +## Core concepts + +### Object structure + +Every function returns a **pred_measure** object which is a list with: + +| Component | Description | +| :--- | :--- | +| `$estimates` | Summary estimate and SE for each measure | +| `$pointwise` | Observation-level contributions (one column per measure) | +| `$diagnostics` | PSIS diagnostics (LOO and Kfold-CV only) | +| `$log_weights` | Log weights used for LOO (LOO-CV only) | + +The `print()` method shows `$estimates` in a readable table. Use +`str(result)` when you need the full structure. + +Every object also carries attributes that identify the evaluation mode and data +dimensions. The table below summarizes what differs across approaches; all +objects share `attr(result, "dims")` (posterior draws × observations) and +`attr(result, "source")` (one of `"insample"`, `"loo"`, `"kfold"`, `"test"`). + +| | In-sample | PSIS-LOO | K-fold | Holdout (Test) | +| :--- | :--- | :--- | :--- | :--- | +| **Class** | `insample_pred_measure` | `loo_pred_measure` (+ classes from `loo` object) | `kfold_pred_measure` (+ classes from `kfold` object) | `test_pred_measure` | +| **List slots** | `$estimates`, `$pointwise` | `$estimates`, `$pointwise`, `$diagnostics`, `$log_weights`; (`$psis_object`) | `$estimates`, `$pointwise`; (`$diagnostics`) | `$estimates`, `$pointwise` | +| **Measure suffix** | none (`elpd`, `ic`) | `_loo` (`elpd_loo`, `p_loo`, `ic_loo`, …) | `_kfold` (`elpd_kfold`, `p_kfold`, `kfoldic`, …) | `_test` (`elpd_test`, `ic_test`, …) | +| **Inherited attrs** | -- | `yhash`, `model_name` from `loo` | `K`, `Ksub`, `folds`, `fold_type`, `joint`, `yhash`, `model_name` from `kfold` | -- | + +For holdout objects, `dims` reflects the test-set size (from `ylp_test`), not +the training data. LOO and Kfold objects expose Pareto-k values in `$diagnostics$pareto_k` to assess the reliability of PSIS weights. + +### Input requirements by measure + +Supply the inputs each measure needs in a single entry-point call (see the +[workflow overview](#workflow-overview) below). When adding measures +incrementally, pass an existing object as `predperf` to `pred_measure()` so +base estimates and weights are reused. + +| Measure | `ylp` | `y` | `ypred` | `mupred` | +| :---: | :---: | :---: | :---: | :---: | +| `elpd`, `mlpd`, `ic` | ✓ | | | | +| `rps`, `srps` | | ✓ | ✓ | | +| `acc`, `bacc` | | ✓ | | ✓ | +| `mae`, `mse`, `rmse`, `r2` | | ✓ | | ✓ | + +- **`ylp`**: pointwise log predictive density matrix (`S` draws × `n` observations), + typically from `brms::log_lik(fit)`. +- **`ypred`**: posterior predictive draws from `brms::posterior_predict()`. +- **`mupred`**: posterior expected values from `brms::posterior_epred()`. + +### Two ways to compute measures + +| Approach | When to use | Example | +| :--- | :--- | :--- | +| **One-shot** | You know all measures upfront | `loo_pred_measure(..., measure = c("rps", "r2"))` | +| **Incremental** | Explore interactively or add measures later | `pred_measure(..., predperf = existing, measure = "r2")` | + +### Workflow overview + +| Evaluation mode | Entry point | Primary inputs | +| :--- | :--- | :--- | :--- | +| In-sample | `insample_pred_measure()` | `ylp`, plus `y`, `ypred`, `mupred` as needed | +| PSIS-LOO | `loo_pred_measure()` | `loo`, or `ylp` + `psis_object`, or `ylp` alone | +| K-fold | `kfold_pred_measure()` | `kfold`, plus `y`, `ypred`, `mupred` as needed | +| Holdout | `test_pred_measure()` | `ylp`, `ylp_test`, plus `y`, `ypred`, `mupred` as needed | + +Pass all required inputs and the full `measure` vector to the entry point for +your evaluation mode. Use `pred_measure()` only when you want to extend an +existing result (see below). + +## Continuous outcome: sleep study + +We use the pre-fitted Gaussian model on the sleep study throughout this section. + +### In-sample evaluation + +Pass all required inputs and the measures you want to `insample_pred_measure()`: + +```{r insample-all} +ylp_sleep <- brms::log_lik(fit_sleep) +ypred_sleep <- brms::posterior_predict(fit_sleep) +mupred_sleep <- brms::posterior_epred(fit_sleep) + +sleep_measure <- insample_pred_measure( + ylp = ylp_sleep, + y = fit_sleep$data$y, + ypred = ypred_sleep, + mupred = mupred_sleep, + measure = c("rps", "srps", "r2") +) +print(sleep_measure) +``` + +`elpd` sums the expected log pointwise predictive density across observations; +`ic` is `-2 × elpd`. Additional rows in the table correspond to the measures +passed in `measure`. + +```{r pointwise-crps, fig.cap="Pointwise CRPS contributions across observations (in-sample)."} +crps_pw <- sleep_measure$pointwise[, "rps"] +ggplot( + data = data.frame(obs = seq_along(crps_pw), crps = crps_pw) +) + + geom_dotplot(aes(crps), dotsize = 0.5) + + labs( + x = "CRPS_i", + title = "Dotplot for in-sample pointwise CRPS" + ) + + theme_minimal() +``` + +The entire structure of the returned object with corresponding +attributes is shown in the following + +```{r} +str(sleep_measure) +``` + +#### Passing a custom function for a measure + +It is also possible to pass a custom function for any measure. The function must accept the same arguments as the built-in version of that measure, and you can pass additional arguments through `control`: + +```{r} +# custom scoring rule +hamming_loss <- function(y, ypred, log_weights = NULL, revert_sign = FALSE) { + if (is.null(log_weights)) { + hamming_i <- colMeans(sweep(ypred, 2, y, "!=")) + } else { + w <- exp(.normalize_log_weights(log_weights)) + hamming_i <- colSums(w * sweep(ypred, 2, y, "!=")) + } + + res <- list( + estimate = mean(hamming_i), + se = sqrt(var(hamming_i) / length(hamming_i)), + pointwise = hamming_i + ) + + .measure_output(res, revert_sign) +} +``` + +```{r insample-custom-function} +(insample_pred_measure( + ylp = brms::log_lik(fit_binary), + y = fit_binary$data$y, + ypred = res_binary$ypred, + measure = list("rps", hamming = hamming_loss) +)) +``` + +#### Adding measures incrementally + +If you already computed measures and want to add more without +recomputing them, pass the existing object as `predperf` (**pred**icted +**perf**ormance measure) to `pred_measure()`: + +```{r insample-incremental} +(pred_measure( + y = fit_sleep$data$y, + mupred = mupred_sleep, + predperf = sleep_measure, + measure = c("rmse", "mse") +)) +``` + +This two-step pattern is useful when exploring measures interactively. + +### PSIS-LOO cross-validation + +In-sample `elpd` uses posterior draws from $p(\theta \mid y)$. Comparing +those predictions to the same data is **optimistically biased** for future or +otherwise unseen observations: the model has already seen the data it is +predicting ([Vehtari's CV-FAQ](https://users.aalto.fi/~ave/CV-FAQ.html), +Section 4). + +**Cross-validation** estimates out-of-sample predictive performance by +holding out part of the data, refitting (or reweighting) without that part, +and scoring the held-out points. In **leave-one-out cross-validation (LOO-CV)**, +each observation $y_i$ is predicted using a model fit to $y_{-i}$. The target +quantity is the expected log pointwise predictive density (elppd), estimated by +summing held-out log predictive densities across observations (see +`vignette("overview-measures")` for the full +definitions of within-sample, LOO, PSIS-LOO, and k-fold variants). + +**PSIS-LOO** approximates exact LOO-CV without refitting the model $n$ times. +It reweights the full-data posterior draws $\theta^{(s)} \sim p(\theta \mid y)$ +with Pareto-smoothed importance sampling (PSIS) weights so that each +observation $i$ is scored as if it had been left out. The resulting +`elpd_loo` is an estimate of how well the model would predict new reaction +times. + +Compute LOO measures in one call: + +```{r loo-all} +loo_sleep <- brms::loo(fit_sleep, save_psis = TRUE) + +loo_measure <- loo_pred_measure( + loo = loo_sleep, + y = fit_sleep$data$y, + ypred = ypred_sleep, + mupred = mupred_sleep, + measure = c("rps", "srps", "r2") +) +print(loo_measure) +``` + +The estimates carry a `_loo` suffix (`elpd_loo`, `ic_loo`). Additional `p_loo` (effective number of parameters) is provided which is the difference between +`elpd_loo` and the non-cross-validated log posterior predictive density. +Furthermore, we get an additional entry with PSIS diagnostics in `$diagnostics`. + +```{r loo-pareto-k, fig.cap="Pareto-k diagnostics for PSIS-LOO. Values above 0.7 suggest unreliable LOO estimates for those observations."} +str(loo_measure) +``` + +**Alternative LOO inputs.** + +`loo_pred_measure()` accepts three equivalent input patterns: + +| Input | Example | +| :--- | :--- | +| Precomputed `loo` object | `loo_pred_measure(loo = loo_sleep, measure = ...)` | +| `ylp` + `psis_object` | `loo_pred_measure(ylp = ylp, psis_object = psis_obj, measure = ...)` | +| `ylp` only (PSIS computed internally) | `loo_pred_measure(ylp = ylp, measure = ...)` | + +To add measures incrementally after a base-only LOO call, pass the existing +object as `predperf` to `pred_measure()`. When +accumulating after computing only `elpd`/`ic`, use `save_psis = TRUE` in +`loo_pred_measure()` so PSIS weights are stored for subsequent measures. + +Compare the in-sample and LOO ELPD values directly: + +```{r elpd-compare-table} +data.frame( + method = c("In-sample", "PSIS-LOO"), + elpd = c( + sleep_measure$estimates["elpd", "Estimate"], + loo_measure$estimates["elpd_loo", "Estimate"] + ), + se = c( + sleep_measure$estimates["elpd", "SE"], + loo_measure$estimates["elpd_loo", "SE"] + ) +) +``` + +### K-fold cross-validation + +K-fold CV holds out groups of observations rather than single points. +The same computation pattern applies to this approach: pass all inputs and +measures to `kfold_pred_measure()` with a `kfold` object from `brms::kfold()`: + +```{r kfold-measures} +kfold_sleep <- brms::kfold(fit_sleep, save_fits = TRUE) +mupred_kfold <- brms::kfold_predict(kfold_sleep, method = "fitted")$yrep +ypred_kfold <- brms::kfold_predict(kfold_sleep, method = "predict")$yrep + +kfold_measure <- kfold_pred_measure( + y = fit_sleep$data$y, + ypred = ypred_kfold, + mupred = mupred_kfold, + kfold = kfold_sleep, + measure = c("rps", "srps", "r2") +) +print(kfold_measure) +``` + +Measure names carry a `_kfold` suffix (e.g. `elpd_kfold`, `rps_kfold`). +The returned object inherits the attributes from the `brms::kfold` object: + +```{r} +str(kfold_measure) +``` + +### Holdout evaluation + +For genuinely new data, use `test_pred_measure()` in the same one-shot way. +We load precomputed predictive inputs from the package test fixtures (a random +train/test split of the sleep study): + +```{r test-measures} +test_measure <- test_pred_measure( + y = res_sleep_test$y_test, + ypred = res_sleep_test$ypred_test, + mupred = res_sleep_test$mupred_test, + ylp = res_sleep_test$ylp_train, + ylp_test = res_sleep_test$ylp_test, + measure = c("rps", "srps", "r2") +) +print(test_measure) +``` + +Holdout measures use a `_test` suffix and reflect performance on the 50 +held-out observations only. + +## Other outcome types + +The same pattern applies to non-Gaussian models. Each subsection uses +a pre-fitted **brms** model. + +### Binary outcomes + +```{r binary-example} +binary_measure <- insample_pred_measure( + ylp = brms::log_lik(fit_binary), + y = fit_binary$data$y, + mupred = brms::posterior_epred(fit_binary), + measure = c("acc", "bacc") +) +print(binary_measure) +``` + +### Count outcomes + +```{r count-example} +ypred_roaches <- brms::posterior_predict(fit_roaches) + +roaches_measure <- insample_pred_measure( + ylp = brms::log_lik(fit_roaches), + y = fit_roaches$data$y, + ypred = ypred_roaches, + measure = c("rps", "srps"), + control = list(rps = list(size = as.integer(max(ypred_roaches))), + srps = list(size = as.integer(max(ypred_roaches)))) +) +print(roaches_measure) +``` + +### Categorical outcomes + +```{r categorical-example} +penguins_measure <- insample_pred_measure( + ylp = brms::log_lik(fit_penguins), + y = as.integer(fit_penguins$data$y), + mupred = brms::posterior_epred(fit_penguins), + measure = c("acc", "bacc") +) +print(penguins_measure) +``` + +See `vignette("overview-measures", package = "pred_measures")` for which +measures apply to each outcome type. + +## Further reading + +- `?supported_measures_list` — character vector of all supported measure names +- `?pred_measure` — optional helper for adding measures to an existing predperf + object +- `?insample_pred_measure`, `?loo_pred_measure`, `?kfold_pred_measure`, + `?test_pred_measure` — entry points for each evaluation mode +- `vignette("overview-measures", package = "pred_measures")` — measure + definitions, formulas, and orientation (higher vs lower is better) diff --git a/vignettes/articles-online-only/pred_measure-formulas.Rmd b/vignettes/articles-online-only/pred_measure-formulas.Rmd new file mode 100644 index 00000000..3fb3f7eb --- /dev/null +++ b/vignettes/articles-online-only/pred_measure-formulas.Rmd @@ -0,0 +1,911 @@ +--- +title: "Overview of scores and metrics" +author: "Florence Bockting" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 3 +params: + EVAL: FALSE +vignette: > + %\VignetteIndexEntry{Overview of scores and metrics} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + +## Introduction + +In Bayesian model evaluation, predictive performance is assessed by comparing a model's posterior predictive distribution against observed data. Depending on whether the outcome is continuous, binary, or categorical, and whether we evaluate the entire probability distribution or a specific point estimate, different scoring rules and performance metrics are appropriate. + +The table below summarizes the core metrics implemented in our framework. + +| Metric Keyword | Full Name | Metric Type | Orientation | Target Outcome Type | +| :--- | :--- | :--- | :--- | :--- | +| `lppd` / `elppd` | (Expected) Log Pointwise Predictive Density | Scoring Rule (Distributional) | Higher is Better | Any | +| `acc` | Accuracy | Proper Scoring Rule | Higher is Better | Nominal Categorical | +| `bacc` | Balanced Accuracy | Adjusted Classification Metric | Higher is Better | Imbalanced Categorical | +| `brier` | Brier Score | Proper Scoring Rule | Lower is Better | Binary | +| `crps` | Continuous Ranked Probability Score | Proper Scoring Rule (Distributional) | Lower is Better | Continuous | +| `scrps` | Scaled Continuous Ranked Probability Score | Scale-Invariant Proper Scoring Rule | Higher is Better | Continuous | +| `rps` | Ranked Probability Score | Proper Scoring Rule | Lower is Better | Ordered Categorical | +| `srps` | Scaled Ranked Probability Score | Scale-Invariant Proper Scoring Rule | Higher is Better | Ordered Categorical | +| `mae` | Mean Absolute Error | Point Estimation Metric | Lower is Better | Continuous | +| `mse` | Mean Squared Error | Point Estimation Metric | Lower is Better | Continuous | +| `rmse` | Root Mean Squared Error | Point Estimation Metric | Lower is Better | Continuous | +| `r2` | Coefficient of Determination | Summary Fit Metric | Higher is Better | Continuous | + +```{r setup, include=FALSE} +knitr::opts_chunk$set(eval = TRUE, message = FALSE, warnings = FALSE) +``` + +## Data Examples +```{r, message=FALSE, warnings=FALSE, include = FALSE} +library(tidyr) +library(dplyr) +library(ggplot2) +# devtools::load_all("../brms") +``` + +```{r, include = FALSE} +plot_ptw <- function(ws, loo, kfold, label, show_estimate = TRUE){ + + # 1. Safe estimation extractor + ptw_est <- function(x) { + if (is.null(x)) return(NA) + if (!is.null(x$estimates[1])) { + return(x$estimates[1]) + } + x$estimates["Estimate"] + } + + # 2. Build estimates and create the formatted labels right away + estimates <- tibble( + source = c("ws", "loo", "kfold"), + estimate = c(ptw_est(ws), ptw_est(loo), ptw_est(kfold)) + ) |> + mutate(source_label = if_else( + is.na(estimate), + paste0(source, " (no data)"), + paste0(source, " (est. = ", round(estimate, 2), ")") + )) + + # Turn into a factor to preserve the ws -> loo -> kfold order + estimates$source_label <- factor(estimates$source_label, levels = estimates$source_label) + + # 3. Safe data binding for the dots + g_data <- dplyr::bind_rows( + tibble(source = "ws", value = if (!is.null(ws)) ws$pointwise else as.numeric(NA)), + tibble(source = "loo", value = if (!is.null(loo)) loo$pointwise else as.numeric(NA)), + tibble(source = "kfold", value = if (!is.null(kfold)) kfold$pointwise else as.numeric(NA)) + ) + + # 4. Align facet names for the dots data + plot_df <- g_data |> + left_join(estimates, by = "source") |> + mutate(source = source_label) + + # 5. Align facet names for the line data so ggplot matches them up perfectly + line_data <- estimates |> + mutate(source = source_label) + + # Plotting + g <- ggplot(plot_df, mapping = aes(x = value)) + + geom_dotplot() + + if (isTRUE(show_estimate)) { + g <- g + + geom_vline( + data = line_data, # Uses the aligned facet labels now + aes(xintercept = estimate), color = "red", linetype = "solid", linewidth = 1 + ) + } + + g + + facet_wrap(~ source, nrow = 1) + + theme_classic() + + xlab(label) +} +``` + +```{r} +# see vignettes/articles-online-only/test-data/generate-data.R for code +# to generate these objects + +# poisson model; count data +res_roaches <- readRDS("../../tests/testthat/data-for-tests/res_roaches.Rds") +# normal model; continuous data +res_sleep <- readRDS("../../tests/testthat/data-for-tests/res_sleep.Rds") +# bernoulli model; binary data +res_binary <- readRDS("../../tests/testthat/data-for-tests/res_binary.Rds") +# categorical model; nominal categories +res_penguins <- readRDS("../../tests/testthat/data-for-tests/res_penguins.Rds") +# binomial model; bounded count data +res_binomial <- readRDS("../../tests/testthat/data-for-tests/res_binomial.Rds") +``` + +## Log pointwise predictive density (lppd) +The log pointwise predictive density aggregates the log posterior predictive +densities over a pointwise partition of the data $y = (y_1,\ldots,y_n)$: +$$ +\text{lppd} = \sum_{i=1}^n \log \int p(y_i \mid \theta) p(\theta \mid y) d\theta +$$ +Monte Carlo estimator with posterior draws: +$$ +\widehat{\text{lppd}} = \sum_{i=1}^n \log \left(\frac{1}{S}\sum_{s=1}^S +p(y_i\mid \theta^{(s)})\right) += \sum_{i=1}^n \underbrace{\Big[\text{log-sum-exp}_s\{\log p(y_i\mid +\theta^{(s)})\} - \log S\Big]}_{\text{lppd}_i} +$$ + +We use $\log p(y_i\mid \theta^{(s)})$ for numerical stability. + +### Conceptual variants +We define pointwise terms $\text{lppd}_i$ and their sums. +For standard errors we use the sample variance across $i$: + +- $\widehat{\text{Var}}_i(\text{lppd}_i) = \frac{1}{n-1} \sum_{i=1}^n +(\text{lppd}_i - \text{lppd})^2 \text{, with } \text{lppd} = +\frac{1}{n}\sum_i \text{lppd}_i$ +- $se(\text{sum}) \approx \sqrt{n \cdot \widehat{\text{Var}}(\text{lppd}_i)}$ +- $se(\text{mean}) \approx \sqrt{\frac{\widehat{\text{Var}}(\text{lppd}_i)}{n}}$ + +#### Within-sample (training data; optimistic for new data) ++ Draws: $\theta^{(s)} \sim p(\theta \mid y)$ ++ Pointwise term: +$$ +\text{lppd}_i^\text{ws} = \text{log-sum-exp}_s \big(\log p(y_i +\mid \theta^s)\big) - \log(S) +$$ ++ Estimate and Standard error: + + $\text{lppd}^\text{sw} = \sum_i^n \text{lppd}_i^\text{ws}$ + + $\widehat{\text{elppd}}^\text{sw} = \text{lppd}^\text{sw}$ + (optimistically biased for future data) + + $se(\text{lppd}^\text{sw}) \approx \sqrt{n \cdot \widehat{\text{Var}}(\text{lppd}_i^\text{ws})}$ + +#### Leave-one-out cross-validation (LOO-CV; held-out) ++ Draws: $\theta_{−i}^{(s)} \sim p(\theta \mid y_{−i})$ ++ Pointwise term: +$$ +\text{lppd}_i^\text{loo} = \text{log-sum-exp}_s(\log p(y_i \mid \theta_{-i}^s)) - \log(S) +$$ ++ Estimate and Standard Error: + + $\text{lppd}^\text{loo} = \sum_i^n \text{lppd}_i^\text{loo}$, + an estimate of elppd + + $se(\text{lppd}^\text{loo}) \approx \sqrt{n \cdot \widehat{\text{Var}}(\text{lppd}_i^\text{loo})}$ + +#### PSIS-LOO (held-out approximation) ++ Reweights full-data draws $\theta^{(s)} \sim p(\theta \mid y)$ to approximate +$p(\theta \mid y_{−i})$ with normalized PSIS weights $\tilde w_i^{(s)}$ ($\sum \tilde w_i^{(s)} = 1$) ++ Pointwise term: +$$ +\text{lppd}_i^\text{psis} = \text{log-sum-exp}_s\big(\log p(y_i \mid \theta^s) + \log \tilde w_i^{(s)}\big) +$$ ++ Estimate and Standard Error: + + $\text{lppd}^\text{psis} = \sum_i^n \text{lppd}_i^\text{psis}$, an estimate + of elppd + + $se(\text{lppd}^\text{psis}) \approx \sqrt{n \cdot \widehat{\text{Var}}(\text{lppd}_i^\text{psis})}$ + +#### K-fold cross-validation +Partition indices into $K$ disjoint folds, $F_1,\ldots, F_K$. For each fold +$m$, fit the model on the training data $y_{-F_m}$, draw parameters from the +posterior distribution $\theta^{(m,s)} \sim p(\theta \mid y_{-F_m})$ and +evaluate fold $F_m$. + ++ Draws: $\theta^{(m,s)} \sim p(\theta \mid y_{-F_m})$ ++ Pointwise term: +$$ +\text{lppd}_i^\text{kfold} = \text{log-sum-exp}_s\big(\log p(y_i\mid \theta^{(m,s)})\big) - \log S,\quad i \in F_m +$$ ++ Estimate and Standard Error: + + $\text{lppd}^\text{kfold} = \sum_i \text{lppd}_i^\text{kfold}$, an estimate + for elppd + + $se(\text{lppd}^\text{kfold}) \approx \sqrt{n \cdot \widehat{\text{Var}}(\text{lppd}_i^\text{kfold})}$ + +## Expected log pointwise predictive density (elppd) +**elppd** (expected log pointwise predictive density) is the sum, over a chosen +pointwise partition of the data, of the expected log posterior predictive +density for future observations drawn from the true data-generating +distribution $p_t$. Formally, +$$ +\mathrm{elppd}=\sum_{i=1}^n \mathbb{E}_{\tilde y_i \sim p_t} \left[\log \int p(\tilde y_i \mid \theta) p(\theta \mid y) d\theta \right] =\sum_{i=1}^n \mathbb{E}_{\tilde y_i \sim p_t} \left[\text{lppd}_i \right] +$$ + +Because $p_t$ is unknown, **elppd** is estimated from data-either by +bias-correcting the in-sample lppd (e.g., WAIC) or via cross-validation +(e.g., LOO/Psis-LOO/K-fold) that evaluates predictive densities on held-out +data. +$$ +\widehat{\text{elppd}} \approx \sum_i^n\text{lppd}_i +$$ + + +```{r, warnings=FALSE, message=FALSE} +# within-sample elpd +elpd_ws <- elpd(ylp = res_roaches$ylp) +message("within-sample elpd:") +invisible(str(elpd_ws)) + +# psis-loo elpd +elpd_psis1 <- elpd( + ylp = res_roaches$ylp, + log_weights = res_roaches$log_weights +) +elpd_psis2 <- elpd( + ylp = NULL, + pointwise = res_roaches$loo$pointwise[, "elpd_loo"] +) +# expect_equal(elpd_psis1, elpd_psis2) +message("psis-loo elpd:") +invisible(str(elpd_psis1)) + +# kfold elpd +elpd_kfold <- elpd( + ylp = NULL, + pointwise = res_roaches$kfold$pointwise[, "elpd_kfold"] +) +message("kfold elpd:") +invisible(str(elpd_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(elpd_ws, elpd_psis1, elpd_kfold, "lppd_i", show_estimate = FALSE) +``` + +### Mean log pointwise predictive density +The mean log predictive density (**mlpd**) is the average (across observations) +of the model's log predictive density. + +$$ +\text{mlpd} = \frac{1}{n} \widehat{\text{elppd}} = \frac{1}{n} \sum_i^n\text{lppd}_i +$$ +The standard error of the mean is then +$$ +se(\text{mlpd}) = \frac{se(\text{elppd})}{n} +$$ + +```{r} +mlpd_ws <- mlpd( + ylp = res_roaches$ylp +) +mlpd_loo <- mlpd( + ylp = res_roaches$ylp, + log_weights = res_roaches$log_weights +) +mlpd_kfold <- mlpd( + ylp = NULL, + pointwise = res_roaches$kfold$pointwise[, "elpd_kfold"] +) + +message("within-sample mlpd:") +invisible(str(mlpd_ws)) +message("psis-loo mlpd:") +invisible(str(mlpd_loo)) +message("kfold mlpd:") +invisible(str(mlpd_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(mlpd_ws, mlpd_loo, mlpd_kfold, "lppd_i") +``` + +## Log score vs. elpd +The log score is the log of the predictive density (or probability) evaluated +at the observed value. With $S$ posterior draws $\theta^1, \ldots, \theta^S$ +from $p(\theta \mid y)$, the posterior predictive density is approximated by +Monte Carlo averaging +$$ +\log p(y_i \mid y) \approx \log \Big[\frac{1}{S} \sum_{s=1}^S p(y_i \mid \theta^s) \Big] +$$ +or using the log-sum-exp trick: +$$ +\log p(y_i \mid y) \approx \Big[\text{log-sum-exp}_s\big(\log p(y_i \mid \theta^{(s)})\big) - \log S\Big] +$$ + +This definition looks equivalent to that we used in order to introduce the +elpd. What is then the difference between log score and elpd? + ++ **Log score**: The realized value $\log p(y \mid p(\theta \mid y))$ for an +observed data point $y$ under a model's predictive distribution. It is a proper, + local scoring rule used to evaluate probabilistic predictions. ++ **elpd**: The expectation of the log score for a future, as-yet-unobserved +data point drawn from the data-generating process. It averages the log +predictive density over the distribution of future data, and its pointwise +sum over $n$ items is elppd; in practice we estimate it via cross-validation +or using information criteria that correct in-sample bias. + +## Accuracy (`acc`) + +Accuracy here is the posterior predictive probability assigned to the observed category, averaged across (weighted) posterior draws. +Accuracy is a proper scoring rule suitable for nominal categorical outcomes and is positively oriented (higher is better). Accuracy ranges between 0 (all predictions are wrong) and 1 (all predictions are correct). + +- Predictive probability of the observed class, Monte Carlo: + $$ + \hat p_i = \frac{1}{S}\sum_{s=1}^S \Pr(Y_i = y_i \mid \theta^{(s)}),\quad + \hat p_i^w = \sum_{s=1}^S \tilde w_i^{(s)}\,\Pr(Y_i = y_i \mid \theta^{(s)}) + $$ +- Pointwise: + $$ + \text{acc}_i = \hat p_i \ \text{ or }\ \hat p_i^w + $$ +- Estimate and Standard Error: + $$ + \widehat{\text{acc}} = \frac{1}{n}\sum_{i=1}^n \text{acc}_i,\quad + se(\widehat{\text{acc}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{acc}_i)}{n}} + $$ + +::: {.callout-tip} + +Instead of drawing a single label per sample (`ypred`), we use the expected probability under the posterior (`mupred`). This is the Bayes-optimal decision rule under 0-1 loss (0-1 loss = 1 - Accuracy). +::: + +```{r} +acc_ws <- acc( + y = as.numeric(res_penguins$y), + mupred = res_penguins$mupred +) +acc_loo <- acc( + y = as.numeric(res_penguins$y), + mupred = res_penguins$mupred, + log_weights = res_penguins$log_weights +) +# acc_kfold <- acc( +# y = as.numeric(res_penguins$y), +# mupred = res_penguins$mupred_kfold +# ) + +message("within-sample acc:") +invisible(str(acc_ws)) +message("psis-loo acc:") +invisible(str(acc_loo)) +# message("kfold acc:") +# invisible(str(acc_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(acc_ws, acc_loo, NULL, "acc_i") +``` + +## Balanced Accuracy (`bacc`) + +Balanced accuracy averages the within-class mean of the predicted probability assigned to the correct class, giving each class equal weight. + +Let $\mathcal{C} = \{1, \ldots, K\}$ be the set of classes and +$n_c = |\{i : y_i = c\}|$ the number of observations in class $c$. The +class-specific accuracy is the mean predicted probability of the correct +category among observations that truly belong to class $c$: + +- Class-specific accuracy: + $$ + \text{acc}_c = \frac{1}{n_c} \sum_{i: y_i=c} \hat p_i^w + $$ +- Balanced accuracy: + $$ + \widehat{\text{bacc}} = \frac{1}{K}\sum_{c=1}^K \text{acc}_c + $$ +- A convenient pointwise contribution that yields the same mean: +$$ +\text{bacc}_i = \frac{\hat p_i^w}{n_{y_i}} +$$ +- Estimate and Standard Error: + $$ + \widehat{\text{bacc}} = \frac{1}{K}\sum_{c=1}^K \frac{1}{n_c}\sum_{i: y_i=c} \hat p_i^w,\quad + se(\widehat{\text{bacc}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{bacc}_i)}{n}} + $$ + +::: {.callout-note} +When class frequencies are equal ($n_c = n/K$), balanced accuracy +reduces to standard accuracy. For binary outcomes ($K=2$), it equals the mean +of sensitivity and specificity. +::: + +```{r} +print(table(as.numeric(res_penguins$y))) + +bacc_ws <- bacc( + y = as.numeric(res_penguins$y), + mupred = res_penguins$mupred +) +bacc_loo <- bacc( + y = as.numeric(res_penguins$y), + mupred = res_penguins$mupred, + log_weights = res_penguins$log_weights +) +# bacc_kfold <- bacc( +# y = as.numeric(res_penguins$y), +# mupred = res_penguins$mupred_kfold +# ) + +message("within-sample bacc:") +invisible(str(bacc_ws)) +message("psis-loo bacc:") +invisible(str(bacc_loo)) +# message("kfold bacc:") +# invisible(str(bacc_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(bacc_ws, bacc_loo, NULL, "bacc_i") +``` + +## Brier Score (`brier`) +The Brier score is a strictly proper scoring rule for binary outcomes; it is the mean + squared error between the predicted probability and the observed binary outcome. The Brier score is + negatively oriented (lower is better). + The brier score ranges within [0, 1]. + +- Pointwise: + $$ + \widehat{\text{brier}}_i^{\text{raw}} = \big(p_i - y_i\big)^2,\quad + p_i = + \begin{cases} + \frac{1}{S}\sum_{s=1}^S \Pr(y_i=1 \mid \theta^{(s)}) & \text{(unweighted)}\\[4pt] + \sum_{s=1}^S \tilde w_i^{(s)}\,\Pr(y_i=1 \mid \theta^{(s)}) & \text{(PSIS weights)} + \end{cases} + $$ ++ Estimate and Standard Error: + $$ + \widehat{\text{bs}} = \frac{1}{n}\sum_{i=1}^n \text{bs}_i,\quad + se(\widehat{\text{bs}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{bs}_i)}{n}} + $$ + +```{r} +brier_ws <- brier( + y = res_binary$y, + ypred = res_binary$ypred +) +brier_loo <- brier( + y = res_binary$y, + ypred = res_binary$ypred, + log_weights = res_binary$log_weights +) +brier_kfold <- brier( + y = res_binary$y, + ypred = res_binary$ypred_kfold +) + +message("within-sample brier:") +invisible(str(brier_ws)) +message("psis-loo brier:") +invisible(str(brier_loo)) +message("kfold brier:") +invisible(str(brier_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(brier_ws, brier_loo, brier_kfold, "brier_i") +``` + +## Continuous Ranked Probability Score (`crps`) +CRPS is a strictly proper scoring rule for continuous outcomes; it measures the squared CDF distance between the predictive CDF $F$ and the observed step function integrated over the support. It is the continuous limit of rps as $K \rightarrow \infty$. As for rps, also crps is negatively oriented (lower = better). The crps is on the same units as $y$. + +With predictive draws $Y^s \sim p(\tilde y \mid \theta^s)$: + +- Pointwise terms: +$$ +\text{crps}_i = \int_{-\infty}^{\infty} \left(F_i(x) - \mathbf{1}(x \geq y_i)\right)^2 dx +$$ +Equivalently, via the energy score representation (when $F_i$ has a finite +first moment): +$$ +\text{crps}_i = \mathbb E_F\big[Y - y_i\big] - \frac{1}{2}\mathbb E_F \big[Y-Y'\big] +$$ +where $Y, Y'$ are independent draws from the predictive distribution $F_i$. +Because computing $\mathbb{E}_F[|Y - Y'|]$ requires two independent sets of +predictive draws, we instead use the identity (valid when $F$ is continuous): +$$ +\text{crps}_i = \mathbb E_F\big[Y - y_i\big] + \mathbb E_F\big[Y\big] - 2\mathbb E_F \big[Y\cdot F(Y)\big] +$$ +which requires only a single set of draws. +- Weighted Monte Carlo estimators (PSIS weights $\tilde w_i^{(s)}$; unweighted is the special case $\tilde w = 1/S$): +$$ +\hat F_i^w(x) = \sum_{s=1}^S \tilde w_i^s \mathbf{1}(Y_i^s \leq x) +$$ +- Computation: +$$ +\mathbb{E}_F\big[|Y - y_i|\big] \approx \sum_{s=1}^S \tilde w_i^s \big|Y_i^s - y_i\big|, \qquad +\mathbb{E}_F[Y] \approx \sum_{s=1}^S \tilde w_i^s Y_i^s, +$$ +$$ +\mathbb{E}_F\big[Y \cdot F(Y)\big] \approx \sum_{s=1}^S \tilde w_i^s Y_i^s \hat F_i^w\!\left(Y_i^s\right) +$$ +giving the pointwise estimator +$$ +\widehat{\text{crps}}_i = \sum_s \tilde w_i^s \big|Y_i^s - y_i\big| + + \sum_s \tilde w_i^s Y_i^s + - 2 \sum_s \tilde w_i^s Y_i^s \hat F_i^w\!\left(Y_i^s\right) +$$ ++ Estimate and Standard Error: + $$ + \widehat{\text{crps}} = \frac{1}{n}\sum_{i=1}^n \text{crps}_i,\quad + se(\widehat{\text{crps}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{crps}_i)}{n}} + $$ + +```{r} +crps_ws <- crps( + y = res_sleep$y, + ypred = res_sleep$ypred +) +crps_loo <- crps( + y = res_sleep$y, + ypred = res_sleep$ypred, + log_weights = res_sleep$log_weights +) +crps_kfold <- crps( + y = res_sleep$y, + ypred = res_sleep$ypred_kfold +) + +message("within-sample crps:") +invisible(str(crps_ws)) +message("psis-loo crps:") +invisible(str(crps_loo)) +message("kfold crps:") +invisible(str(crps_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(crps_ws, crps_loo, crps_kfold, "crps_i") +``` + +### Scaled CRPS (`scrps`) + +SCRPS is a locally scale-invariant proper scoring rule that normalizes CRPS by predictive spread (denominator) and adds a log penalty on spread (second term) [Bolin & Wallin, 2023]: + +$$ +\text{scrps}_i = -\frac{\mathbb{E}_F\big[|Y - y_i|\big]}{\mathbb{E}_F\big[|Y - Y'|\big]} +- \frac{1}{2}\log \mathbb{E}_F\big[|Y - Y'|\big] +$$ +- Use the same weighted estimators for the two expectations as above. +- Rather than requiring two independent draw matrices, we recover +$\mathbb{E}_F[|Y-Y'|]$ from quantities already computed for the crps via the +identity +$$ +\mathbb{E}_F\big[|Y - Y'|\big] = 4\,\mathbb{E}_F\big[Y \cdot F(Y)\big] - 2\,\mathbb{E}_F[Y] +$$ +- With importance weights this becomes +$$ +\widehat{\mathbb{E}}_F\big[|Y - Y'|\big] = + 4\sum_s \tilde w_i^s Y_i^s \hat F_i^w\!\left(Y_i^s\right) + - 2\sum_s \tilde w_i^s Y_i^s +$$ +so the pointwise scrps estimator is +$$ +\widehat{\text{scrps}}_i = + -\frac{\displaystyle\sum_s \tilde w_i^s \big|Y_i^s - y_i\big|} + {\widehat{\mathbb{E}}_F\big[|Y-Y'|\big]} + - \frac{1}{2}\log\widehat{\mathbb{E}}_F\big[|Y-Y'|\big] +$$ +- Estimate and Standard Error: + $$ + \widehat{\text{scrps}} = \frac{1}{n}\sum_{i=1}^n \text{scrps}_i,\quad + se(\widehat{\text{scrps}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{scrps}_i)}{n}} + $$ + +```{r} +scrps_ws <- scrps( + y = res_sleep$y, + ypred = res_sleep$ypred +) +scrps_loo <- scrps( + y = res_sleep$y, + ypred = res_sleep$ypred, + log_weights = res_sleep$log_weights +) +scrps_kfold <- scrps( + y = res_sleep$y, + ypred = res_sleep$ypred_kfold +) + +message("within-sample scrps:") +invisible(str(scrps_ws)) +message("psis-loo scrps:") +invisible(str(scrps_loo)) +message("kfold scrps:") +invisible(str(scrps_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(scrps_ws, scrps_loo, scrps_kfold, "scrps_i") +``` + +## Ranked Probability Score (`rps`) +RPS is a strictly proper scoring rule for ordered categorical outcomes with $K$ categories. It compares the predictive CDF to the observed +step CDF and generalizes the Brier score ($K=2$ recovers Brier). The standard +definition is negatively oriented (lower is better). The range of rps is $[0, K-1]$. + +- Predictive class probabilities and CDFs: + $$ + \hat p_{i,k}^w = \sum_{s=1}^S \tilde w_i^{(s)}\,\Pr(Y_i=k\mid \theta^{(s)}),\quad + \hat F_{i,k}^w = \sum_{j=1}^k \hat p_{i,j}^w + $$ +- Observed step CDF $O_{i,k} = \mathbf{1}(y_i \le k)$. +- Pointwise term: + $$ + \widehat{\text{rps}}_i^{\text{raw}} = \sum_{k=1}^{K-1} (\hat F_{i,k}^w - O_{i,k})^2 + $$ +- Estimate and Standard Error: + $$ + \widehat{\text{rps}} = \frac{1}{n}\sum_{i=1}^n \text{rps}_i,\quad + se(\widehat{\text{rps}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{rps}_i)}{n}} + $$ + +```{r} +rps_ws <- rps( + y = res_binomial$y, + ypred = res_binomial$ypred, + size = 10 +) +rps_loo <- rps( + y = res_binomial$y, + ypred = res_binomial$ypred, + size = 10, + log_weights = res_binomial$log_weights +) +rps_kfold <- rps( + y = res_binomial$y, + ypred = res_binomial$ypred_kfold, + size = 10 +) + +message("within-sample rps:") +invisible(str(rps_ws)) +message("psis-loo rps:") +invisible(str(rps_loo)) +message("kfold rps:") +invisible(str(rps_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(rps_ws, rps_loo, rps_kfold, "rps_i") +``` + +### Scaled RPS (`srps`) + +By analogy with SCRPS [Bolin & Wallin, 2023], the scaled RPS is + +$$ +\text{srps}_i = -\frac{\mathbb{E}_F\big[|Y - y_i|\big]}{\mathbb{E}_F\big[|Y - Y'|\big]} +- \frac{1}{2}\log \mathbb{E}_F\big[|Y - Y'|\big] +$$ + +For ordered discrete support: + +- Weighted mean absolute deviation: + $$ + \widehat{\mathbb{E}}_F[|Y-y_i|] = \sum_{k=1}^K \hat p_{i,k}^w\,|k - y_i| + $$ +- Predictive spread: + $$ + \widehat{\mathbb{E}}_F[|Y - Y'|] = 2\sum_{k=1}^{K-1} \hat F_{i,k}^w\big(1 - \hat F_{i,k}^w\big) + $$ +Estimation and SE follow the same mean/variance recipe as above. The srps has a positive direction, i.e., "higher is better". + +```{r} +srps_ws <- srps( + y = res_binomial$y, + ypred = res_binomial$ypred, + size = 10 +) +srps_loo <- srps( + y = res_binomial$y, + ypred = res_binomial$ypred, + size = 10, + log_weights = res_binomial$log_weights +) +srps_kfold <- srps( + y = res_binomial$y, + ypred = res_binomial$ypred_kfold, + size = 10 +) + +message("within-sample srps:") +invisible(str(srps_ws)) +message("psis-loo srps:") +invisible(str(srps_loo)) +message("kfold srps:") +invisible(str(srps_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(srps_ws, srps_loo, srps_kfold, "srps_i") +``` + +## Mean absolute error (`mae`) +MAE evaluates point predictions $\hat \mu_i$ against outcomes $y_i$ via +absolute loss. It has the same units as $y$ and has orientation "lower is better". + +In our setup, we take `mupred` to be the vector of point predictions (one per +observation), obtained from posterior expectations (e.g., via `posterior_epred`) + or from PSIS-LOO expected predictions (via `E_loo`), as illustrated below. + +- Pointwise: +$$ +\text{mae}_i = \mid y_i − \hat \mu_i \mid +$$ +- Point prediction: +$$ +\hat \mu_i \approx +\begin{cases} +\frac{1}{S}\sum_{s=1}^S \mu_i^{(s)}, & \mu_i^{(s)} = \mathbb{E}[\tilde y_i \mid \theta^{(s)}] \quad \text{(within-sample)}\\[6pt] +\sum_{s=1}^S \tilde w_i^{(s)}\,\mu_i^{(s)}, & \text{(PSIS-LOO held-out approximation)} +\end{cases} +$$ +- Estimate and Standard Error: +$$ +\widehat{\text{mae}} = \frac{1}{n} \sum_{i=1}^n \text{mae}_i, \quad +se(\widehat{\text{mae}}) = \sqrt{\frac{\widehat{\text{Var}}(\text{mae}_i)}{n}} +$$ + +```{r} +mae_ws <- mae( + y = res_sleep$y, + mupred = res_sleep$mupred +) +mae_loo <- mae( + y = res_sleep$y, + mupred = res_sleep$mupred_loo +) +mae_kfold <- mae( + y = res_sleep$y, + mupred = res_sleep$mupred_kfold +) + +message("within-sample mae:") +invisible(str(mae_ws)) +message("psis-loo mae:") +invisible(str(mae_loo)) +message("kfold mae:") +invisible(str(mae_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(mae_ws, mae_loo, mae_kfold, "mae_i") +``` + +## (Root) mean squared error (`rmse`/`mse`) +RMSE uses squared loss, has the same units as `y` and has a positive direction +("lower is better"). +RMSE evaluates point predictions $\hat\mu_i$ against observed outcomes $y_i$ +using squared loss and then takes the square root of the average. It has the +same units as $y$ and the direction "lower is better". In our setup, `mupred` +is the point prediction per observation, computed as the posterior predictive +mean (e.g., via `posterior_epred`) or its PSIS-LOO analogue via `E_loo`. RMSE +is the square root of the mean squared error (MSE). + +- Pointwise squared error: + $$ + \text{sqe}_i^{\text{raw}} = (y_i - \hat \mu_i)^2 + $$ +- Point prediction: +$$ +\hat \mu_i \approx +\begin{cases} +\frac{1}{S}\sum_{s=1}^S \mu_i^{(s)}, & \mu_i^{(s)} = \mathbb{E}[\tilde y_i \mid \theta^{(s)}] \quad \text{(within-sample)}\\[6pt] +\sum_{s=1}^S \tilde w_i^{(s)}\,\mu_i^{(s)}, & \text{(PSIS-LOO held-out approximation)} +\end{cases} +$$ +- Estimates: +$$ +\text{mse}^{\text{raw}} = \frac{1}{n}\sum_{i=1}^n \text{sqe}_i^{\text{raw}},\quad +\text{rmse}^{\text{raw}} = \sqrt{\text{mse}^{\text{raw}}} +$$ +- Standard Errors: + - For MSE: + $$ + se(\text{mse}^{\text{raw}}) = + \sqrt{\frac{\widehat{\text{Var}}\big(\text{sqe}_i^{\text{raw}}\big)}{n}} + $$ + - For RMSE (delta method): + $$ + se(\text{rmse}^{\text{raw}}) \approx \frac{se(\text{mse}^{\text{raw}})}{2\,\text{rmse}^{\text{raw}}} + $$ + If $\text{rmse} = 0$, set $se(\text{rmse}) = 0$. + +```{r} +rmse_ws <- rmse( + y = res_sleep$y, + mupred = res_sleep$mupred +) +rmse_loo <- rmse( + y = res_sleep$y, + mupred = res_sleep$mupred_loo +) +rmse_kfold <- rmse( + y = res_sleep$y, + mupred = res_sleep$mupred_kfold +) + +message("within-sample rmse:") +invisible(str(rmse_ws)) +message("psis-loo rmse:") +invisible(str(rmse_loo)) +message("kfold rmse:") +invisible(str(rmse_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(rmse_ws, rmse_loo, rmse_kfold, "squared-error_i", show_estimate = FALSE) +``` + +```{r} +mse_ws <- mse( + y = res_sleep$y, + mupred = res_sleep$mupred +) +mse_loo <- mse( + y = res_sleep$y, + mupred = res_sleep$mupred_loo +) +mse_kfold <- mse( + y = res_sleep$y, + mupred = res_sleep$mupred_kfold +) + +message("within-sample mse:") +invisible(str(mse_ws)) +message("psis-loo mse:") +invisible(str(mse_loo)) +message("kfold mse:") +invisible(str(mse_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(mse_ws, mse_loo, mse_kfold, "squared-error_i") +``` + +## Coefficient of determination (`r2`) +R2 summarizes how well point predictions $\hat\mu_i$ explain the variation in +observed outcomes $y_i$. We use the same point predictions, $\hat\mu_i$, as in +the mae/rmse sections (i.e., the posterior predictive mean per observation). We +define a predictive R2 compatible with cross-validation, not Bayesian-`R^2` +based on a model’s residual variance parameter. + +- Definition: + $$ + R^2 = 1 - \frac{\sum_{i=1}^n (y_i - \hat y_i)^2}{\sum_{i=1}^n (y_i - \bar y)^2},\quad \bar y = \frac{1}{n}\sum_{i=1}^n y_i + $$ + with $\hat y_i = \hat \mu_i$ formed as within-sample means, PSIS-LOO + reweighted means, or K-fold held-out means. +- Equivalently in mean form: + $$ + \widehat{\text{mse}} = \frac{1}{n}\sum_i (y_i - \hat y_i)^2,\quad + \widehat{\text{mse}}(y) = \frac{1}{n}\sum_i (y_i - \bar y)^2,\quad + \widehat{R}^2 = 1 - \frac{\widehat{\text{mse}}}{\widehat{\text{mse}}(y)} + $$ +- Standard error (first‑order delta method): + $$ + se(\widehat{R}^2) = \left[ + se(\widehat{\text{mse}})^2 + - 2\,\frac{\widehat{\text{mse}}}{\widehat{\text{mse}}(y)}\,\frac{\widehat{\text{Cov}}\big((y_i-\hat y_i)^2,\,(y_i-\bar y)^2\big)}{n} + + \frac{\widehat{\text{mse}}^2}{\widehat{\text{mse}}(y)^2}\,\frac{\widehat{\text{Var}}\big((y_i-\bar y)^2\big)}{n} + \right]^{1/2}\cdot \frac{1}{\widehat{\text{mse}}(y)} + $$ + where $\widehat{\text{mse}} = \frac{1}{n}\sum_{i=1}^n \text{sqe}_i$ and $\widehat{\text{mse}}(y) = \frac{1}{n}\sum_{i=1}^n (y_i - \bar y)^2$. + +```{r} +r2_ws <- r2( + y = res_sleep$y, + mupred = res_sleep$mupred +) +r2_loo <- r2( + y = res_sleep$y, + mupred = res_sleep$mupred_loo +) +r2_kfold <- r2( + y = res_sleep$y, + mupred = res_sleep$mupred_kfold +) + +message("within-sample r2:") +invisible(str(r2_ws)) +message("psis-loo r2:") +invisible(str(r2_loo)) +message("kfold r2:") +invisible(str(r2_kfold)) +``` + +```{r, fig.height=3, fig.width=7, fig.align="center"} +plot_ptw(r2_ws, r2_loo, r2_kfold, "squared-error_i", show_estimate = FALSE) +``` \ No newline at end of file