Source code for skein_glm.ic

"""Information-criterion λ selection for path estimators.

`select_by_ic(path_model, x, *outcomes, criterion=...)` picks the best
λ from a fitted `*PathRegressor` by minimizing AIC, BIC, or EBIC. It
composes with any existing path estimator without per-class wrappers
(unlike the CV API, where the K-fold loop has to live inside an
estimator); the user simply asks "which λ on this fitted path is the
IC-best one?"

Per-datafit dispatch sniffs the path estimator's class name. The four
GLM families currently supported:

- LS (`MCPPathRegressor`, `SCADPathRegressor`, `*Group*PathRegressor`):
  NLL = `(n/2) · log(RSS/n)` (drops the additive Gaussian constant —
  fine for AIC/BIC since they're additive-constant-invariant).
- Logistic: NLL = `Σ_i softplus(η_i) − y_i · η_i` (binomial NLL up to
  the y log y constant).
- Poisson: NLL = `Σ_i exp(η_i) − y_i · η_i` (drops `log(y_i!)`).
- Cox PH: NLL is read from `path_model.info_["final_losses"]` (Cox
  partial NLL per sample, multiplied by `n` to get the total).

Effective degrees of freedom = the active-set size at each λ
(`np.sum(|β| > eps)`). This matches the unbiased-estimator df result
for lasso (Zou-Hastie-Tibshirani) and is the standard convention in
ncvreg / glmnet.

Three criteria, all minimized:

- AIC: `2k + 2·NLL`
- BIC: `log(n)·k + 2·NLL`
- EBIC (Chen-Chen): `BIC + 2γ·log C(p,k)` with `γ ∈ [0, 1]` (default
  0.5). Tighter than BIC when `p ≫ n`; matches what `ncvreg::BIC`
  uses for high-dim selection.
"""
from __future__ import annotations


import numpy as np
from numpy.typing import NDArray

from skein_glm.estimators import _is_sparse  # noqa: F401  (kept for potential downstream use)


_DEFAULT_ACTIVE_EPS = 1e-12


def _active_size(coefs: NDArray[np.float64], eps: float = _DEFAULT_ACTIVE_EPS) -> NDArray[np.int64]:
    """Per-λ count of non-zero β entries."""
    return np.sum(np.abs(coefs) > eps, axis=1).astype(np.int64)


def _compute_nll_ls(path_model, x, y) -> NDArray[np.float64]:
    """Gaussian-LS NLL up to additive constants: `(n/2) · log(RSS/n)`."""
    pred = np.asarray(path_model.predict(x))  # (n, n_lambdas)
    if pred.ndim == 1:
        pred = pred[:, None]
    y_col = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    rss = np.sum((y_col - pred) ** 2, axis=0)
    n = pred.shape[0]
    return 0.5 * n * np.log(np.maximum(rss / n, 1e-15))


def _compute_nll_logistic(path_model, x, y) -> NDArray[np.float64]:
    """Binomial NLL: `Σ softplus(η) − y·η`. `decision_function` returns
    η directly (path version: shape `(n, n_lambdas)`)."""
    eta = np.asarray(path_model.decision_function(x))
    if eta.ndim == 1:
        eta = eta[:, None]
    # Numerically stable softplus.
    sp = np.maximum(eta, 0.0) + np.log1p(np.exp(-np.abs(eta)))
    y_col = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    return np.sum(sp - y_col * eta, axis=0)


def _compute_nll_poisson(path_model, x, y) -> NDArray[np.float64]:
    """Poisson (log-link) NLL up to `log(y!)`: `Σ exp(η) − y·η`. The
    same η-clamp the Rust core uses (`±30`) keeps `exp(η)` finite."""
    eta = np.asarray(path_model.decision_function(x))
    if eta.ndim == 1:
        eta = eta[:, None]
    eta_clamped = np.clip(eta, -30.0, 30.0)
    mu = np.exp(eta_clamped)
    y_col = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    return np.sum(mu - y_col * eta_clamped, axis=0)


def _compute_nll_multinomial(path_model, x, y) -> NDArray[np.float64]:
    """Multinomial NLL: per-λ Σ_i (logsumexp(η_i) − η_{i, y_i}). The path
    estimator's `coefs_` has shape `(n_lambdas, K, p)` and η is the per-λ
    `(n, K)` matrix from `decision_function` recomputed at each λ — but
    `decision_function` only honors the single-best-λ refit, so we
    recompute η directly off `coefs_` / `intercepts_`."""
    coefs_3d: NDArray[np.float64] = path_model.coefs_  # (n_lambdas, K, p)
    intercepts: NDArray[np.float64] = path_model.intercepts_  # (n_lambdas, K)
    n_lambdas = coefs_3d.shape[0]
    classes = np.asarray(path_model.classes_)
    y_arr = np.asarray(y)
    codes = np.searchsorted(classes, y_arr).astype(np.intp)
    n = codes.shape[0]

    # Build a single (n, K) η per λ.
    nll = np.zeros(n_lambdas)
    rows = np.arange(n)
    for li in range(n_lambdas):
        if hasattr(x, "toarray"):
            eta = (x @ coefs_3d[li].T)
            if hasattr(eta, "toarray"):
                eta = eta.toarray()
            eta = np.asarray(eta) + intercepts[li]
        else:
            eta = np.asarray(x) @ coefs_3d[li].T + intercepts[li]
        m = eta.max(axis=1, keepdims=True)
        lse = (np.log(np.exp(eta - m).sum(axis=1, keepdims=True)) + m).ravel()
        eta_y = eta[rows, codes]
        nll[li] = float(np.sum(lse - eta_y))
    return nll


def _multinomial_active_size(coefs_3d: NDArray[np.float64], eps: float) -> NDArray[np.int64]:
    """Active-feature count per λ for row-grouped multinomial: a feature
    is active if **any** of its K class coefficients is nonzero. `coefs_3d`
    has shape `(n_lambdas, K, p)`."""
    # Take max-abs across the class axis, then count features above eps.
    max_abs_per_feature = np.max(np.abs(coefs_3d), axis=1)  # (n_lambdas, p)
    return np.sum(max_abs_per_feature > eps, axis=1).astype(np.int64)


def _compute_nll_cox(path_model, time) -> NDArray[np.float64]:
    """Cox PH NLL: `path_model.info_["final_losses"][k]` is already the
    Breslow per-sample partial NLL at λ_k (the same expression
    `CoxPH::loss` returns), so multiply by `n` to get the total."""
    losses = np.asarray(path_model.info_["final_losses"], dtype=np.float64)
    n = len(time)
    return losses * n


def _detect_family(path_model) -> str:
    """Sniff the datafit family from the estimator's class name. Cox
    has the unique `(time, event)` outcome shape and so the dispatch
    is part of the contract; logistic/Poisson/LS are picked for the
    NLL helper but otherwise share one path."""
    cls = type(path_model).__name__
    if "Cox" in cls:
        return "cox"
    if "Multinomial" in cls:
        return "multinomial"
    if "Logistic" in cls:
        return "logistic"
    if "Poisson" in cls:
        return "poisson"
    return "ls"


[docs] def select_by_ic( path_model, x, *outcomes, criterion: str = "bic", ebic_gamma: float = 0.5, active_eps: float = _DEFAULT_ACTIVE_EPS, ) -> tuple[int, NDArray[np.float64]]: """Pick the best λ from a fitted path estimator by AIC, BIC, or EBIC. A single free function — no per-estimator wrapper. Dispatches on the path estimator's class name to compute the right negative log-likelihood (LS, logistic, Poisson, or Cox), then adds a complexity penalty: - AIC = 2k + 2·NLL - BIC = log(n)·k + 2·NLL - EBIC = BIC + 2γ·log C(p, k), with γ ∈ [0, 1] (default 0.5; matches ``ncvreg::BIC``'s high-dim recommendation) where k is the number of nonzero coefficients per λ (the Zou-Hastie-Tibshirani unbiased df estimator, the standard ``ncvreg`` / ``glmnet`` convention). Parameters ---------- path_model : *PathRegressor Any fitted path estimator (LS / logistic / Poisson / Cox). x : array-like The design matrix used in the fit. Used to recompute the per-λ negative log-likelihood. *outcomes For non-Cox estimators: a single ``y`` array. For Cox: ``time, event``. Mirrors each estimator's ``fit`` signature. criterion : {"aic", "bic", "ebic"}, default "bic" Which information criterion to use. ebic_gamma : float, default 0.5 EBIC penalty parameter γ ∈ [0, 1]. Ignored for AIC/BIC. active_eps : float, default 1e-12 Threshold for counting a coefficient as "active" (nonzero). Returns ------- best_idx : int Index into ``path_model.lambdas_`` of the IC-minimizing λ. scores : ndarray of shape (n_lambdas,) Per-λ score vector (lower-is-better). The fitted β is ``path_model.coefs_[best_idx]``. Examples -------- >>> import skein_glm >>> path = skein_glm.MCPPathRegressor(gamma=3.0, n_lambdas=50).fit(X, y) >>> best_idx, scores = skein_glm.select_by_ic(path, X, y, criterion="bic") >>> beta_best = path.coefs_[best_idx] >>> intercept_best = path.intercepts_[best_idx] For Cox PH: >>> cox_path = skein_glm.CoxMCPPathRegressor(gamma=3.0, n_lambdas=50).fit( ... X, time, event) >>> best_idx, _ = skein_glm.select_by_ic(cox_path, X, time, event, criterion="ebic") """ if criterion not in ("aic", "bic", "ebic"): raise ValueError( f"criterion must be 'aic', 'bic', or 'ebic'; got {criterion!r}" ) if not 0.0 <= ebic_gamma <= 1.0: raise ValueError( f"ebic_gamma must be in [0, 1]; got {ebic_gamma}" ) family = _detect_family(path_model) if family == "cox": if len(outcomes) != 2: raise ValueError( "Cox path estimators require `(time, event)` outcomes" ) time, _event = outcomes nll = _compute_nll_cox(path_model, time) n = len(time) else: if len(outcomes) != 1: raise ValueError( f"{family} path estimators take a single `y` outcome" ) y = outcomes[0] if family == "logistic": nll = _compute_nll_logistic(path_model, x, y) elif family == "poisson": nll = _compute_nll_poisson(path_model, x, y) elif family == "multinomial": nll = _compute_nll_multinomial(path_model, x, y) else: nll = _compute_nll_ls(path_model, x, y) n = len(y) coefs: NDArray[np.float64] = path_model.coefs_ if family == "multinomial": # coefs_ shape (n_lambdas, K, p); df = active row count. p = coefs.shape[2] k = _multinomial_active_size(coefs, active_eps).astype(np.float64) else: p = coefs.shape[1] k = _active_size(coefs, active_eps).astype(np.float64) if criterion == "aic": scores = 2.0 * k + 2.0 * nll elif criterion == "bic": scores = np.log(n) * k + 2.0 * nll else: # "ebic" from scipy.special import gammaln # type: ignore[import-untyped] # log C(p, k) = lgamma(p+1) - lgamma(k+1) - lgamma(p-k+1). log_binom = ( gammaln(p + 1.0) - gammaln(k + 1.0) - gammaln(p - k + 1.0) ) scores = np.log(n) * k + 2.0 * nll + 2.0 * ebic_gamma * log_binom best_idx = int(np.argmin(scores)) return best_idx, scores