Source code for skein_glm.ic

"""Information-criterion λ selection for path estimators.

`select_by_ic(path_model, x, *outcomes, criterion=...)` picks the best
λ from a fitted `*PathRegressor` by minimizing AIC, BIC, or EBIC. It
composes with any existing path estimator without per-class wrappers
(unlike the CV API, where the K-fold loop has to live inside an
estimator); the user simply asks "which λ on this fitted path is the
IC-best one?"

Per-datafit dispatch sniffs the path estimator's class name. The four
GLM families currently supported:

- LS (`MCPPathRegressor`, `SCADPathRegressor`, `*Group*PathRegressor`):
  NLL = `(n/2) · log(RSS/n)` (drops the additive Gaussian constant —
  fine for AIC/BIC since they're additive-constant-invariant).
- Logistic: NLL = `Σ_i softplus(η_i) − y_i · η_i` (binomial NLL up to
  the y log y constant).
- Poisson: NLL = `Σ_i exp(η_i) − y_i · η_i` (drops `log(y_i!)`).
- Cox PH: NLL is read from `path_model.info_["final_losses"]` (Cox
  partial NLL per sample, multiplied by `n` to get the total).

Effective degrees of freedom = the active-set size at each λ
(`np.sum(|β| > eps)`). This matches the unbiased-estimator df result
for lasso (Zou-Hastie-Tibshirani) and is the standard convention in
ncvreg / glmnet.

Three criteria, all minimized:

- AIC: `2k + 2·NLL`
- BIC: `log(n)·k + 2·NLL`
- EBIC (Chen-Chen): `BIC + 2γ·log C(p,k)` with `γ ∈ [0, 1]` (default
  0.5). Tighter than BIC when `p ≫ n`; matches what `ncvreg::BIC`
  uses for high-dim selection.
"""
from __future__ import annotations


import numpy as np
from numpy.typing import NDArray

from skein_glm.estimators import _is_sparse  # noqa: F401  (kept for potential downstream use)


_DEFAULT_ACTIVE_EPS = 1e-12


def _active_size(coefs: NDArray[np.float64], eps: float = _DEFAULT_ACTIVE_EPS) -> NDArray[np.int64]:
    """Per-λ count of non-zero β entries."""
    return np.sum(np.abs(coefs) > eps, axis=1).astype(np.int64)


def _compute_nll_ls(path_model, x, y) -> NDArray[np.float64]:
    """Gaussian-LS NLL up to additive constants: `(n/2) · log(RSS/n)`."""
    pred = np.asarray(path_model.predict(x))  # (n, n_lambdas)
    if pred.ndim == 1:
        pred = pred[:, None]
    y_col = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    rss = np.sum((y_col - pred) ** 2, axis=0)
    n = pred.shape[0]
    return 0.5 * n * np.log(np.maximum(rss / n, 1e-15))


def _compute_nll_logistic(path_model, x, y) -> NDArray[np.float64]:
    """Binomial NLL: `Σ softplus(η) − y·η`. `decision_function` returns
    η directly (path version: shape `(n, n_lambdas)`)."""
    eta = np.asarray(path_model.decision_function(x))
    if eta.ndim == 1:
        eta = eta[:, None]
    # Numerically stable softplus.
    sp = np.maximum(eta, 0.0) + np.log1p(np.exp(-np.abs(eta)))
    y_col = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    return np.sum(sp - y_col * eta, axis=0)


def _compute_nll_poisson(path_model, x, y) -> NDArray[np.float64]:
    """Poisson (log-link) NLL up to `log(y!)`: `Σ exp(η) − y·η`. The
    same η-clamp the Rust core uses (`±30`) keeps `exp(η)` finite."""
    eta = np.asarray(path_model.decision_function(x))
    if eta.ndim == 1:
        eta = eta[:, None]
    eta_clamped = np.clip(eta, -30.0, 30.0)
    mu = np.exp(eta_clamped)
    y_col = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    return np.sum(mu - y_col * eta_clamped, axis=0)


def _compute_nll_multinomial(path_model, x, y) -> NDArray[np.float64]:
    """Multinomial NLL: per-λ Σ_i (logsumexp(η_i) − η_{i, y_i}). The path
    estimator's `coefs_` has shape `(n_lambdas, K, p)` and η is the per-λ
    `(n, K)` matrix from `decision_function` recomputed at each λ — but
    `decision_function` only honors the single-best-λ refit, so we
    recompute η directly off `coefs_` / `intercepts_`."""
    coefs_3d: NDArray[np.float64] = path_model.coefs_  # (n_lambdas, K, p)
    intercepts: NDArray[np.float64] = path_model.intercepts_  # (n_lambdas, K)
    n_lambdas = coefs_3d.shape[0]
    classes = np.asarray(path_model.classes_)
    y_arr = np.asarray(y)
    codes = np.searchsorted(classes, y_arr).astype(np.intp)
    n = codes.shape[0]

    # Build a single (n, K) η per λ.
    nll = np.zeros(n_lambdas)
    rows = np.arange(n)
    for li in range(n_lambdas):
        if hasattr(x, "toarray"):
            eta = (x @ coefs_3d[li].T)
            if hasattr(eta, "toarray"):
                eta = eta.toarray()
            eta = np.asarray(eta) + intercepts[li]
        else:
            eta = np.asarray(x) @ coefs_3d[li].T + intercepts[li]
        m = eta.max(axis=1, keepdims=True)
        lse = (np.log(np.exp(eta - m).sum(axis=1, keepdims=True)) + m).ravel()
        eta_y = eta[rows, codes]
        nll[li] = float(np.sum(lse - eta_y))
    return nll


def _multinomial_active_size(coefs_3d: NDArray[np.float64], eps: float) -> NDArray[np.int64]:
    """Active-feature count per λ for row-grouped multinomial: a feature
    is active if **any** of its K class coefficients is nonzero. `coefs_3d`
    has shape `(n_lambdas, K, p)`."""
    # Take max-abs across the class axis, then count features above eps.
    max_abs_per_feature = np.max(np.abs(coefs_3d), axis=1)  # (n_lambdas, p)
    return np.sum(max_abs_per_feature > eps, axis=1).astype(np.int64)


def _compute_nll_cox(path_model, time) -> NDArray[np.float64]:
    """Cox PH NLL: `path_model.info_["final_losses"][k]` is already the
    Breslow per-sample partial NLL at λ_k (the same expression
    `CoxPH::loss` returns), so multiply by `n` to get the total."""
    losses = np.asarray(path_model.info_["final_losses"], dtype=np.float64)
    n = len(time)
    return losses * n


def _detect_family(path_model) -> str:
    """Sniff the datafit family from the estimator's class name. Cox
    has the unique `(time, event)` outcome shape and so the dispatch
    is part of the contract; logistic/Poisson/LS are picked for the
    NLL helper but otherwise share one path."""
    cls = type(path_model).__name__
    if "Cox" in cls:
        return "cox"
    if "Multinomial" in cls:
        return "multinomial"
    if "Logistic" in cls:
        return "logistic"
    if "Poisson" in cls:
        return "poisson"
    return "ls"



[docs]
def select_by_ic(
    path_model,
    x,
    *outcomes,
    criterion: str = "bic",
    ebic_gamma: float = 0.5,
    active_eps: float = _DEFAULT_ACTIVE_EPS,
) -> tuple[int, NDArray[np.float64]]:
    """Pick the best λ from a fitted path estimator by AIC, BIC, or EBIC.

    A single free function — no per-estimator wrapper. Dispatches on
    the path estimator's class name to compute the right negative
    log-likelihood (LS, logistic, Poisson, or Cox), then adds a
    complexity penalty:

    - AIC = 2k + 2·NLL
    - BIC = log(n)·k + 2·NLL
    - EBIC = BIC + 2γ·log C(p, k), with γ ∈ [0, 1] (default 0.5;
      matches ``ncvreg::BIC``'s high-dim recommendation)

    where k is the number of nonzero coefficients per λ (the
    Zou-Hastie-Tibshirani unbiased df estimator, the standard
    ``ncvreg`` / ``glmnet`` convention).

    Parameters
    ----------
    path_model : *PathRegressor
        Any fitted path estimator (LS / logistic / Poisson / Cox).
    x : array-like
        The design matrix used in the fit. Used to recompute the
        per-λ negative log-likelihood.
    *outcomes
        For non-Cox estimators: a single ``y`` array. For Cox:
        ``time, event``. Mirrors each estimator's ``fit`` signature.
    criterion : {"aic", "bic", "ebic"}, default "bic"
        Which information criterion to use.
    ebic_gamma : float, default 0.5
        EBIC penalty parameter γ ∈ [0, 1]. Ignored for AIC/BIC.
    active_eps : float, default 1e-12
        Threshold for counting a coefficient as "active" (nonzero).

    Returns
    -------
    best_idx : int
        Index into ``path_model.lambdas_`` of the IC-minimizing λ.
    scores : ndarray of shape (n_lambdas,)
        Per-λ score vector (lower-is-better). The fitted β is
        ``path_model.coefs_[best_idx]``.

    Examples
    --------
    >>> import skein_glm
    >>> path = skein_glm.MCPPathRegressor(gamma=3.0, n_lambdas=50).fit(X, y)
    >>> best_idx, scores = skein_glm.select_by_ic(path, X, y, criterion="bic")
    >>> beta_best = path.coefs_[best_idx]
    >>> intercept_best = path.intercepts_[best_idx]

    For Cox PH:

    >>> cox_path = skein_glm.CoxMCPPathRegressor(gamma=3.0, n_lambdas=50).fit(
    ...     X, time, event)
    >>> best_idx, _ = skein_glm.select_by_ic(cox_path, X, time, event, criterion="ebic")
    """
    if criterion not in ("aic", "bic", "ebic"):
        raise ValueError(
            f"criterion must be 'aic', 'bic', or 'ebic'; got {criterion!r}"
        )
    if not 0.0 <= ebic_gamma <= 1.0:
        raise ValueError(
            f"ebic_gamma must be in [0, 1]; got {ebic_gamma}"
        )

    family = _detect_family(path_model)
    if family == "cox":
        if len(outcomes) != 2:
            raise ValueError(
                "Cox path estimators require `(time, event)` outcomes"
            )
        time, _event = outcomes
        nll = _compute_nll_cox(path_model, time)
        n = len(time)
    else:
        if len(outcomes) != 1:
            raise ValueError(
                f"{family} path estimators take a single `y` outcome"
            )
        y = outcomes[0]
        if family == "logistic":
            nll = _compute_nll_logistic(path_model, x, y)
        elif family == "poisson":
            nll = _compute_nll_poisson(path_model, x, y)
        elif family == "multinomial":
            nll = _compute_nll_multinomial(path_model, x, y)
        else:
            nll = _compute_nll_ls(path_model, x, y)
        n = len(y)

    coefs: NDArray[np.float64] = path_model.coefs_
    if family == "multinomial":
        # coefs_ shape (n_lambdas, K, p); df = active row count.
        p = coefs.shape[2]
        k = _multinomial_active_size(coefs, active_eps).astype(np.float64)
    else:
        p = coefs.shape[1]
        k = _active_size(coefs, active_eps).astype(np.float64)

    if criterion == "aic":
        scores = 2.0 * k + 2.0 * nll
    elif criterion == "bic":
        scores = np.log(n) * k + 2.0 * nll
    else:  # "ebic"
        from scipy.special import gammaln  # type: ignore[import-untyped]
        # log C(p, k) = lgamma(p+1) - lgamma(k+1) - lgamma(p-k+1).
        log_binom = (
            gammaln(p + 1.0) - gammaln(k + 1.0) - gammaln(p - k + 1.0)
        )
        scores = np.log(n) * k + 2.0 * nll + 2.0 * ebic_gamma * log_binom

    best_idx = int(np.argmin(scores))
    return best_idx, scores