Source code for skein_glm.graph_selection

"""Model selection for graphical models.

The headline export is :func:`ebic_path`, which walks a λ-grid and
selects the precision matrix that minimises the Extended Bayesian
Information Criterion (Foygel & Drton 2010). EBIC is the field-standard
tuning rule in network psychometrics — it penalises edge count more
aggressively than BIC and gives stable graphs even at small `n / p`.

For joint glasso, :func:`joint_ebic_path` walks a `(λ_2)` grid summing
per-population log-likelihoods and counting the *union* of active edges
across populations.

Conventions:

- `gamma ∈ [0, 1]` controls the EBIC strength. `gamma = 0` is plain
  BIC; `gamma = 0.5` is the field default for graphical models.
- `n` is required when the user supplied a precomputed covariance
  (we can't infer it from `S` shape).
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import numpy as np
from numpy.typing import NDArray


[docs] @dataclass class EBICPathResult: """Outcome of a single-population EBIC search. Attributes ---------- best_estimator : fitted estimator at the EBIC-selected λ. best_lambda : float best_ebic : float lambdas : ndarray (n_lambdas,) ebic : ndarray (n_lambdas,) n_edges : ndarray (n_lambdas,) int """ best_estimator: Any best_lambda: float best_ebic: float lambdas: NDArray[np.float64] ebic: NDArray[np.float64] n_edges: NDArray[np.int64]
def _count_edges(theta: NDArray[np.float64], atol: float = 1e-8) -> int: """Number of nonzero upper-triangular off-diagonal entries.""" p = theta.shape[0] iu = np.triu_indices(p, k=1) return int(np.sum(np.abs(theta[iu]) > atol)) def _gaussian_loglik(s: NDArray[np.float64], theta: NDArray[np.float64], n: float) -> float: """Gaussian log-likelihood (up to constants) of `Θ̂` given sample covariance `S` and effective sample size `n`: ℓ(Θ̂; S) = (n / 2) · (log det Θ̂ − tr(SΘ̂)). """ sign, logdet = np.linalg.slogdet(theta) if sign <= 0: # Non-PD; return -inf to disqualify. return -np.inf return 0.5 * n * (logdet - float(np.trace(s @ theta)))
[docs] def ebic_path( X, estimator_cls, lambdas: NDArray[np.float64], *, gamma: float = 0.5, n: int | None = None, assume_centered: bool = False, **estimator_kwargs, ) -> EBICPathResult: """Sweep a λ-grid for a graphical estimator and pick the model minimising EBIC. Parameters ---------- X : ndarray Either raw `(n, p)` data or a precomputed `(p, p)` symmetric covariance. If precomputed, pass `n` explicitly. estimator_cls : a class with an `alpha` parameter (GraphicalLasso, GraphicalMCP, GraphicalSCAD). lambdas : array-like of floats (positive, ideally in descending order). gamma : float, default 0.5 EBIC strength. `0` → BIC. n : int, optional Effective sample size. Required if `X` is precomputed covariance. assume_centered : bool, default False **estimator_kwargs : passed through to the estimator constructor. Returns ------- EBICPathResult """ from skein_glm.estimators import _is_symmetric, _to_covariance x = np.ascontiguousarray(X, dtype=np.float64) if x.ndim != 2: raise ValueError(f"X must be 2D, got shape {x.shape}") rows, cols = x.shape if rows == cols and _is_symmetric(x): if n is None: raise ValueError( "n is required when X is a precomputed covariance matrix" ) s = x p = cols else: s = _to_covariance(x, assume_centered=assume_centered) n = rows if n is None else n p = s.shape[0] assert n is not None lambdas = np.ascontiguousarray(lambdas, dtype=np.float64) if lambdas.ndim != 1 or lambdas.size == 0: raise ValueError("lambdas must be a non-empty 1D array") if np.any(lambdas <= 0): raise ValueError("lambdas must be positive") ebic_values = np.empty(lambdas.size, dtype=np.float64) n_edges = np.empty(lambdas.size, dtype=np.int64) estimators: list[Any] = [] log_p = np.log(p) if p > 1 else 0.0 log_n = np.log(n) if n > 1 else 0.0 for i, lam in enumerate(lambdas): est = estimator_cls(alpha=float(lam), assume_centered=assume_centered, **estimator_kwargs) est.fit(s) e = _count_edges(est.precision_) ll = _gaussian_loglik(s, est.precision_, n) ebic_values[i] = -2.0 * ll + e * log_n + 4.0 * gamma * e * log_p n_edges[i] = e estimators.append(est) best_i = int(np.argmin(ebic_values)) return EBICPathResult( best_estimator=estimators[best_i], best_lambda=float(lambdas[best_i]), best_ebic=float(ebic_values[best_i]), lambdas=lambdas, ebic=ebic_values, n_edges=n_edges, )
[docs] @dataclass class JointEBICPathResult: best_estimator: Any best_lambda_2: float best_ebic: float lambdas: NDArray[np.float64] ebic: NDArray[np.float64] n_edges_union: NDArray[np.int64]
[docs] def joint_ebic_path( Xs: list, estimator_cls, lambdas: NDArray[np.float64], *, gamma: float = 0.5, ns: list[int] | None = None, assume_centered: bool = False, **estimator_kwargs, ) -> JointEBICPathResult: """EBIC tuner for joint glasso. Walks a `λ_2` grid; the active edge count is the *union* across populations (an edge counts once if any population has it nonzero). Sums per-population log-likelihoods. Per-pop `n_k` is read from raw `X^(k)` row count, or via the `ns` argument when populations are passed as precomputed covariances. """ from skein_glm.estimators import _is_symmetric, _to_covariance if not isinstance(Xs, (list, tuple)) or len(Xs) == 0: raise ValueError("Xs must be a non-empty list of arrays") covs: list[NDArray[np.float64]] = [] n_list: list[int] = [] p: int | None = None for k, x in enumerate(Xs): xa = np.ascontiguousarray(x, dtype=np.float64) if xa.ndim != 2: raise ValueError(f"Xs[{k}] must be 2D, got shape {xa.shape}") rows, cols = xa.shape if rows == cols and _is_symmetric(xa): if ns is None: raise ValueError( "ns is required when Xs contain precomputed covariance matrices" ) covs.append(xa) n_list.append(int(ns[k])) this_p = cols else: covs.append(_to_covariance(xa, assume_centered=assume_centered)) n_list.append(int(rows)) this_p = cols if p is None: p = this_p elif this_p != p: raise ValueError("populations must share the same number of features") assert p is not None lambdas = np.ascontiguousarray(lambdas, dtype=np.float64) if lambdas.ndim != 1 or lambdas.size == 0: raise ValueError("lambdas must be a non-empty 1D array") ebic_values = np.empty(lambdas.size, dtype=np.float64) n_edges_union = np.empty(lambdas.size, dtype=np.int64) estimators: list[Any] = [] log_p = np.log(p) if p > 1 else 0.0 n_total = float(sum(n_list)) log_n_total = np.log(n_total) if n_total > 1 else 0.0 for i, lam in enumerate(lambdas): est = estimator_cls( lambda_2=float(lam), assume_centered=assume_centered, **estimator_kwargs, ) est.fit(Xs) # Per-pop log-lik, edge union. ll_total = 0.0 union_mask = np.zeros((p, p), dtype=bool) for theta, s, n_k in zip(est.precisions_, covs, n_list): ll_total += _gaussian_loglik(s, theta, float(n_k)) union_mask |= np.abs(theta) > 1e-8 iu = np.triu_indices(p, k=1) e = int(np.sum(union_mask[iu])) ebic_values[i] = -2.0 * ll_total + e * log_n_total + 4.0 * gamma * e * log_p n_edges_union[i] = e estimators.append(est) best_i = int(np.argmin(ebic_values)) return JointEBICPathResult( best_estimator=estimators[best_i], best_lambda_2=float(lambdas[best_i]), best_ebic=float(ebic_values[best_i]), lambdas=lambdas, ebic=ebic_values, n_edges_union=n_edges_union, )