Source code for skein_glm.debiased

"""Debiased / desparsified lasso for least squares.

Van de Geer–Bühlmann–Ritov–Dezeure (2014) confidence intervals and
p-values for high-dimensional lasso regression. Constructs an
approximate inverse Gram `Θ̂` via per-column **nodewise lassos**
(Meinshausen–Bühlmann 2006), uses it to debias the penalized
estimator, and reports asymptotic Gaussian inference on every
coordinate of `β̂_d`.

The debiased estimator is

    β̂_d = β̂ + (1 / n) · Θ̂ · Xᵀ (y − X β̂)

with asymptotic distribution (under standard sparsity / restricted-
eigenvalue conditions)

    √n · (β̂_d − β) ⇝ N(0, σ² · Θ̂ Σ̂ Θ̂ᵀ).

This is the one inference feature `glmnet` and `ncvreg` do not
offer; the canonical R implementation is `hdi::lasso.proj`. We
match its semantics: free function on `(X, y)` returning a result
dataclass plus a thin sklearn-style `DebiasedLassoRegressor`
wrapper.

Scope (v1): least-squares only, dense `X`. GLM (logistic / Poisson)
debiasing via the weighted-LS surrogate + Fisher information is a
planned follow-up.

References
----------
- Van de Geer, S., Bühlmann, P., Ritov, Y., Dezeure, R. (2014).
  *On asymptotically optimal confidence regions and tests for
  high-dimensional models.* Annals of Statistics 42(3): 1166–1202.
- Zhang, C.-H., Zhang, S. (2014). *Confidence intervals for low-
  dimensional parameters in high-dimensional linear models.*
  JRSS B 76(1): 217–242.
- Reid, S., Tibshirani, R., Friedman, J. (2016). *A study of error
  variance estimation in lasso regression.* Statistica Sinica 26:
  35–67.
"""
from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import numpy as np
from numpy.typing import NDArray
from scipy import stats
from sklearn.base import BaseEstimator, RegressorMixin

from skein_glm.estimators import (
    CoxMCPRegressor,
    ElasticNetRegressor,
    LogisticLassoRegressor,
    PoissonLassoRegressor,
)

_ACTIVE_EPS = 1e-10


# --- result dataclass -------------------------------------------------



[docs]
@dataclass
class DebiasedLassoResult:
    """Output of :func:`debiased_lasso`.

    All coefficient arrays are returned on the **original (un-
    standardized) feature scale**. ``Theta`` is on the standardized
    scale used internally (where the nodewise lasso defaults are
    dimensionless); it is exposed for inspection and reuse.

    Attributes
    ----------
    coef_debiased : ndarray (p,)
        Desparsified estimator `β̂_d`.
    coef_lasso : ndarray (p,)
        The underlying penalized lasso fit `β̂` (also original-scale).
    intercept_ : float
    se : ndarray (p,)
        Asymptotic standard error from the diagonal of
        `σ̂² · Θ̂ Σ̂ Θ̂ᵀ / n`.
    ci_lower, ci_upper : ndarray (p,)
        Two-sided `(1 − alpha)` CIs.
    pvalues : ndarray (p,)
        Two-sided Wald p-values for `H_0: β_j = 0`.
    z_scores : ndarray (p,)
        `β̂_d / se`.
    sigma_hat : float
        Residual scale `‖y − ŷ‖ / sqrt(n − ‖β̂‖₀)`.
    Theta : ndarray (p, p)
        Approximate inverse Gram on the standardized scale.
    lambda_main : float
    lambda_nodewise : ndarray (p,)
    alpha : float
        CI level used.
    """

    coef_debiased: NDArray[np.float64]
    coef_lasso: NDArray[np.float64]
    intercept_: float
    se: NDArray[np.float64]
    ci_lower: NDArray[np.float64]
    ci_upper: NDArray[np.float64]
    pvalues: NDArray[np.float64]
    z_scores: NDArray[np.float64]
    sigma_hat: float
    Theta: NDArray[np.float64]
    lambda_main: float
    lambda_nodewise: NDArray[np.float64]
    alpha: float



# --- helpers ----------------------------------------------------------


def _theoretical_lambda(n: int, p: int) -> float:
    """VBR theoretical scale `√(2 log p / n)`.

    Dimensionless — appropriate when columns are standardized. For
    the main lasso this is the *base* scale; users typically prefer
    a CV-tuned `lambda_` and override the default.
    """
    return float(np.sqrt(2.0 * np.log(max(p, 2)) / n))


def _fit_nodewise_column(
    Xs: NDArray[np.float64],
    j: int,
    lambda_j: float,
    max_iter: int,
    tol: float,
) -> tuple[NDArray[np.float64], float]:
    """Run the j-th nodewise lasso: regress ``X_s[:, j]`` on the
    remaining columns.

    Returns
    -------
    gamma : ndarray (p - 1,)
        Lasso coefficients with ``j`` deleted.
    tau2 : float
        VBR `τ̂_j² = ‖X_j − X_{−j} γ̂‖² / n + λ_j · ‖γ̂‖₁`. The `+ λ‖γ‖₁`
        correction is the residual-norm-plus-penalty term that makes
        Θ̂ exactly satisfy the row-wise KKT inversion identity (VBR
        Lemma 6.2). Using the plain residual norm gives a slightly
        biased Θ̂ at finite `n`.
    """
    n = Xs.shape[0]
    Xmj = np.delete(Xs, j, axis=1)
    xj = Xs[:, j].copy()

    fit = ElasticNetRegressor(
        lambda_=lambda_j,
        alpha=1.0,
        fit_intercept=False,
        standardize=False,
        max_iter=max_iter,
        tol=tol,
    ).fit(Xmj, xj)
    gamma = np.asarray(fit.coef_, dtype=np.float64)

    resid = xj - Xmj @ gamma
    tau2 = float(np.dot(resid, resid)) / n + lambda_j * float(np.sum(np.abs(gamma)))
    return gamma, tau2


def _assemble_theta_rows(
    gammas: list[NDArray[np.float64]],
    tau2s: NDArray[np.float64],
) -> NDArray[np.float64]:
    """Build Θ̂ row-by-row from the nodewise outputs.

    Row j is `(−γ̂_{j,1}, …, 1, …, −γ̂_{j,p−1}) / τ̂_j²` with the `1`
    in position j. Θ̂ is **not symmetrized** here; VBR's variance
    formula uses the symmetric quadratic form `Θ̂ Σ̂ Θ̂ᵀ` so
    symmetrization at this stage is unnecessary.
    """
    p = len(gammas)
    Theta = np.zeros((p, p), dtype=np.float64)
    for j, (g, t2) in enumerate(zip(gammas, tau2s)):
        row = np.empty(p, dtype=np.float64)
        row[:j] = -g[:j]
        row[j] = 1.0
        row[j + 1 :] = -g[j:]
        Theta[j, :] = row / t2
    return Theta


# --- main entry point ------------------------------------------------



[docs]
def debiased_lasso(
    X: Any,
    y: Any,
    *,
    lambda_: float | None = None,
    lambda_nodewise: float | NDArray[np.float64] | None = None,
    alpha: float = 0.05,
    fit_intercept: bool = True,
    standardize: bool = True,
    max_iter: int = 1000,
    tol: float = 1e-7,
    n_jobs: int | None = None,
) -> DebiasedLassoResult:
    """Van de Geer–Bühlmann–Ritov debiased lasso for least squares.

    Parameters
    ----------
    X : array-like (n, p)
    y : array-like (n,)
    lambda_ : float or None, default None
        Regularisation for the **main** lasso fit. If ``None``, the
        theoretical scale `√(2 log p / n)` is used on standardized
        features. For best inference quality, pass a CV-tuned λ from
        a prior :class:`~skein_glm.ElasticNetPathCV` fit.
    lambda_nodewise : float, array-like (p,), or None, default None
        Per-column λ for the **nodewise** lassos that build `Θ̂`.
        Scalar broadcasts to every column; an array selects per
        column. ``None`` uses `√(2 log p / n)` uniformly (the VBR
        theoretical choice on standardized columns).
    alpha : float, default 0.05
        Two-sided CI / p-value level.
    fit_intercept : bool, default True
        Center `y` and `X`; intercept is recovered post-fit. The
        intercept itself is **not** an inferential target — VBR
        theory applies to the slopes.
    standardize : bool, default True
        Scale columns of `X` to unit variance before fitting. The
        dimensionless `lambda_nodewise` default is calibrated to
        standardized columns; turning this off without supplying
        column-specific `lambda_nodewise` typically gives wrong
        scale on the CIs.
    max_iter, tol : int, float
        Forwarded to every underlying lasso fit.
    n_jobs : int or None
        joblib parallelism over the `p` nodewise lassos. ``-1``
        uses all cores. The main lasso is single-threaded; the
        nodewise loop dominates for `p ≳ 50`.

    Returns
    -------
    DebiasedLassoResult

    Notes
    -----
    Memory is `O(p²)` for `Θ̂` plus `O(n p)` for the standardized
    design — practical up to `p ~ 5000` on a modern laptop. For
    larger `p`, the nodewise loop runtime (`p` lasso fits) is the
    binding constraint rather than memory.

    The asymptotic distribution requires (a) lasso consistency at
    rate `‖β̂ − β‖₁ = o(1/√(log p))`, (b) row-wise sparsity of the
    true `Σ⁻¹`, (c) sub-Gaussian design and noise. Failures show up
    as poor coverage in the empirical CI simulation tests.
    """
    from joblib import Parallel, delayed

    X = np.ascontiguousarray(X, dtype=np.float64)
    y = np.ascontiguousarray(y, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(f"X must be 2D, got shape {X.shape}")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
        raise ValueError(
            f"y must be 1D with length {X.shape[0]}, got shape {y.shape}"
        )
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1); got {alpha}")
    if max_iter < 1:
        raise ValueError(f"max_iter must be ≥ 1; got {max_iter}")
    if tol <= 0:
        raise ValueError(f"tol must be > 0; got {tol}")

    n, p = X.shape
    if p < 2:
        raise ValueError(
            f"debiased lasso requires p ≥ 2 features; got p = {p}"
        )

    # Center / scale.
    if fit_intercept:
        x_mean = X.mean(axis=0)
        y_mean = float(y.mean())
    else:
        x_mean = np.zeros(p)
        y_mean = 0.0
    Xc = X - x_mean
    yc = y - y_mean

    if standardize:
        x_scale = Xc.std(axis=0, ddof=0)
        # Constant column ⇒ keep scale 1 so we don't divide by zero;
        # the lasso will still drop it cleanly via the standard KKT.
        x_scale = np.where(x_scale > 0, x_scale, 1.0)
    else:
        x_scale = np.ones(p)
    Xs = Xc / x_scale

    # Main lasso fit (standardized scale).
    lambda_main = (
        _theoretical_lambda(n, p) if lambda_ is None else float(lambda_)
    )
    if lambda_main <= 0:
        raise ValueError(f"lambda_ must be > 0; got {lambda_main}")
    main_fit = ElasticNetRegressor(
        lambda_=lambda_main,
        alpha=1.0,
        fit_intercept=False,
        standardize=False,
        max_iter=max_iter,
        tol=tol,
    ).fit(Xs, yc)
    beta_hat_s = np.asarray(main_fit.coef_, dtype=np.float64)

    # Nodewise lassos.
    if lambda_nodewise is None:
        lam_nw = np.full(p, _theoretical_lambda(n, p), dtype=np.float64)
    elif np.isscalar(lambda_nodewise):
        lam_nw = np.full(
            p, float(lambda_nodewise),  # type: ignore[arg-type]
            dtype=np.float64,
        )
    else:
        lam_nw = np.ascontiguousarray(lambda_nodewise, dtype=np.float64)
        if lam_nw.shape != (p,):
            raise ValueError(
                f"lambda_nodewise must be scalar or shape ({p},); "
                f"got {lam_nw.shape}"
            )
    if np.any(lam_nw <= 0):
        raise ValueError("lambda_nodewise entries must be > 0")

    results = Parallel(n_jobs=n_jobs)(
        delayed(_fit_nodewise_column)(Xs, j, float(lam_nw[j]), max_iter, tol)
        for j in range(p)
    )
    gammas = [g for g, _ in results]
    tau2s = np.array([t for _, t in results], dtype=np.float64)
    Theta = _assemble_theta_rows(gammas, tau2s)  # (p, p), standardized scale

    # Debiased estimator (standardized scale).
    resid_s = yc - Xs @ beta_hat_s
    beta_d_s = beta_hat_s + (Theta @ (Xs.T @ resid_s)) / n

    # σ̂ from lasso residuals — Reid–Tibshirani–Friedman convention.
    nz = int(np.sum(np.abs(beta_hat_s) > _ACTIVE_EPS))
    sigma2 = float(np.dot(resid_s, resid_s)) / max(n - nz, 1)
    sigma_hat = float(np.sqrt(sigma2))

    # Variance: σ̂² · diag(Θ̂ Σ̂ Θ̂ᵀ) / n.
    # Compute via U = X_s · Θ̂ᵀ ∈ R^{n×p}; then [Θ̂ Σ̂ Θ̂ᵀ]_jj = ‖U_j‖² / n.
    # That avoids materializing the p×p Σ̂.
    U = Xs @ Theta.T
    diag_quad = np.einsum("ij,ij->j", U, U) / n
    var_d_s = sigma2 * diag_quad / n
    se_s = np.sqrt(np.maximum(var_d_s, 0.0))

    # Back to original scale.
    beta_d = beta_d_s / x_scale
    beta_lasso = beta_hat_s / x_scale
    se = se_s / x_scale

    intercept = (
        y_mean - float(np.dot(beta_d, x_mean)) if fit_intercept else 0.0
    )

    # Inference (Wald).
    z_alpha = float(stats.norm.ppf(1.0 - alpha / 2.0))
    safe_se = np.where(se > 0, se, 1.0)
    z_scores = np.where(se > 0, beta_d / safe_se, 0.0)
    pvalues = 2.0 * stats.norm.sf(np.abs(z_scores))
    ci_lower = beta_d - z_alpha * se
    ci_upper = beta_d + z_alpha * se

    return DebiasedLassoResult(
        coef_debiased=beta_d,
        coef_lasso=beta_lasso,
        intercept_=intercept,
        se=se,
        ci_lower=ci_lower,
        ci_upper=ci_upper,
        pvalues=pvalues,
        z_scores=z_scores,
        sigma_hat=sigma_hat,
        Theta=Theta,
        lambda_main=lambda_main,
        lambda_nodewise=lam_nw,
        alpha=alpha,
    )



# --- sklearn-style wrapper -------------------------------------------



[docs]
class DebiasedLassoRegressor(BaseEstimator, RegressorMixin):
    """Sklearn-style facade over :func:`debiased_lasso`.

    Exposes the debiased estimator as ``coef_`` / ``intercept_`` so
    the result composes with sklearn `Pipeline` and metric helpers.
    The VBR-specific attributes (``se_``, ``ci_lower_``,
    ``ci_upper_``, ``pvalues_``, ``z_scores_``, ``Theta_``,
    ``sigma_hat_``) live on the fitted estimator.

    Parameters mirror :func:`debiased_lasso`.

    Examples
    --------
    >>> from skein_glm import DebiasedLassoRegressor
    >>> est = DebiasedLassoRegressor(random_state=0).fit(X, y)
    >>> est.coef_, est.ci_lower_, est.ci_upper_, est.pvalues_
    """

    coef_: NDArray[np.float64]
    coef_lasso_: NDArray[np.float64]
    intercept_: float
    se_: NDArray[np.float64]
    ci_lower_: NDArray[np.float64]
    ci_upper_: NDArray[np.float64]
    pvalues_: NDArray[np.float64]
    z_scores_: NDArray[np.float64]
    sigma_hat_: float
    Theta_: NDArray[np.float64]
    lambda_main_: float
    lambda_nodewise_: NDArray[np.float64]
    n_features_in_: int

    def __init__(
        self,
        *,
        lambda_: float | None = None,
        lambda_nodewise: float | NDArray[np.float64] | None = None,
        alpha: float = 0.05,
        fit_intercept: bool = True,
        standardize: bool = True,
        max_iter: int = 1000,
        tol: float = 1e-7,
        n_jobs: int | None = None,
    ) -> None:
        self.lambda_ = lambda_
        self.lambda_nodewise = lambda_nodewise
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.standardize = standardize
        self.max_iter = max_iter
        self.tol = tol
        self.n_jobs = n_jobs

    def fit(self, X: Any, y: Any) -> "DebiasedLassoRegressor":
        res = debiased_lasso(
            X, y,
            lambda_=self.lambda_,
            lambda_nodewise=self.lambda_nodewise,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            standardize=self.standardize,
            max_iter=self.max_iter,
            tol=self.tol,
            n_jobs=self.n_jobs,
        )
        self.coef_ = res.coef_debiased
        self.coef_lasso_ = res.coef_lasso
        self.intercept_ = res.intercept_
        self.se_ = res.se
        self.ci_lower_ = res.ci_lower
        self.ci_upper_ = res.ci_upper
        self.pvalues_ = res.pvalues
        self.z_scores_ = res.z_scores
        self.sigma_hat_ = res.sigma_hat
        self.Theta_ = res.Theta
        self.lambda_main_ = res.lambda_main
        self.lambda_nodewise_ = res.lambda_nodewise
        self.n_features_in_ = int(res.coef_debiased.shape[0])
        return self

    def predict(self, X: Any) -> NDArray[np.float64]:
        X = np.ascontiguousarray(X, dtype=np.float64)
        return X @ self.coef_ + self.intercept_



# =====================================================================
# M5.x-b — Debiased GLM (logistic + Poisson)
# =====================================================================
#
# Extends the VBR construction from LS to canonical-link GLMs via the
# weighted-LS surrogate. The estimating equation at β̂ is
#
#     score(β̂) = Xᵀ (y − μ̂(β̂))
#
# and the Fisher information at β̂ is J = (1/n) · Xᵀ W X where W is the
# diagonal of working weights — μ̂(1−μ̂) for binomial logit, μ̂ for
# Poisson log. The debiased estimator is
#
#     β̂_d = β̂ + (1/n) · Θ̂ · Xᵀ (y − μ̂)
#
# with asymptotic distribution √n (β̂_d − β) ⇝ N(0, J⁻¹) and Θ̂ ≈ J⁻¹
# constructed nodewise on the **weighted design** X̃ = W^{1/2} X. This
# is the JM-extended-to-GLM construction (Van de Geer 2014 §3).
#
# Reuses :func:`_fit_nodewise_column` and :func:`_assemble_theta_rows`
# from the LS path — the math is identical once you swap `X` for `X̃`.


_GLM_WEIGHT_FLOOR = 1e-8



[docs]
@dataclass
class DebiasedGLMResult:
    """Output of :func:`debiased_logistic_lasso` / :func:`debiased_poisson_lasso`.

    Same fields as :class:`DebiasedLassoResult` minus ``sigma_hat``
    (GLMs have no residual scale — the noise is encoded in the working
    weights `W`) and plus ``mu_fitted`` and ``family``.

    Attributes
    ----------
    coef_debiased : ndarray (p,)
    coef_glm : ndarray (p,)
        Underlying penalized GLM fit on the original feature scale.
    intercept_ : float
        Intercept of the underlying fit; not an inferential target (VBR
        theory applies to slopes only).
    se : ndarray (p,)
        `sqrt(diag(Θ̂ X̃ᵀX̃ Θ̂ᵀ)) / n`.
    ci_lower, ci_upper, pvalues, z_scores : ndarray (p,)
    mu_fitted : ndarray (n,)
        `μ̂` at the penalized fit. Useful for diagnostics.
    Theta : ndarray (p, p)
        Approximate inverse Fisher on the standardized scale.
    lambda_main : float
    lambda_nodewise : ndarray (p,)
    alpha : float
    family : str
        ``"binomial"`` or ``"poisson"``.
    """

    coef_debiased: NDArray[np.float64]
    coef_glm: NDArray[np.float64]
    intercept_: float
    se: NDArray[np.float64]
    ci_lower: NDArray[np.float64]
    ci_upper: NDArray[np.float64]
    pvalues: NDArray[np.float64]
    z_scores: NDArray[np.float64]
    mu_fitted: NDArray[np.float64]
    Theta: NDArray[np.float64]
    lambda_main: float
    lambda_nodewise: NDArray[np.float64]
    alpha: float
    family: str



def _debiased_glm_core(
    Xs: NDArray[np.float64],
    y: NDArray[np.float64],
    beta_hat_s: NDArray[np.float64],
    intercept_s: float,
    mu_hat: NDArray[np.float64],
    w_diag: NDArray[np.float64],
    x_mean: NDArray[np.float64],
    x_scale: NDArray[np.float64],
    lam_nw: NDArray[np.float64],
    alpha: float,
    max_iter: int,
    tol: float,
    n_jobs: int | None,
    family: str,
    lambda_main: float,
) -> DebiasedGLMResult:
    """Shared GLM debiasing core — nodewise + variance + inference.

    Inputs are on the **standardized** feature scale (mean-centered,
    scaled to unit variance). Output is destandardized to the original
    scale.
    """
    from joblib import Parallel, delayed

    n, p = Xs.shape

    # Floor working weights to avoid degenerate columns in X̃ near
    # saturation (logistic prob ≈ 0/1) or very small Poisson means.
    # The asymptotic theory requires the working weights to be bounded
    # away from zero; in practice tiny floors here don't move the
    # answer materially when the data are well-conditioned but prevent
    # blow-ups when they aren't.
    w_safe = np.maximum(w_diag, _GLM_WEIGHT_FLOOR)
    w_sqrt = np.sqrt(w_safe)
    X_tilde = Xs * w_sqrt[:, None]

    # Nodewise lassos on the weighted design.
    results = Parallel(n_jobs=n_jobs)(
        delayed(_fit_nodewise_column)(X_tilde, j, float(lam_nw[j]), max_iter, tol)
        for j in range(p)
    )
    gammas = [g for g, _ in results]
    tau2s = np.array([t for _, t in results], dtype=np.float64)
    Theta = _assemble_theta_rows(gammas, tau2s)

    # Debias: β̂_d = β̂ + (1/n) Θ̂ Xᵀ (y − μ̂). Note this is the
    # *unweighted* score (the canonical-link gradient), not the
    # weighted X̃ᵀ (y − μ̂).
    score = Xs.T @ (y - mu_hat)
    beta_d_s = beta_hat_s + (Theta @ score) / n

    # Variance: diag(Θ̂ X̃ᵀX̃ Θ̂ᵀ) / n. Compute via U = X̃ Θ̂ᵀ to skip
    # the (p, p) Gram materialization.
    U = X_tilde @ Theta.T
    var_d_s = np.einsum("ij,ij->j", U, U) / (n * n)
    se_s = np.sqrt(np.maximum(var_d_s, 0.0))

    # Destandardize to original feature scale.
    beta_d = beta_d_s / x_scale
    beta_glm = beta_hat_s / x_scale
    se = se_s / x_scale
    intercept = intercept_s - float(np.dot(beta_d, x_mean))

    # Wald inference.
    z_alpha = float(stats.norm.ppf(1.0 - alpha / 2.0))
    safe_se = np.where(se > 0, se, 1.0)
    z_scores = np.where(se > 0, beta_d / safe_se, 0.0)
    pvalues = 2.0 * stats.norm.sf(np.abs(z_scores))
    ci_lower = beta_d - z_alpha * se
    ci_upper = beta_d + z_alpha * se

    return DebiasedGLMResult(
        coef_debiased=beta_d,
        coef_glm=beta_glm,
        intercept_=intercept,
        se=se,
        ci_lower=ci_lower,
        ci_upper=ci_upper,
        pvalues=pvalues,
        z_scores=z_scores,
        mu_fitted=mu_hat,
        Theta=Theta,
        lambda_main=lambda_main,
        lambda_nodewise=lam_nw,
        alpha=alpha,
        family=family,
    )


def _standardize_X(
    X: NDArray[np.float64],
    standardize: bool,
) -> tuple[NDArray[np.float64], NDArray[np.float64], NDArray[np.float64]]:
    """Mean-center and (optionally) unit-scale columns.

    Returns ``(Xs, x_mean, x_scale)``. When ``standardize=False`` we
    still center; ``x_scale`` is all ones. Constant columns get
    ``scale=1`` so we don't divide by zero — the penalized fit will
    drop them via the usual KKT.
    """
    x_mean = X.mean(axis=0)
    Xc = X - x_mean
    if standardize:
        x_scale = Xc.std(axis=0, ddof=0)
        x_scale = np.where(x_scale > 0, x_scale, 1.0)
    else:
        x_scale = np.ones(X.shape[1], dtype=np.float64)
    return Xc / x_scale, x_mean, x_scale


def _build_lambda_nodewise(
    lambda_nodewise: float | NDArray[np.float64] | None,
    n: int,
    p: int,
) -> NDArray[np.float64]:
    if lambda_nodewise is None:
        return np.full(p, _theoretical_lambda(n, p), dtype=np.float64)
    if np.isscalar(lambda_nodewise):
        return np.full(
            p, float(lambda_nodewise),  # type: ignore[arg-type]
            dtype=np.float64,
        )
    arr = np.ascontiguousarray(lambda_nodewise, dtype=np.float64)
    if arr.shape != (p,):
        raise ValueError(
            f"lambda_nodewise must be scalar or shape ({p},); got {arr.shape}"
        )
    return arr


def _sigmoid(z: NDArray[np.float64]) -> NDArray[np.float64]:
    """Numerically stable sigmoid."""
    out = np.empty_like(z)
    pos = z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    e = np.exp(z[~pos])
    out[~pos] = e / (1.0 + e)
    return out


# --- logistic --------------------------------------------------------



[docs]
def debiased_logistic_lasso(
    X: Any,
    y: Any,
    *,
    lambda_: float | None = None,
    lambda_nodewise: float | NDArray[np.float64] | None = None,
    alpha: float = 0.05,
    fit_intercept: bool = True,
    standardize: bool = True,
    max_iter: int = 1000,
    tol: float = 1e-7,
    n_jobs: int | None = None,
) -> DebiasedGLMResult:
    """VBR debiased lasso for binomial logistic regression.

    Construction (Van de Geer 2014 §3): fit penalized logistic lasso
    to obtain ``β̂``, compute ``μ̂_i = σ(η̂_i)`` and working weights
    ``W_ii = μ̂_i (1 − μ̂_i)``, then build the approximate inverse
    Fisher ``Θ̂`` by nodewise lassos on ``X̃ = W^{1/2} X``. The
    debiased estimator is
    ``β̂_d = β̂ + (1/n) · Θ̂ · Xᵀ (y − μ̂)``
    with asymptotic variance ``diag(Θ̂ X̃ᵀX̃ Θ̂ᵀ) / n`` (no `σ²` —
    the noise is encoded in `W`).

    Parameters mirror :func:`debiased_lasso`. The penalized fit is
    obtained from :class:`~skein_glm.LogisticLassoRegressor`, the new
    convex-L1 primitive — debiasing on top of the prior
    ``MCP(γ=1e9)`` approximation would have inherited its bias.

    Notes
    -----
    Working weights are floored at ``1e-8`` to avoid degenerate
    columns in `X̃` when fitted probabilities saturate near 0 or 1.
    If you see suspiciously narrow CIs, inspect ``mu_fitted`` for
    boundary cases.
    """
    X = np.ascontiguousarray(X, dtype=np.float64)
    y = np.ascontiguousarray(y, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(f"X must be 2D, got shape {X.shape}")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
        raise ValueError(
            f"y must be 1D with length {X.shape[0]}, got shape {y.shape}"
        )
    if not np.all((y == 0.0) | (y == 1.0)):
        raise ValueError("logistic regression requires y ∈ {0, 1}")
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1); got {alpha}")
    n, p = X.shape
    if p < 2:
        raise ValueError(f"debiased lasso requires p ≥ 2 features; got p = {p}")

    Xs, x_mean, x_scale = _standardize_X(X, standardize)
    # The logistic primitive fits an intercept itself; we pass
    # standardized X. (`fit_intercept` here means: do we let the GLM
    # fit one at all.)
    lambda_main = (
        _theoretical_lambda(n, p) if lambda_ is None else float(lambda_)
    )
    if lambda_main <= 0:
        raise ValueError(f"lambda_ must be > 0; got {lambda_main}")
    fit = LogisticLassoRegressor(
        lambda_=lambda_main,
        fit_intercept=fit_intercept,
        standardize=False,
        max_iter=max_iter,
        tol=tol,
        max_outer=20,
        outer_tol=tol,
    ).fit(Xs, y)
    beta_hat_s = np.asarray(fit.coef_, dtype=np.float64)
    intercept_s = float(fit.intercept_)

    eta = Xs @ beta_hat_s + intercept_s
    mu_hat = _sigmoid(eta)
    w_diag = mu_hat * (1.0 - mu_hat)

    lam_nw = _build_lambda_nodewise(lambda_nodewise, n, p)
    if np.any(lam_nw <= 0):
        raise ValueError("lambda_nodewise entries must be > 0")

    return _debiased_glm_core(
        Xs=Xs, y=y, beta_hat_s=beta_hat_s, intercept_s=intercept_s,
        mu_hat=mu_hat, w_diag=w_diag,
        x_mean=x_mean, x_scale=x_scale,
        lam_nw=lam_nw, alpha=alpha,
        max_iter=max_iter, tol=tol, n_jobs=n_jobs,
        family="binomial", lambda_main=lambda_main,
    )



# --- Poisson --------------------------------------------------------



[docs]
def debiased_poisson_lasso(
    X: Any,
    y: Any,
    *,
    lambda_: float | None = None,
    lambda_nodewise: float | NDArray[np.float64] | None = None,
    alpha: float = 0.05,
    fit_intercept: bool = True,
    standardize: bool = True,
    offset: NDArray[np.float64] | None = None,
    max_iter: int = 1000,
    tol: float = 1e-7,
    n_jobs: int | None = None,
) -> DebiasedGLMResult:
    """VBR debiased lasso for Poisson log-link regression.

    Construction: fit penalized Poisson lasso to obtain ``β̂``,
    compute ``μ̂_i = exp(η̂_i + offset_i)`` and working weights
    ``W_ii = μ̂_i``, build ``Θ̂`` nodewise on ``X̃ = W^{1/2} X``,
    debias as in the logistic case.

    Parameters mirror :func:`debiased_logistic_lasso` plus a Poisson
    ``offset`` (log-exposure) that participates in the linear
    predictor but is not an inferential target.

    Notes
    -----
    Poisson working weights can have very large dynamic range
    (`μ̂ = exp(η̂)`). If some `μ̂_i` are tiny, the corresponding rows
    of `X̃` are heavily down-weighted and the asymptotic theory's
    regularity conditions may be strained. The same `1e-8` floor as
    the logistic case applies — well-conditioned problems are
    unaffected.
    """
    X = np.ascontiguousarray(X, dtype=np.float64)
    y = np.ascontiguousarray(y, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(f"X must be 2D, got shape {X.shape}")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
        raise ValueError(
            f"y must be 1D with length {X.shape[0]}, got shape {y.shape}"
        )
    if not np.all(np.isfinite(y)) or np.any(y < 0.0):
        raise ValueError("Poisson regression requires y ≥ 0 (finite)")
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1); got {alpha}")
    n, p = X.shape
    if p < 2:
        raise ValueError(f"debiased lasso requires p ≥ 2 features; got p = {p}")
    offset_arr: NDArray[np.float64] | None = None
    if offset is not None:
        offset_arr = np.ascontiguousarray(offset, dtype=np.float64)
        if offset_arr.shape != (n,):
            raise ValueError(
                f"offset must be 1D with length {n}; got shape {offset_arr.shape}"
            )

    Xs, x_mean, x_scale = _standardize_X(X, standardize)
    lambda_main = (
        _theoretical_lambda(n, p) if lambda_ is None else float(lambda_)
    )
    if lambda_main <= 0:
        raise ValueError(f"lambda_ must be > 0; got {lambda_main}")

    fit_kwargs: dict[str, Any] = dict(
        lambda_=lambda_main,
        fit_intercept=fit_intercept,
        standardize=False,
        max_iter=max_iter,
        tol=tol,
        max_outer=20,
        outer_tol=tol,
    )
    if offset_arr is not None:
        fit_kwargs["offset"] = offset_arr
    fit = PoissonLassoRegressor(**fit_kwargs).fit(Xs, y)
    beta_hat_s = np.asarray(fit.coef_, dtype=np.float64)
    intercept_s = float(fit.intercept_)

    eta = Xs @ beta_hat_s + intercept_s
    if offset_arr is not None:
        eta = eta + offset_arr
    mu_hat = np.exp(eta)
    w_diag = mu_hat

    lam_nw = _build_lambda_nodewise(lambda_nodewise, n, p)
    if np.any(lam_nw <= 0):
        raise ValueError("lambda_nodewise entries must be > 0")

    return _debiased_glm_core(
        Xs=Xs, y=y, beta_hat_s=beta_hat_s, intercept_s=intercept_s,
        mu_hat=mu_hat, w_diag=w_diag,
        x_mean=x_mean, x_scale=x_scale,
        lam_nw=lam_nw, alpha=alpha,
        max_iter=max_iter, tol=tol, n_jobs=n_jobs,
        family="poisson", lambda_main=lambda_main,
    )



# --- sklearn-style wrappers -----------------------------------------


class _DebiasedGLMRegressorBase(BaseEstimator):
    """Shared facade: exposes ``coef_`` / ``intercept_`` and the
    inference outputs as suffixed attributes. Subclasses pin the
    family-specific free function."""

    coef_: NDArray[np.float64]
    coef_glm_: NDArray[np.float64]
    intercept_: float
    se_: NDArray[np.float64]
    ci_lower_: NDArray[np.float64]
    ci_upper_: NDArray[np.float64]
    pvalues_: NDArray[np.float64]
    z_scores_: NDArray[np.float64]
    Theta_: NDArray[np.float64]
    mu_fitted_: NDArray[np.float64]
    lambda_main_: float
    lambda_nodewise_: NDArray[np.float64]
    family_: str
    n_features_in_: int

    def _store_result(self, res: DebiasedGLMResult) -> None:
        self.coef_ = res.coef_debiased
        self.coef_glm_ = res.coef_glm
        self.intercept_ = res.intercept_
        self.se_ = res.se
        self.ci_lower_ = res.ci_lower
        self.ci_upper_ = res.ci_upper
        self.pvalues_ = res.pvalues
        self.z_scores_ = res.z_scores
        self.Theta_ = res.Theta
        self.mu_fitted_ = res.mu_fitted
        self.lambda_main_ = res.lambda_main
        self.lambda_nodewise_ = res.lambda_nodewise
        self.family_ = res.family
        self.n_features_in_ = int(res.coef_debiased.shape[0])

    def decision_function(self, X: Any) -> NDArray[np.float64]:
        X = np.ascontiguousarray(X, dtype=np.float64)
        return X @ self.coef_ + self.intercept_



[docs]
class DebiasedLogisticLassoRegressor(_DebiasedGLMRegressorBase):
    """Sklearn-style facade over :func:`debiased_logistic_lasso`."""

    def __init__(
        self,
        *,
        lambda_: float | None = None,
        lambda_nodewise: float | NDArray[np.float64] | None = None,
        alpha: float = 0.05,
        fit_intercept: bool = True,
        standardize: bool = True,
        max_iter: int = 1000,
        tol: float = 1e-7,
        n_jobs: int | None = None,
    ) -> None:
        self.lambda_ = lambda_
        self.lambda_nodewise = lambda_nodewise
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.standardize = standardize
        self.max_iter = max_iter
        self.tol = tol
        self.n_jobs = n_jobs

    def fit(self, X: Any, y: Any) -> "DebiasedLogisticLassoRegressor":
        res = debiased_logistic_lasso(
            X, y,
            lambda_=self.lambda_,
            lambda_nodewise=self.lambda_nodewise,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            standardize=self.standardize,
            max_iter=self.max_iter,
            tol=self.tol,
            n_jobs=self.n_jobs,
        )
        self._store_result(res)
        return self

    def predict_proba(self, X: Any) -> NDArray[np.float64]:
        return _sigmoid(self.decision_function(X))

    def predict(self, X: Any) -> NDArray[np.float64]:
        return (self.predict_proba(X) >= 0.5).astype(np.float64)




[docs]
class DebiasedPoissonLassoRegressor(_DebiasedGLMRegressorBase):
    """Sklearn-style facade over :func:`debiased_poisson_lasso`."""

    def __init__(
        self,
        *,
        lambda_: float | None = None,
        lambda_nodewise: float | NDArray[np.float64] | None = None,
        alpha: float = 0.05,
        fit_intercept: bool = True,
        standardize: bool = True,
        offset: NDArray[np.float64] | None = None,
        max_iter: int = 1000,
        tol: float = 1e-7,
        n_jobs: int | None = None,
    ) -> None:
        self.lambda_ = lambda_
        self.lambda_nodewise = lambda_nodewise
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.standardize = standardize
        self.offset = offset
        self.max_iter = max_iter
        self.tol = tol
        self.n_jobs = n_jobs

    def fit(self, X: Any, y: Any) -> "DebiasedPoissonLassoRegressor":
        res = debiased_poisson_lasso(
            X, y,
            lambda_=self.lambda_,
            lambda_nodewise=self.lambda_nodewise,
            alpha=self.alpha,
            fit_intercept=self.fit_intercept,
            standardize=self.standardize,
            offset=self.offset,
            max_iter=self.max_iter,
            tol=self.tol,
            n_jobs=self.n_jobs,
        )
        self._store_result(res)
        return self


[docs]
    def predict(self, X: Any) -> NDArray[np.float64]:
        """μ̂ = exp(η̂) — matches PoissonLassoRegressor's convention."""
        eta = self.decision_function(X)
        if self.offset is not None:
            eta = eta + np.ascontiguousarray(self.offset, dtype=np.float64)
        return np.exp(eta)




# --- Cox PH ---------------------------------------------------------


@dataclass
class DebiasedCoxResult:
    """Output of :func:`debiased_cox_lasso`.

    Same inferential fields as :class:`DebiasedGLMResult` minus
    ``mu_fitted`` (Cox PH has no ``μ`` in the canonical GLM sense)
    plus ``risk_score`` ``= X β̂`` — the prognostic index on the
    original feature scale.

    Cox has no intercept (the baseline hazard absorbs constants);
    the ``intercept_`` field is the destandardization correction
    ``-⟨β̂_d, x̄⟩`` so that ``X @ coef_ + intercept_`` reproduces the
    linear predictor of the original-feature-scale model.

    Attributes
    ----------
    coef_debiased : ndarray (p,)
    coef_glm : ndarray (p,)
        Underlying penalized Cox lasso fit on the original feature
        scale.
    intercept_ : float
        Destandardization correction. Cox `decision_function` returns
        ``X @ coef_ + intercept_`` — the prognostic index.
    se : ndarray (p,)
    ci_lower, ci_upper, pvalues, z_scores : ndarray (p,)
    risk_score : ndarray (n,)
        ``η̂_i = ⟨x_i, β̂⟩`` at the penalized fit. The Cox prognostic
        index for the training rows.
    Theta : ndarray (p, p)
        Approximate inverse Fisher on the standardized scale, built
        nodewise on the weighted design ``X̃ = W^{1/2} X``.
    lambda_main : float
    lambda_nodewise : ndarray (p,)
    alpha : float
    family : str
        Always ``"cox"``.
    ties : str
        Tie-handling rule used in the partial likelihood.
    """

    coef_debiased: NDArray[np.float64]
    coef_glm: NDArray[np.float64]
    intercept_: float
    se: NDArray[np.float64]
    ci_lower: NDArray[np.float64]
    ci_upper: NDArray[np.float64]
    pvalues: NDArray[np.float64]
    z_scores: NDArray[np.float64]
    risk_score: NDArray[np.float64]
    Theta: NDArray[np.float64]
    lambda_main: float
    lambda_nodewise: NDArray[np.float64]
    alpha: float
    family: str
    ties: str


def debiased_cox_lasso(
    X: Any,
    time: Any,
    event: Any,
    *,
    lambda_: float | None = None,
    lambda_nodewise: float | NDArray[np.float64] | None = None,
    alpha: float = 0.05,
    ties: str = "breslow",
    standardize: bool = True,
    max_iter: int = 1000,
    tol: float = 1e-7,
    n_jobs: int | None = None,
) -> DebiasedCoxResult:
    """Debiased Cox lasso with Wald confidence intervals.

    Construction (van de Geer–Cai–Wang extension of VBR to Cox):

    1. Fit a penalized Cox lasso to obtain ``β̂``.
    2. At ``β̂``, extract the per-sample partial-likelihood Fisher
       information ``w_i`` and working response ``z_i`` from the Cox
       IRLS surrogate (via the Rust ``CoxPH::surrogate_at`` exposed as
       ``_core.cox_surrogate_weights_at``).
    3. Build the **weighted design** ``X̃ = W^{1/2} X`` where
       ``W = diag(w)``. The partial-likelihood Fisher information is
       ``Xᵀ W X``, so ``X̃`` plays exactly the role of ``X`` in the
       LS / GLM debiased construction.
    4. Build ``Θ̂`` by node-wise lasso on ``X̃`` (Van de Geer 2014 §3
       construction; ``_fit_nodewise_column`` is penalty-axis agnostic
       and reused unchanged).
    5. Debias: ``β̂_d = β̂ + (1/n) Θ̂ · Xᵀ (event − μ̂_cox)`` where
       ``μ̂_cox_i = exp(η̂_i) · Λ̂_0(t_i)`` is the partial-likelihood
       contribution. Computed via ``μ̂_cox = event + w·(η − z)`` from
       the surrogate identity ``z = η − g / w``, ``g = event −
       exp(η)·Λ̂_0``.
    6. Variance: ``diag(Θ̂ X̃ᵀX̃ Θ̂ᵀ) / n²``. Same formula as the
       logistic / Poisson case; the partial likelihood supplies its
       own Fisher information, so there is no σ² nuisance term.

    Parameters
    ----------
    X : array-like of shape (n, p)
    time : array-like of shape (n,)
        Event / censoring time; must be ≥ 0.
    event : array-like of shape (n,)
        0/1 censoring indicator (1 = event observed).
    lambda_ : float, optional
        Lasso λ for the penalized Cox fit. Defaults to the
        theoretical ``c · √(log p / n)`` rate.
    lambda_nodewise : float or array, optional
        Per-node λ for the nodewise lassos. Defaults to a tuned grid.
    alpha : float, default 0.05
        Wald CI level — outputs ``[α/2, 1 − α/2]`` percentile bands.
    ties : {"breslow", "efron"}, default "breslow"
        Tie-handling rule for the partial likelihood. Forwarded to
        both the penalized fit and the Fisher-information extraction.
    standardize : bool, default True
    max_iter, tol : solver controls (forwarded to nodewise lassos
        and the penalized Cox fit).
    n_jobs : parallel jobs for the nodewise sweep.

    Returns
    -------
    :class:`DebiasedCoxResult`

    References
    ----------
    van de Geer, S., Bühlmann, P., Ritov, Y., & Dezeure, R. (2014).
    "On asymptotically optimal confidence regions and tests for
    high-dimensional models." *Annals of Statistics* 42(3): 1166–1202.
    — The original LS / GLM debiased construction.

    Cai, T., & Wang, J. (2017). "High-dimensional inference for
    covariance and precision matrices and applications to survival
    analysis." *J. Am. Stat. Assoc.* — Cox-PH extension.
    """
    from joblib import Parallel, delayed
    from skein_glm._core import cox_surrogate_weights_at

    X = np.ascontiguousarray(X, dtype=np.float64)
    time = np.ascontiguousarray(time, dtype=np.float64)
    event = np.ascontiguousarray(event, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(f"X must be 2D, got shape {X.shape}")
    n, p = X.shape
    if time.shape != (n,) or event.shape != (n,):
        raise ValueError(
            f"time / event must be 1D with length {n}; got "
            f"{time.shape} / {event.shape}"
        )
    if not np.all(np.isfinite(time)) or np.any(time < 0):
        raise ValueError("Cox PH requires time ≥ 0 (finite)")
    valid_events = np.isin(event, [0.0, 1.0])
    if not np.all(valid_events):
        raise ValueError("Cox PH requires event ∈ {0, 1}")
    if int(event.sum()) == 0:
        raise ValueError("Cox PH requires at least one event (event = 1)")
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1); got {alpha}")
    if ties not in ("breslow", "efron"):
        raise ValueError(f"ties must be 'breslow' or 'efron'; got {ties!r}")
    if p < 2:
        raise ValueError(f"debiased lasso requires p ≥ 2 features; got p = {p}")

    Xs, x_mean, x_scale = _standardize_X(X, standardize)
    lambda_main = (
        _theoretical_lambda(n, p) if lambda_ is None else float(lambda_)
    )
    if lambda_main <= 0:
        raise ValueError(f"lambda_ must be > 0; got {lambda_main}")

    # Cox-lasso via MCP at large γ (the standard skein idiom — group-MCP
    # with γ→∞ recovers L1 exactly, and the Rust path supports it
    # natively). Cox has no intercept; the baseline hazard absorbs
    # additive constants.
    fit = CoxMCPRegressor(
        lambda_=lambda_main,
        gamma=1e10,
        ties=ties,
        standardize=False,
        max_iter=max_iter,
        tol=tol,
        max_outer=20,
        outer_tol=tol,
    ).fit(Xs, time, event)
    beta_hat_s = np.asarray(fit.coef_, dtype=np.float64)
    eta_hat = Xs @ beta_hat_s

    # Extract per-sample Cox Fisher information w_i and working
    # response z_i at the fitted β̂. The Rust surrogate returns
    # (w, z) such that z = η − g/w, g = event − exp(η)·Λ̂_0(t).
    w, z = cox_surrogate_weights_at(Xs, time, event, beta_hat_s, ties=ties)
    w = np.asarray(w, dtype=np.float64)
    z = np.asarray(z, dtype=np.float64)
    # μ̂_cox = exp(η)·Λ̂_0(t) = event − g = event + w·(η − z).
    # The GLM core consumes (y, mu_hat) and forms (y − mu_hat) =
    # event − μ̂_cox = -g, the Cox score residual.
    mu_hat_cox = event + w * (eta_hat - z)

    lam_nw = _build_lambda_nodewise(lambda_nodewise, n, p)
    if np.any(lam_nw <= 0):
        raise ValueError("lambda_nodewise entries must be > 0")

    # Inline the GLM-core debiasing math; intercept_s=0 because Cox
    # has none. Mirrors _debiased_glm_core but produces DebiasedCoxResult.
    w_safe = np.maximum(w, _GLM_WEIGHT_FLOOR)
    w_sqrt = np.sqrt(w_safe)
    X_tilde = Xs * w_sqrt[:, None]

    results = Parallel(n_jobs=n_jobs)(
        delayed(_fit_nodewise_column)(X_tilde, j, float(lam_nw[j]), max_iter, tol)
        for j in range(p)
    )
    gammas = [g for g, _ in results]
    tau2s = np.array([t for _, t in results], dtype=np.float64)
    Theta = _assemble_theta_rows(gammas, tau2s)

    score = Xs.T @ (event - mu_hat_cox)
    beta_d_s = beta_hat_s + (Theta @ score) / n

    U = X_tilde @ Theta.T
    var_d_s = np.einsum("ij,ij->j", U, U) / (n * n)
    se_s = np.sqrt(np.maximum(var_d_s, 0.0))

    beta_d = beta_d_s / x_scale
    beta_glm = beta_hat_s / x_scale
    se = se_s / x_scale
    intercept = -float(np.dot(beta_d, x_mean))
    # Risk score is the η̂ on the *original* feature scale —
    # X @ beta_d + intercept reconstructs the centered linear
    # predictor; the Cox partial likelihood is invariant to the
    # additive intercept so this matches the conventional usage.
    risk_score = X @ beta_d + intercept

    z_alpha = float(stats.norm.ppf(1.0 - alpha / 2.0))
    safe_se = np.where(se > 0, se, 1.0)
    z_scores = np.where(se > 0, beta_d / safe_se, 0.0)
    pvalues = 2.0 * stats.norm.sf(np.abs(z_scores))
    ci_lower = beta_d - z_alpha * se
    ci_upper = beta_d + z_alpha * se

    return DebiasedCoxResult(
        coef_debiased=beta_d,
        coef_glm=beta_glm,
        intercept_=intercept,
        se=se,
        ci_lower=ci_lower,
        ci_upper=ci_upper,
        pvalues=pvalues,
        z_scores=z_scores,
        risk_score=risk_score,
        Theta=Theta,
        lambda_main=lambda_main,
        lambda_nodewise=lam_nw,
        alpha=alpha,
        family="cox",
        ties=ties,
    )


class DebiasedCoxLassoRegressor(BaseEstimator):
    """Sklearn-style facade over :func:`debiased_cox_lasso`.

    Cox PH has a different signature from the scalar-``y`` GLMs
    (requires ``time`` and ``event`` instead of ``y``), so this
    estimator doesn't inherit from :class:`_DebiasedGLMRegressorBase`.

    Attributes mirror the other debiased estimators with the GLM
    suffix convention (``coef_``, ``se_``, ``pvalues_``, etc.). The
    Cox-specific :attr:`risk_score_` is set to the prognostic index
    on the training data after :meth:`fit`.
    """

    coef_: NDArray[np.float64]
    coef_glm_: NDArray[np.float64]
    intercept_: float
    se_: NDArray[np.float64]
    ci_lower_: NDArray[np.float64]
    ci_upper_: NDArray[np.float64]
    pvalues_: NDArray[np.float64]
    z_scores_: NDArray[np.float64]
    risk_score_: NDArray[np.float64]
    Theta_: NDArray[np.float64]
    lambda_main_: float
    lambda_nodewise_: NDArray[np.float64]
    n_features_in_: int

    def __init__(
        self,
        *,
        lambda_: float | None = None,
        lambda_nodewise: float | NDArray[np.float64] | None = None,
        alpha: float = 0.05,
        ties: str = "breslow",
        standardize: bool = True,
        max_iter: int = 1000,
        tol: float = 1e-7,
        n_jobs: int | None = None,
    ) -> None:
        self.lambda_ = lambda_
        self.lambda_nodewise = lambda_nodewise
        self.alpha = alpha
        self.ties = ties
        self.standardize = standardize
        self.max_iter = max_iter
        self.tol = tol
        self.n_jobs = n_jobs

    def fit(
        self, X: Any, time: Any, event: Any
    ) -> "DebiasedCoxLassoRegressor":
        res = debiased_cox_lasso(
            X, time, event,
            lambda_=self.lambda_,
            lambda_nodewise=self.lambda_nodewise,
            alpha=self.alpha,
            ties=self.ties,
            standardize=self.standardize,
            max_iter=self.max_iter,
            tol=self.tol,
            n_jobs=self.n_jobs,
        )
        self.coef_ = res.coef_debiased
        self.coef_glm_ = res.coef_glm
        self.intercept_ = res.intercept_
        self.se_ = res.se
        self.ci_lower_ = res.ci_lower
        self.ci_upper_ = res.ci_upper
        self.pvalues_ = res.pvalues
        self.z_scores_ = res.z_scores
        self.risk_score_ = res.risk_score
        self.Theta_ = res.Theta
        self.lambda_main_ = res.lambda_main
        self.lambda_nodewise_ = res.lambda_nodewise
        self.family_ = res.family
        self.ties_ = res.ties
        self.n_features_in_ = int(res.coef_debiased.shape[0])
        return self

    def decision_function(self, X: Any) -> NDArray[np.float64]:
        """Prognostic index ``η = X · β + intercept``."""
        X = np.ascontiguousarray(X, dtype=np.float64)
        return X @ self.coef_ + self.intercept_

    def predict(self, X: Any) -> NDArray[np.float64]:
        """Alias for :meth:`decision_function` (Cox has no μ)."""
        return self.decision_function(X)


__all__ = [
    "DebiasedLassoResult",
    "debiased_lasso",
    "DebiasedLassoRegressor",
    "DebiasedGLMResult",
    "debiased_logistic_lasso",
    "debiased_poisson_lasso",
    "DebiasedLogisticLassoRegressor",
    "DebiasedPoissonLassoRegressor",
    "DebiasedCoxResult",
    "debiased_cox_lasso",
    "DebiasedCoxLassoRegressor",
]