Source code for pylmrob.results

# SPDX-License-Identifier: GPL-3.0-or-later
"""Result object returned by ``lmrob`` fits."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

import numpy as np

if TYPE_CHECKING:
    # NOTE: do *not* import pylmrob.bootstrap here. bootstrap.py imports
    # LmRobResults from this module (also under TYPE_CHECKING), and
    # CodeQL's py/unsafe-cyclic-import flags the bidirectional pair
    # even though both are TYPE_CHECKING-only. The bootstrap() method
    # below uses a forward-reference string annotation instead.
    from pylmrob.control import Control
    from pylmrob.summary import SummaryLmRob



[docs]
@dataclass
class LmRobResults:
    """Output of an ``lmrob`` fit.

    Attributes mirror R's ``lmrob`` object where practical.
    """

    coef_: np.ndarray
    scale_: float
    weights_: np.ndarray  # final IRWLS weights = wgt(r/sigma)
    rweights_: np.ndarray  # alias for R's $rweights (same as weights_ here)
    residuals_: np.ndarray
    fitted_: np.ndarray
    cov_: np.ndarray
    df_residual_: int
    converged_: bool
    n_iter_: int
    nobs_: int
    term_names_: list[str]
    control: Control
    init_: dict[str, object] = field(default_factory=dict)
    # formulaic ModelSpec for the RHS, used by ``predict`` to re-apply the
    # design transformation (factor encoding, I(x**2), etc.) to new data.
    rhs_spec_: object | None = None
    # Design matrix and response stashed for downstream operations
    # (e.g. anova(test="Deviance") refits reduced models on the full scale).
    design_x_: np.ndarray | None = None
    design_y_: np.ndarray | None = None

    # Backwards-compatible alias to mirror R's $coefficients.
    @property
    def coefficients_(self) -> np.ndarray:
        return self.coef_

    @property
    def standard_errors_(self) -> np.ndarray:
        return np.sqrt(np.diag(self.cov_))


[docs]
    def confint(
        self,
        level: float = 0.95,
        method: str = "wald",
        *,
        n_boot: int = 1000,
        seed: int | None = None,
        n_workers: int = 1,
        kind: str = "percentile",
    ) -> np.ndarray:
        """Confidence intervals for the regression coefficients.

        Two methods:

        - ``"wald"`` (default): asymptotic normal CIs from the sandwich
          covariance. ``z * se`` where ``z`` is the standard-normal
          quantile at ``(1 + level) / 2``.
        - ``"bootstrap"``: case-resampling bootstrap; runs
          :meth:`bootstrap` internally and returns the requested ``kind``
          (``"percentile"`` or ``"basic"``) CIs.

        Parameters
        ----------
        level
            Coverage level, e.g. ``0.95``.
        method
            ``"wald"`` (default) or ``"bootstrap"``.
        n_boot, seed, n_workers
            Forwarded to :meth:`bootstrap` when ``method="bootstrap"``.
        kind
            Bootstrap CI kind: ``"percentile"`` or ``"basic"``. Ignored
            for ``method="wald"``.
        """
        if method == "wald":
            from scipy.stats import norm

            z = norm.ppf((1 + level) / 2)
            se = self.standard_errors_
            lo = self.coef_ - z * se
            hi = self.coef_ + z * se
            return np.column_stack([lo, hi])
        if method == "bootstrap":
            boot = self.bootstrap(n_boot=n_boot, level=level, seed=seed, n_workers=n_workers)
            ci = boot.percentile_ci if kind == "percentile" else boot.basic_ci
            return np.asarray(ci, dtype=np.float64)
        raise ValueError(f"method must be 'wald' or 'bootstrap', got {method!r}")


    # ------------------------------------------------------------------
    # statsmodels-style attribute aliases. Let pylmrob fits drop into
    # statsmodels.regression-shaped code without adapters.
    # ------------------------------------------------------------------

    @property
    def params(self) -> np.ndarray:
        return self.coef_

    @property
    def bse(self) -> np.ndarray:
        return self.standard_errors_

    @property
    def tvalues(self) -> np.ndarray:
        se = self.standard_errors_
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.where(se > 0, self.coef_ / np.where(se > 0, se, 1.0), np.nan)

    @property
    def pvalues(self) -> np.ndarray:
        from scipy.stats import t as t_dist

        return 2.0 * t_dist.sf(np.abs(self.tvalues), df=self.df_residual_)


[docs]
    def conf_int(self, alpha: float = 0.05) -> np.ndarray:
        """``statsmodels`` spelling of :meth:`confint`. Uses ``1 - alpha``."""
        return self.confint(level=1.0 - alpha)



[docs]
    def predict(
        self,
        new_data: object,
        *,
        interval: str = "none",
        level: float = 0.95,
    ) -> np.ndarray:
        """Predict on new data, optionally with confidence/prediction bands.

        Accepts either:

        - a pandas ``DataFrame`` with the columns referenced by the original
          formula. The fit's stored formulaic ``ModelSpec`` re-applies any
          factor encoding, interactions, ``I(x**2)`` transforms, etc.
        - a 2-D NumPy array already shaped ``(n, p)``, matching the original
          design (intercept column included if the formula had one).

        Parameters
        ----------
        interval :
            ``"none"`` (default) returns the point predictions, shape ``(n,)``.
            ``"confidence"`` returns ``(n, 3)`` columns ``(fit, lwr, upr)``
            with the confidence interval for the *mean* response at each
            new observation (Var = X^T cov X).
            ``"prediction"`` returns ``(n, 3)`` with the prediction interval
            for a single *new* observation (Var = sigma^2 + X^T cov X).
        level :
            Confidence level for the interval. Default 0.95.

        Bands use the t-distribution with ``df_residual_`` degrees of
        freedom, mirroring R's ``predict.lm`` / ``predict.lmrob`` convention.
        """
        # pandas is a hard dependency, but only required when ``new_data`` is
        # a DataFrame. We dispatch by duck-typing to avoid a top-level import.
        is_dataframe = type(new_data).__name__ == "DataFrame" and hasattr(new_data, "columns")

        arr: np.ndarray
        if is_dataframe:
            if self.rhs_spec_ is None:
                # Fit used the simple-formula fast path; rebuild the design
                # by selecting term columns from new_data directly. Cast to
                # ``Any`` because we duck-typed ``new_data`` as a DataFrame
                # without importing pandas.
                df: Any = new_data
                cols = [c for c in self.term_names_ if c not in ("Intercept", "(Intercept)")]
                missing = [c for c in cols if c not in df.columns]
                if missing:
                    raise ValueError(
                        f"predict(DataFrame): missing columns in new_data: {missing!r}"
                    )
                feat = np.asarray(df.loc[:, cols].to_numpy(), dtype=np.float64)
                has_intercept = bool(
                    self.term_names_ and self.term_names_[0] in ("Intercept", "(Intercept)")
                )
                arr = np.column_stack([np.ones(feat.shape[0]), feat]) if has_intercept else feat
            else:
                from pylmrob.formula import apply_spec

                arr = apply_spec(self.rhs_spec_, new_data)
        else:
            arr = np.asarray(new_data, dtype=np.float64)

        if arr.ndim != 2:
            raise ValueError(f"predict expected a 2-D matrix; got shape {arr.shape}")
        if arr.shape[1] != self.coef_.size:
            raise ValueError(
                f"predict: design has {arr.shape[1]} columns but the fit "
                f"has {self.coef_.size} coefficients"
            )
        point = arr @ self.coef_
        if interval == "none":
            return point
        if interval not in ("confidence", "prediction"):
            raise ValueError(
                f"interval must be one of 'none', 'confidence', 'prediction'; got {interval!r}"
            )
        # Var(X beta) per row = sum((X cov) * X, axis=1) = diag(X cov X^T).
        var_fit = np.einsum("ij,jk,ik->i", arr, self.cov_, arr)
        var_fit = np.maximum(var_fit, 0.0)  # numerical safety
        var = var_fit + (self.scale_**2 if interval == "prediction" else 0.0)
        from scipy.stats import t as t_dist

        q = t_dist.ppf((1.0 + level) / 2.0, df=self.df_residual_)
        se = np.sqrt(var)
        return np.column_stack([point, point - q * se, point + q * se])



[docs]
    def predict_std(self, new_data: object, *, kind: str = "confidence") -> np.ndarray:
        """Standard deviation of the prediction at each new observation.

        Returns ``sqrt(Var(X^T beta_hat))`` (``kind="confidence"``, default)
        or ``sqrt(sigma^2 + Var(X^T beta_hat))`` (``kind="prediction"``).
        Use this when you want the raw SE and intend to build your own
        intervals (e.g. with a non-Gaussian distribution or for a Bayesian
        update); :meth:`predict` with ``interval="confidence"`` already
        returns t-quantile-scaled bands.
        """
        # Build the design matrix the same way predict() does, then return
        # only the standard error column.
        # Re-use predict() by extracting the (upr - point) / q value, but
        # that's wasteful. Cheaper to inline the design build.
        is_dataframe = type(new_data).__name__ == "DataFrame" and hasattr(new_data, "columns")
        arr: np.ndarray
        if is_dataframe:
            if self.rhs_spec_ is None:
                df: Any = new_data
                cols = [c for c in self.term_names_ if c not in ("Intercept", "(Intercept)")]
                missing = [c for c in cols if c not in df.columns]
                if missing:
                    raise ValueError(f"predict_std(DataFrame): missing columns: {missing!r}")
                feat = np.asarray(df.loc[:, cols].to_numpy(), dtype=np.float64)
                has_intercept = bool(
                    self.term_names_ and self.term_names_[0] in ("Intercept", "(Intercept)")
                )
                arr = np.column_stack([np.ones(feat.shape[0]), feat]) if has_intercept else feat
            else:
                from pylmrob.formula import apply_spec

                arr = apply_spec(self.rhs_spec_, new_data)
        else:
            arr = np.asarray(new_data, dtype=np.float64)
        if arr.ndim != 2 or arr.shape[1] != self.coef_.size:
            raise ValueError(
                f"predict_std: design shape {arr.shape} incompatible with "
                f"{self.coef_.size} coefficients"
            )
        var_fit = np.einsum("ij,jk,ik->i", arr, self.cov_, arr)
        var_fit = np.maximum(var_fit, 0.0)
        if kind == "confidence":
            return np.sqrt(var_fit)
        if kind == "prediction":
            return np.sqrt(var_fit + self.scale_ * self.scale_)
        raise ValueError(f"kind must be 'confidence' or 'prediction'; got {kind!r}")



[docs]
    def diagnostics(self, outlier_threshold: float = 2.5) -> object:
        """Per-observation diagnostic statistics.

        Returns a :class:`pylmrob.diagnostics.DiagnosticsTable` with
        leverage, robust Cook's distance, standardized residuals, the
        robust weights, and a boolean outlier flag
        (``|std_residuals| > outlier_threshold``).

        Requires the fit to have a stashed design matrix
        (``design_x_``); the default ``lmrob()`` call always stashes it.
        """
        if self.design_x_ is None:
            raise RuntimeError(
                "diagnostics() needs the design matrix; fit was created without design_x_"
            )
        from pylmrob.diagnostics import (
            DiagnosticsTable,
            cooks_distance,
            hatvalues,
        )
        from pylmrob.diagnostics import (
            dfbetas as _dfbetas,
        )

        h = hatvalues(self, self.design_x_)
        cd = cooks_distance(self, self.design_x_)
        dfb = _dfbetas(self, self.design_x_)
        sigma = max(self.scale_, 1e-300)
        z = self.residuals_ / sigma
        outliers = np.abs(z) > outlier_threshold
        # Masked outliers: rows that the robust fit rejected (rweights ~ 0)
        # that ALSO look far from the clean data's design centre. The "clean
        # data" is everything with rweights > 0.5. We compute leverage of
        # every row against the clean-data X'X; the contamination ends up
        # with high values because they sit outside the clean centroid in
        # X-space. Pure OLS leverage doesn't work here because the
        # contamination's own weight pulls the OLS centroid toward itself
        # (this is the masking phenomenon).
        X = self.design_x_
        n = X.shape[0]
        p = self.coef_.size
        wt_eps = 1e-3
        clean = self.rweights_ > 0.5
        masked_outliers = np.zeros(n, dtype=bool)
        if int(clean.sum()) > p:
            X_clean = X[clean]
            try:
                xtx_inv_clean = np.linalg.inv(X_clean.T @ X_clean)
                clean_lev = np.einsum("ij,jk,ik->i", X, xtx_inv_clean, X)
                # Threshold: median of the clean-rows' own leverages plus
                # 3 * MAD. Robust to the contamination block.
                clean_lev_in = clean_lev[clean]
                med = float(np.median(clean_lev_in))
                mad = float(np.median(np.abs(clean_lev_in - med))) * 1.4826
                lev_thresh = med + 3.0 * max(mad, 1e-12)
                masked_outliers = (clean_lev > lev_thresh) & (self.rweights_ < wt_eps)
            except np.linalg.LinAlgError:
                pass
        return DiagnosticsTable(
            leverage=h,
            cooks_distance=cd,
            std_residuals=z,
            rweights=self.rweights_,
            outliers=outliers,
            masked_outliers=masked_outliers,
            dfbetas=dfb,
        )



[docs]
    def bootstrap(
        self,
        n_boot: int = 1000,
        level: float = 0.95,
        seed: int | np.random.Generator | None = None,
        n_workers: int = 1,
    ) -> Any:  # BootstrapResult; see note at top of file re cyclic import
        """Method-style spelling of :func:`pylmrob.bootstrap`.

        Equivalent to ``pylmrob.bootstrap(self, n_boot=..., ...)``;
        matches the ``fit.anova()`` / ``fit.diagnostics()`` style.
        """
        from pylmrob.bootstrap import bootstrap as _bootstrap

        return _bootstrap(self, n_boot=n_boot, level=level, seed=seed, n_workers=n_workers)



[docs]
    def anova(self, *others: LmRobResults, test: str = "Wald") -> object:
        """Method-style spelling of :func:`pylmrob.anova`.

        Equivalent to ``pylmrob.anova(self, *others, test=test)``; lets
        you write ``full.anova(reduced)`` instead of the free-function
        form, matching R's idiom.
        """
        from pylmrob.anova import anova as _anova

        return _anova(self, *others, test=test)



[docs]
    def summary(self, style: str = "r", detail: str = "brief") -> SummaryLmRob:
        """Return a ``SummaryLmRob`` matching R's ``summary.lmrob``.

        Parameters
        ----------
        style
            ``"r"`` (default): R-style ``summary.lmrob`` output, matching
            ``robustbase`` line-for-line where practical.
            ``"statsmodels"``: a fixed-width table matching the
            ``statsmodels.iolib.summary.Summary`` layout. Use this when
            piping pylmrob fits into statsmodels-shaped reporting code.
        detail
            ``"brief"`` (default) emits the standard summary.
            ``"full"`` appends a footer with init method, init scale,
            MM iter count, and engine settings (``engine_c``, ``rng``).
            Use this when debugging convergence or unexpected results.

        The returned object stringifies to the chosen style via
        ``str()`` or ``print()``; calling its ``render`` method overrides
        the stored choice without rebuilding the object.
        """
        from pylmrob.summary import make_summary

        out = make_summary(self)
        out.style = style
        out.detail = detail
        return out


    def __repr__(self) -> str:
        """Compact R-style ``print.lmrob`` output.

        For the full coefficient table with std errors, t / p values and
        R-squared, call ``print(self.summary())``.
        """
        method = self.control.method or "MM"
        lines: list[str] = [
            f'lmrob(method="{method}", psi="{self.control.psi}")',
            "",
            "Coefficients:",
        ]
        # Column-aligned coefficient row matching R's print.
        name_w = max(11, max((len(n) for n in self.term_names_), default=11))
        header = "  ".join(f"{n:>{name_w}}" for n in self.term_names_)
        values = "  ".join(f"{v:>{name_w}.4g}" for v in self.coef_)
        lines.append(header)
        lines.append(values)
        return "\n".join(lines)