# SPDX-License-Identifier: GPL-3.0-or-later
"""Result object returned by ``lmrob`` fits."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
import numpy as np
if TYPE_CHECKING:
# NOTE: do *not* import pylmrob.bootstrap here. bootstrap.py imports
# LmRobResults from this module (also under TYPE_CHECKING), and
# CodeQL's py/unsafe-cyclic-import flags the bidirectional pair
# even though both are TYPE_CHECKING-only. The bootstrap() method
# below uses a forward-reference string annotation instead.
from pylmrob.control import Control
from pylmrob.summary import SummaryLmRob
[docs]
@dataclass
class LmRobResults:
"""Output of an ``lmrob`` fit.
Attributes mirror R's ``lmrob`` object where practical.
"""
coef_: np.ndarray
scale_: float
weights_: np.ndarray # final IRWLS weights = wgt(r/sigma)
rweights_: np.ndarray # alias for R's $rweights (same as weights_ here)
residuals_: np.ndarray
fitted_: np.ndarray
cov_: np.ndarray
df_residual_: int
converged_: bool
n_iter_: int
nobs_: int
term_names_: list[str]
control: Control
init_: dict[str, object] = field(default_factory=dict)
# formulaic ModelSpec for the RHS, used by ``predict`` to re-apply the
# design transformation (factor encoding, I(x**2), etc.) to new data.
rhs_spec_: object | None = None
# Design matrix and response stashed for downstream operations
# (e.g. anova(test="Deviance") refits reduced models on the full scale).
design_x_: np.ndarray | None = None
design_y_: np.ndarray | None = None
# Backwards-compatible alias to mirror R's $coefficients.
@property
def coefficients_(self) -> np.ndarray:
return self.coef_
@property
def standard_errors_(self) -> np.ndarray:
return np.sqrt(np.diag(self.cov_))
[docs]
def confint(
self,
level: float = 0.95,
method: str = "wald",
*,
n_boot: int = 1000,
seed: int | None = None,
n_workers: int = 1,
kind: str = "percentile",
) -> np.ndarray:
"""Confidence intervals for the regression coefficients.
Two methods:
- ``"wald"`` (default): asymptotic normal CIs from the sandwich
covariance. ``z * se`` where ``z`` is the standard-normal
quantile at ``(1 + level) / 2``.
- ``"bootstrap"``: case-resampling bootstrap; runs
:meth:`bootstrap` internally and returns the requested ``kind``
(``"percentile"`` or ``"basic"``) CIs.
Parameters
----------
level
Coverage level, e.g. ``0.95``.
method
``"wald"`` (default) or ``"bootstrap"``.
n_boot, seed, n_workers
Forwarded to :meth:`bootstrap` when ``method="bootstrap"``.
kind
Bootstrap CI kind: ``"percentile"`` or ``"basic"``. Ignored
for ``method="wald"``.
"""
if method == "wald":
from scipy.stats import norm
z = norm.ppf((1 + level) / 2)
se = self.standard_errors_
lo = self.coef_ - z * se
hi = self.coef_ + z * se
return np.column_stack([lo, hi])
if method == "bootstrap":
boot = self.bootstrap(n_boot=n_boot, level=level, seed=seed, n_workers=n_workers)
ci = boot.percentile_ci if kind == "percentile" else boot.basic_ci
return np.asarray(ci, dtype=np.float64)
raise ValueError(f"method must be 'wald' or 'bootstrap', got {method!r}")
# ------------------------------------------------------------------
# statsmodels-style attribute aliases. Let pylmrob fits drop into
# statsmodels.regression-shaped code without adapters.
# ------------------------------------------------------------------
@property
def params(self) -> np.ndarray:
return self.coef_
@property
def bse(self) -> np.ndarray:
return self.standard_errors_
@property
def tvalues(self) -> np.ndarray:
se = self.standard_errors_
with np.errstate(divide="ignore", invalid="ignore"):
return np.where(se > 0, self.coef_ / np.where(se > 0, se, 1.0), np.nan)
@property
def pvalues(self) -> np.ndarray:
from scipy.stats import t as t_dist
return 2.0 * t_dist.sf(np.abs(self.tvalues), df=self.df_residual_)
[docs]
def conf_int(self, alpha: float = 0.05) -> np.ndarray:
"""``statsmodels`` spelling of :meth:`confint`. Uses ``1 - alpha``."""
return self.confint(level=1.0 - alpha)
[docs]
def predict(
self,
new_data: object,
*,
interval: str = "none",
level: float = 0.95,
) -> np.ndarray:
"""Predict on new data, optionally with confidence/prediction bands.
Accepts either:
- a pandas ``DataFrame`` with the columns referenced by the original
formula. The fit's stored formulaic ``ModelSpec`` re-applies any
factor encoding, interactions, ``I(x**2)`` transforms, etc.
- a 2-D NumPy array already shaped ``(n, p)``, matching the original
design (intercept column included if the formula had one).
Parameters
----------
interval :
``"none"`` (default) returns the point predictions, shape ``(n,)``.
``"confidence"`` returns ``(n, 3)`` columns ``(fit, lwr, upr)``
with the confidence interval for the *mean* response at each
new observation (Var = X^T cov X).
``"prediction"`` returns ``(n, 3)`` with the prediction interval
for a single *new* observation (Var = sigma^2 + X^T cov X).
level :
Confidence level for the interval. Default 0.95.
Bands use the t-distribution with ``df_residual_`` degrees of
freedom, mirroring R's ``predict.lm`` / ``predict.lmrob`` convention.
"""
# pandas is a hard dependency, but only required when ``new_data`` is
# a DataFrame. We dispatch by duck-typing to avoid a top-level import.
is_dataframe = type(new_data).__name__ == "DataFrame" and hasattr(new_data, "columns")
arr: np.ndarray
if is_dataframe:
if self.rhs_spec_ is None:
# Fit used the simple-formula fast path; rebuild the design
# by selecting term columns from new_data directly. Cast to
# ``Any`` because we duck-typed ``new_data`` as a DataFrame
# without importing pandas.
df: Any = new_data
cols = [c for c in self.term_names_ if c not in ("Intercept", "(Intercept)")]
missing = [c for c in cols if c not in df.columns]
if missing:
raise ValueError(
f"predict(DataFrame): missing columns in new_data: {missing!r}"
)
feat = np.asarray(df.loc[:, cols].to_numpy(), dtype=np.float64)
has_intercept = bool(
self.term_names_ and self.term_names_[0] in ("Intercept", "(Intercept)")
)
arr = np.column_stack([np.ones(feat.shape[0]), feat]) if has_intercept else feat
else:
from pylmrob.formula import apply_spec
arr = apply_spec(self.rhs_spec_, new_data)
else:
arr = np.asarray(new_data, dtype=np.float64)
if arr.ndim != 2:
raise ValueError(f"predict expected a 2-D matrix; got shape {arr.shape}")
if arr.shape[1] != self.coef_.size:
raise ValueError(
f"predict: design has {arr.shape[1]} columns but the fit "
f"has {self.coef_.size} coefficients"
)
point = arr @ self.coef_
if interval == "none":
return point
if interval not in ("confidence", "prediction"):
raise ValueError(
f"interval must be one of 'none', 'confidence', 'prediction'; got {interval!r}"
)
# Var(X beta) per row = sum((X cov) * X, axis=1) = diag(X cov X^T).
var_fit = np.einsum("ij,jk,ik->i", arr, self.cov_, arr)
var_fit = np.maximum(var_fit, 0.0) # numerical safety
var = var_fit + (self.scale_**2 if interval == "prediction" else 0.0)
from scipy.stats import t as t_dist
q = t_dist.ppf((1.0 + level) / 2.0, df=self.df_residual_)
se = np.sqrt(var)
return np.column_stack([point, point - q * se, point + q * se])
[docs]
def predict_std(self, new_data: object, *, kind: str = "confidence") -> np.ndarray:
"""Standard deviation of the prediction at each new observation.
Returns ``sqrt(Var(X^T beta_hat))`` (``kind="confidence"``, default)
or ``sqrt(sigma^2 + Var(X^T beta_hat))`` (``kind="prediction"``).
Use this when you want the raw SE and intend to build your own
intervals (e.g. with a non-Gaussian distribution or for a Bayesian
update); :meth:`predict` with ``interval="confidence"`` already
returns t-quantile-scaled bands.
"""
# Build the design matrix the same way predict() does, then return
# only the standard error column.
# Re-use predict() by extracting the (upr - point) / q value, but
# that's wasteful. Cheaper to inline the design build.
is_dataframe = type(new_data).__name__ == "DataFrame" and hasattr(new_data, "columns")
arr: np.ndarray
if is_dataframe:
if self.rhs_spec_ is None:
df: Any = new_data
cols = [c for c in self.term_names_ if c not in ("Intercept", "(Intercept)")]
missing = [c for c in cols if c not in df.columns]
if missing:
raise ValueError(f"predict_std(DataFrame): missing columns: {missing!r}")
feat = np.asarray(df.loc[:, cols].to_numpy(), dtype=np.float64)
has_intercept = bool(
self.term_names_ and self.term_names_[0] in ("Intercept", "(Intercept)")
)
arr = np.column_stack([np.ones(feat.shape[0]), feat]) if has_intercept else feat
else:
from pylmrob.formula import apply_spec
arr = apply_spec(self.rhs_spec_, new_data)
else:
arr = np.asarray(new_data, dtype=np.float64)
if arr.ndim != 2 or arr.shape[1] != self.coef_.size:
raise ValueError(
f"predict_std: design shape {arr.shape} incompatible with "
f"{self.coef_.size} coefficients"
)
var_fit = np.einsum("ij,jk,ik->i", arr, self.cov_, arr)
var_fit = np.maximum(var_fit, 0.0)
if kind == "confidence":
return np.sqrt(var_fit)
if kind == "prediction":
return np.sqrt(var_fit + self.scale_ * self.scale_)
raise ValueError(f"kind must be 'confidence' or 'prediction'; got {kind!r}")
[docs]
def diagnostics(self, outlier_threshold: float = 2.5) -> object:
"""Per-observation diagnostic statistics.
Returns a :class:`pylmrob.diagnostics.DiagnosticsTable` with
leverage, robust Cook's distance, standardized residuals, the
robust weights, and a boolean outlier flag
(``|std_residuals| > outlier_threshold``).
Requires the fit to have a stashed design matrix
(``design_x_``); the default ``lmrob()`` call always stashes it.
"""
if self.design_x_ is None:
raise RuntimeError(
"diagnostics() needs the design matrix; fit was created without design_x_"
)
from pylmrob.diagnostics import (
DiagnosticsTable,
cooks_distance,
hatvalues,
)
from pylmrob.diagnostics import (
dfbetas as _dfbetas,
)
h = hatvalues(self, self.design_x_)
cd = cooks_distance(self, self.design_x_)
dfb = _dfbetas(self, self.design_x_)
sigma = max(self.scale_, 1e-300)
z = self.residuals_ / sigma
outliers = np.abs(z) > outlier_threshold
# Masked outliers: rows that the robust fit rejected (rweights ~ 0)
# that ALSO look far from the clean data's design centre. The "clean
# data" is everything with rweights > 0.5. We compute leverage of
# every row against the clean-data X'X; the contamination ends up
# with high values because they sit outside the clean centroid in
# X-space. Pure OLS leverage doesn't work here because the
# contamination's own weight pulls the OLS centroid toward itself
# (this is the masking phenomenon).
X = self.design_x_
n = X.shape[0]
p = self.coef_.size
wt_eps = 1e-3
clean = self.rweights_ > 0.5
masked_outliers = np.zeros(n, dtype=bool)
if int(clean.sum()) > p:
X_clean = X[clean]
try:
xtx_inv_clean = np.linalg.inv(X_clean.T @ X_clean)
clean_lev = np.einsum("ij,jk,ik->i", X, xtx_inv_clean, X)
# Threshold: median of the clean-rows' own leverages plus
# 3 * MAD. Robust to the contamination block.
clean_lev_in = clean_lev[clean]
med = float(np.median(clean_lev_in))
mad = float(np.median(np.abs(clean_lev_in - med))) * 1.4826
lev_thresh = med + 3.0 * max(mad, 1e-12)
masked_outliers = (clean_lev > lev_thresh) & (self.rweights_ < wt_eps)
except np.linalg.LinAlgError:
pass
return DiagnosticsTable(
leverage=h,
cooks_distance=cd,
std_residuals=z,
rweights=self.rweights_,
outliers=outliers,
masked_outliers=masked_outliers,
dfbetas=dfb,
)
[docs]
def bootstrap(
self,
n_boot: int = 1000,
level: float = 0.95,
seed: int | np.random.Generator | None = None,
n_workers: int = 1,
) -> Any: # BootstrapResult; see note at top of file re cyclic import
"""Method-style spelling of :func:`pylmrob.bootstrap`.
Equivalent to ``pylmrob.bootstrap(self, n_boot=..., ...)``;
matches the ``fit.anova()`` / ``fit.diagnostics()`` style.
"""
from pylmrob.bootstrap import bootstrap as _bootstrap
return _bootstrap(self, n_boot=n_boot, level=level, seed=seed, n_workers=n_workers)
[docs]
def anova(self, *others: LmRobResults, test: str = "Wald") -> object:
"""Method-style spelling of :func:`pylmrob.anova`.
Equivalent to ``pylmrob.anova(self, *others, test=test)``; lets
you write ``full.anova(reduced)`` instead of the free-function
form, matching R's idiom.
"""
from pylmrob.anova import anova as _anova
return _anova(self, *others, test=test)
[docs]
def summary(self, style: str = "r", detail: str = "brief") -> SummaryLmRob:
"""Return a ``SummaryLmRob`` matching R's ``summary.lmrob``.
Parameters
----------
style
``"r"`` (default): R-style ``summary.lmrob`` output, matching
``robustbase`` line-for-line where practical.
``"statsmodels"``: a fixed-width table matching the
``statsmodels.iolib.summary.Summary`` layout. Use this when
piping pylmrob fits into statsmodels-shaped reporting code.
detail
``"brief"`` (default) emits the standard summary.
``"full"`` appends a footer with init method, init scale,
MM iter count, and engine settings (``engine_c``, ``rng``).
Use this when debugging convergence or unexpected results.
The returned object stringifies to the chosen style via
``str()`` or ``print()``; calling its ``render`` method overrides
the stored choice without rebuilding the object.
"""
from pylmrob.summary import make_summary
out = make_summary(self)
out.style = style
out.detail = detail
return out
def __repr__(self) -> str:
"""Compact R-style ``print.lmrob`` output.
For the full coefficient table with std errors, t / p values and
R-squared, call ``print(self.summary())``.
"""
method = self.control.method or "MM"
lines: list[str] = [
f'lmrob(method="{method}", psi="{self.control.psi}")',
"",
"Coefficients:",
]
# Column-aligned coefficient row matching R's print.
name_w = max(11, max((len(n) for n in self.term_names_), default=11))
header = " ".join(f"{n:>{name_w}}" for n in self.term_names_)
values = " ".join(f"{v:>{name_w}.4g}" for v in self.coef_)
lines.append(header)
lines.append(values)
return "\n".join(lines)