Source code for pylmrob.anova

# SPDX-License-Identifier: GPL-3.0-or-later
"""Robust Wald and Deviance tests for nested ``LmRobResults`` models.

Mirrors ``robustbase:::anova.lmrob``.

- **Wald** (default): ``chi2 = beta_drop' V[drop, drop]^{-1} beta_drop`` where
  ``beta_drop`` is the subvector of the largest model's coefficients that the
  reduced model omits and ``V`` is the largest model's robust covariance.
- **Deviance**: refits the reduced model via M-iteration at the full model's
  scale, then computes ``T = 2 * tauStar * (sum_rho_reduced - sum_rho_full)``
  with ``tauStar = mean(psi'(r/s0)) / mean(psi(r/s0)^2)`` evaluated at the
  full residuals.

P-value is from the chi-squared distribution with ``df = ndrop``.
"""

from __future__ import annotations

from dataclasses import dataclass

import numpy as np

from pylmrob.results import LmRobResults


[docs] @dataclass class AnovaTable: """Robust Wald-test table comparing nested ``lmrob`` fits. Columns mirror R's ``anova.lmrob`` output: ``pseudoDf``, ``Test.Stat``, ``Df``, ``Pr(>chisq)``. """ table: np.ndarray # shape (nmodels, 4) term_lists: list[list[str]] test: str method: str def __str__(self) -> str: title = "Robust Wald Test Table" if self.test == "Wald" else "Robust Deviance Table" rows: list[str] = [title, ""] for i, terms in enumerate(self.term_lists, start=1): rows.append(f"Model {i}: {' + '.join(terms) if terms else '(no terms)'}") rows.append(f"Largest model fitted by lmrob(), i.e. {self.method}") rows.append("") header = f"{'':<3} {'pseudoDf':>9} {'Test.Stat':>11} {'Df':>4} {'Pr(>chisq)':>11}" rows.append(header) for i, row in enumerate(self.table, start=1): pdf, ts, df, pv = row if np.isnan(ts): rows.append(f"{i:<3} {pdf:9.0f} {'':>11} {'':>4} {'':>11}") else: rows.append(f"{i:<3} {pdf:9.0f} {ts:11.4g} {df:4.0f} {pv:11.4g}") return "\n".join(rows) def __repr__(self) -> str: return self.__str__()
def _h0_indices(full_terms: list[str], reduced_terms: list[str]) -> list[int]: """Indices in ``full_terms`` that are absent from ``reduced_terms``. Reduced model must be strictly nested in the full one (every term in the reduced fit must appear in the full fit). """ full_set = set(full_terms) missing = [t for t in reduced_terms if t not in full_set] if missing: raise ValueError(f"models are not nested; reduced has terms not in full: {missing!r}") drop = [i for i, t in enumerate(full_terms) if t not in set(reduced_terms)] if not drop: raise ValueError("models are not strictly nested (no terms dropped)") return drop def _wald_pair(full: LmRobResults, reduced: LmRobResults) -> tuple[int, float, int, float]: """Wald chi-sq for one nested pair. Returns ``(pseudoDf, chi2, df_drop, p_value)`` where ``pseudoDf`` is R's ``n - p_full + df_drop``. """ from scipy.stats import chi2 as chi2_dist drop = _h0_indices(list(full.term_names_), list(reduced.term_names_)) coef = np.asarray(full.coef_, dtype=np.float64) cov = np.asarray(full.cov_, dtype=np.float64) h0_coef = coef[drop] h0_cov = cov[np.ix_(drop, drop)] sol = np.linalg.solve(h0_cov, h0_coef) chi2 = float(h0_coef @ sol) df = len(drop) p = full.coef_.size n = full.nobs_ pseudo_df = n - p + df pval = float(chi2_dist.sf(chi2, df=df)) return pseudo_df, chi2, df, pval def _deviance_pair(full: LmRobResults, reduced: LmRobResults) -> tuple[int, float, int, float]: """Deviance chi-sq for one nested pair. Refits the reduced model via M-iteration at the full's scale, then computes ``T = 2 * tauStar * (RM_sRho - FM_sRho)``. """ from scipy.stats import chi2 as chi2_dist from pylmrob import psi as _psi_mod from pylmrob._mm import mm_iterate if full.design_x_ is None or full.design_y_ is None: raise RuntimeError( "anova(test='Deviance') requires the full fit to carry its design " "matrix; refit with the current pylmrob to populate design_x_." ) if reduced.design_x_ is None: raise RuntimeError("reduced fit is missing design_x_; refit and try again") method = full.control.method or "MM" if not method.endswith("M"): raise ValueError( f"anova(test='Deviance') requires the full model's method to end " f"with 'M'; got method={method!r}." ) drop = _h0_indices(list(full.term_names_), list(reduced.term_names_)) df = len(drop) df_full = full.coef_.size n = int(full.nobs_) keep_idx = [i for i, t in enumerate(full.term_names_) if t in set(reduced.term_names_)] x_full = np.asarray(full.design_x_, dtype=np.float64) y = np.asarray(full.design_y_, dtype=np.float64) x_red = x_full[:, keep_idx] s0 = float(full.scale_) if s0 <= 0: raise ValueError("full model has non-positive scale; deviance test undefined") psi_family = full.control.psi or "bisquare" psi_k = tuple(np.atleast_1d(np.asarray(full.control.tuning_psi, dtype=float)).ravel()) beta_red_init = np.asarray(reduced.coef_, dtype=np.float64).copy() if beta_red_init.size != x_red.shape[1]: raise ValueError( f"reduced fit has {beta_red_init.size} coefficients but the kept " f"columns of the full design have {x_red.shape[1]} (term ordering mismatch)" ) rm = mm_iterate( X=x_red, y=y, beta_init=beta_red_init, sigma=s0, psi_family=psi_family, psi_k=psi_k, max_it=full.control.max_it, rel_tol=full.control.rel_tol, ) rm_res = y - x_red @ rm.coef fm_res = np.asarray(full.residuals_, dtype=np.float64) # Our ``psi.rho`` is the normalised chi (chi(inf)=1); R's ``Mpsi(deriv=-1)`` # is the unnormalised rho. Scale by ``rho_inf = 1 / chi_prime_factor`` to # recover R's convention before differencing. from pylmrob._psifuns import _chi_prime_factor psi_k_arr = np.asarray(psi_k, dtype=float) rho_inf = 1.0 / _chi_prime_factor(psi_family, psi_k_arr) fm_sRho = float(np.sum(_psi_mod.rho(fm_res / s0, psi_family, psi_k))) * rho_inf rm_sRho = float(np.sum(_psi_mod.rho(rm_res / s0, psi_family, psi_k))) * rho_inf psi_vals = _psi_mod.psi(fm_res / s0, psi_family, psi_k) psi_p_vals = _psi_mod.psi_prime(fm_res / s0, psi_family, psi_k) denom = float(np.mean(psi_vals**2)) if denom <= 0: raise ValueError("E[psi^2] is zero on full residuals; deviance test undefined") tau_star = float(np.mean(psi_p_vals) / denom) chi2 = 2.0 * tau_star * (rm_sRho - fm_sRho) pval = float(chi2_dist.sf(chi2, df=df)) pseudo_df = n - df_full + df return pseudo_df, chi2, df, pval
[docs] def anova(*fits: LmRobResults, test: str = "Wald") -> AnovaTable: """Robust nested-model anova on a sequence of ``LmRobResults``. The first argument is the largest (full) model; subsequent arguments are progressively reduced models. Each adjacent pair must be strictly nested via term names. Parameters ---------- fits : Two or more ``LmRobResults`` ordered from largest to smallest. test : ``"Wald"`` (default) or ``"Deviance"``. The Deviance test refits the reduced model via M-iteration at the full model's scale; it requires the full model's method to end with ``"M"``. """ if test not in ("Wald", "Deviance"): raise NotImplementedError(f"anova test={test!r}: use 'Wald' or 'Deviance'") if len(fits) < 2: raise ValueError("anova needs at least two fits") full = fits[0] nmodels = len(fits) table = np.full((nmodels, 4), np.nan, dtype=np.float64) table[0, 0] = full.nobs_ - full.coef_.size pair_fn = _wald_pair if test == "Wald" else _deviance_pair cur_full = full for k in range(1, nmodels): reduced = fits[k] if reduced.nobs_ != full.nobs_: raise ValueError("all fits in anova() must be on the same data (different nobs)") pseudo_df, chi2, df, pval = pair_fn(cur_full, reduced) table[k, 0] = pseudo_df table[k, 1] = chi2 table[k, 2] = df table[k, 3] = pval cur_full = reduced method = full.control.method or "MM" term_lists = [list(f.term_names_) for f in fits] return AnovaTable(table=table, term_lists=term_lists, test=test, method=method)
__all__ = ["AnovaTable", "anova"]