igerber · igerber · Jun 27, 2026 · Jun 27, 2026
diff --git a/TODO.md b/TODO.md
@@ -164,7 +164,6 @@ Deferred items from PR reviews that were not addressed before merge.
 | Rust-backend HC2 implementation. Current Rust path only supports HC1; HC2 and CR2 Bell-McCaffrey fall through to the NumPy backend. For large-n fits this is noticeable. | `rust/src/linalg.rs` | Phase 1a | Low |
 | CR2 Bell-McCaffrey DOF uses a naive `O(n² k)` per-coefficient loop over cluster pairs. Pustejovsky-Tipton (2018) Appendix B has a scores-based formulation that avoids the full `n × n` `M` matrix. Switch when a user hits a large-`n` cluster-robust design. | `linalg.py::_compute_cr2_bm` | Phase 1a | Low |
 | `SyntheticControl` retains a full `_SyntheticControlFitSnapshot` (pivoted outcome/predictor panels) on EVERY fit to support the opt-in `in_space_placebo()`, so callers who never run the placebo still pay O(units × periods × predictor-vars) memory (same as `SyntheticDiD`'s always-on snapshot for `in_time_placebo`). Store a compact array/index representation instead of per-variable DataFrames, or build the snapshot lazily on first placebo call (would need to retain the source data, ~same cost). | `synthetic_control.py` snapshot build, `synthetic_control_results.py::_SyntheticControlFitSnapshot` | follow-up | Low |
-| EfficientDiD DR (covariate) path rebuilds the full polynomial sieve basis `_polynomial_sieve_basis(X, K)` for every candidate `K` inside each of the three nuisance fits (outcome regression, propensity ratio, inverse propensity), per `fit()`. After the growing-sieve cap removal (PR-B), large covariate-adjusted fits at large `n` pay more avoidable basis-construction cost. Cache the basis per `(X, K)` within a `fit()` and share it across the nuisance helpers. | `diff_diff/efficient_did_covariates.py` (the three sieve helpers) | PR-B follow-up | Low |
 | Wild cluster bootstrap CI inversion calls `_t_star(r)` ~O(100) times (outward bracketing + bisection per endpoint), and each call materializes a fresh `(B × n)` `y_star` matrix plus the `(k × B)` refit and `(n × B)` residual arrays. For large panels or large `n_bootstrap` this allocation churn is noticeable. The bootstrap is for the few-cluster regime (small `B` when enumerated; `n` typically modest), so it is acceptable today; if a large-`n`/large-`B` user hits it, chunk `_t_star` over bootstrap draws or precompute the `r`-independent cluster-level pieces (the restricted residuals are linear in `r`) so each inversion evaluation avoids rebuilding the full `B × n` matrix. | `diff_diff/utils.py::wild_bootstrap_se._t_star` | #543 | Low |
 
 #### Testing/Docs

diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py
@@ -789,6 +789,11 @@ def fit(
         m_hat_cache: Dict[Tuple, np.ndarray] = {}
         r_hat_cache: Dict[Tuple[float, float], np.ndarray] = {}
         s_hat_cache: Dict[float, np.ndarray] = {}  # inverse propensities per group
+        # Per-fit cache of the polynomial sieve basis, keyed on (id(X), degree). The three
+        # sieve nuisance helpers all build the basis from the same fit-level
+        # `covariate_matrix`, so this shares each distinct degree's basis across them
+        # instead of rebuilding it on every helper call. Lives only for this fit() call.
+        sieve_basis_cache: Dict[Tuple[int, int], np.ndarray] = {}
 
         if use_covariates:
             assert covariates is not None  # for type narrowing
@@ -934,6 +939,7 @@ def fit(
                                 k_max=self.sieve_k_max,
                                 criterion=self.sieve_criterion,
                                 unit_weights=unit_level_weights,
+                                basis_cache=sieve_basis_cache,
                             )
                         # m_{g', tpre, 1}(X)
                         key_gp_tpre = (gp, tpre_col_val, effective_p1_col)
@@ -950,6 +956,7 @@ def fit(
                                 k_max=self.sieve_k_max,
                                 criterion=self.sieve_criterion,
                                 unit_weights=unit_level_weights,
+                                basis_cache=sieve_basis_cache,
                             )
                         # r_{g, inf}(X) and r_{g, g'}(X) via sieve (Eq 4.1-4.2)
                         for comp in {np.inf, gp}:
@@ -966,6 +973,7 @@ def fit(
                                     criterion=self.sieve_criterion,
                                     ratio_clip=self.ratio_clip,
                                     unit_weights=unit_level_weights,
+                                    basis_cache=sieve_basis_cache,
                                 )
 
                     # Per-unit DR generated outcomes: shape (n_units, H)
@@ -998,6 +1006,7 @@ def fit(
                                 k_max=self.sieve_k_max,
                                 criterion=self.sieve_criterion,
                                 unit_weights=unit_level_weights,
+                                basis_cache=sieve_basis_cache,
                             )
 
                     # Conditional Omega*(X) with per-unit propensities (Eq 3.12)

diff --git a/diff_diff/efficient_did_covariates.py b/diff_diff/efficient_did_covariates.py
@@ -42,6 +42,7 @@ def estimate_outcome_regression(
     k_max: Optional[int] = None,
     criterion: str = "bic",
     unit_weights: Optional[np.ndarray] = None,
+    basis_cache: Optional[Dict[Tuple[int, int], np.ndarray]] = None,
 ) -> np.ndarray:
     r"""Estimate conditional mean outcome change m_hat(X) via a polynomial sieve.
 
@@ -169,7 +170,7 @@ def estimate_outcome_regression(
         if n_basis >= n_pos:
             break
 
-        basis_all = _polynomial_sieve_basis(covariate_matrix, K)
+        basis_all = _sieve_basis_cached(covariate_matrix, K, basis_cache)
         basis_group = basis_all[group_mask]
 
         # Rank guard on the (weighted) design Gram, mirroring the propensity sieve.
@@ -288,6 +289,38 @@ def _polynomial_sieve_basis(X: np.ndarray, degree: int) -> np.ndarray:
     return np.column_stack(columns)
 
 
+def _sieve_basis_cached(
+    X: np.ndarray, degree: int, cache: Optional[Dict[Tuple[int, int], np.ndarray]]
+) -> np.ndarray:
+    """Per-fit memoized :func:`_polynomial_sieve_basis`.
+
+    ``cache`` is a dict owned by one ``EfficientDiD.fit()`` and shared across the three
+    sieve nuisance helpers, which all receive the same fit-level ``covariate_matrix``.
+    The basis is a pure function of ``(X, degree)``, so for any degree reached by more
+    than one helper (every helper starts at ``K=1`` on the same ``X``) the identical
+    array would otherwise be rebuilt from scratch each time.
+
+    Keyed on ``(id(X), degree)``: ``X`` is fixed for a fit, so the basis depends only on
+    ``degree``; ``id(X)`` guards against accidental reuse of a cache with a different
+    matrix. The cache lives only for the duration of one ``fit()``, during which
+    ``covariate_matrix`` stays alive, so its ``id`` is stable and cannot be recycled for
+    another array; hence there is no cross-fit leak and no ``id``-reuse hazard.
+
+    When ``cache is None`` (the default for any standalone caller) this is a plain
+    pass-through to :func:`_polynomial_sieve_basis`, so non-``EfficientDiD`` callers
+    behave exactly as before. The helpers only read the returned array (no in-place
+    mutation), so sharing one cached object gives results bit-identical to a rebuild.
+    """
+    if cache is None:
+        return _polynomial_sieve_basis(X, degree)
+    key = (id(X), degree)
+    basis = cache.get(key)
+    if basis is None:
+        basis = _polynomial_sieve_basis(X, degree)
+        cache[key] = basis
+    return basis
+
+
 def estimate_propensity_ratio_sieve(
     covariate_matrix: np.ndarray,
     mask_g: np.ndarray,
@@ -296,6 +329,7 @@ def estimate_propensity_ratio_sieve(
     criterion: str = "bic",
     ratio_clip: float = 20.0,
     unit_weights: Optional[np.ndarray] = None,
+    basis_cache: Optional[Dict[Tuple[int, int], np.ndarray]] = None,
 ) -> np.ndarray:
     r"""Estimate propensity ratio via sieve convex minimization (Eq 4.1-4.2).
 
@@ -396,7 +430,7 @@ def estimate_propensity_ratio_sieve(
         if n_basis >= n_gp_pos:
             break
 
-        basis_all = _polynomial_sieve_basis(covariate_matrix, K)
+        basis_all = _sieve_basis_cached(covariate_matrix, K, basis_cache)
         Psi_gp = basis_all[mask_gp]  # (n_gp, n_basis)
         Psi_g = basis_all[mask_g]  # (n_g, n_basis)
 
@@ -496,6 +530,7 @@ def estimate_inverse_propensity_sieve(
     k_max: Optional[int] = None,
     criterion: str = "bic",
     unit_weights: Optional[np.ndarray] = None,
+    basis_cache: Optional[Dict[Tuple[int, int], np.ndarray]] = None,
 ) -> np.ndarray:
     r"""Estimate s_{g'}(X) = 1/p_{g'}(X) via sieve convex minimization.
 
@@ -586,7 +621,7 @@ def estimate_inverse_propensity_sieve(
         if n_basis >= n_group_pos:
             break
 
-        basis_all = _polynomial_sieve_basis(covariate_matrix, K)
+        basis_all = _sieve_basis_cached(covariate_matrix, K, basis_cache)
         Psi_gp = basis_all[group_mask]
 
         # Normal equations (weighted when survey weights present):

diff --git a/tests/test_efficient_did.py b/tests/test_efficient_did.py
@@ -2787,3 +2787,107 @@ def test_fit_clone_idempotent_on_vcov_type(self):
         assert r1.overall_att == r2.overall_att
         assert r1.overall_se == r2.overall_se
         assert r1.vcov_type == r2.vcov_type
+
+
+class TestSieveBasisCache:
+    """The per-fit sieve-basis cache shares ``_polynomial_sieve_basis(X, K)`` across the
+    three DR nuisance helpers. Because the basis is a pure function of ``(X, degree)`` and
+    the helpers only read it, caching gives results bit-identical to rebuilding. These
+    tests pin the cache mechanism; end-to-end bit-identity was additionally verified
+    against an origin/main capture during development."""
+
+    def test_cache_hit_returns_same_object_and_is_bit_identical(self):
+        from diff_diff.efficient_did_covariates import (
+            _polynomial_sieve_basis,
+            _sieve_basis_cached,
+        )
+
+        rng = np.random.default_rng(0)
+        X = rng.normal(size=(40, 2))
+        cache: dict = {}
+        a = _sieve_basis_cached(X, 2, cache)
+        b = _sieve_basis_cached(X, 2, cache)
+        # Cache hit returns the SAME object (so downstream reads see identical bytes)...
+        assert a is b
+        assert len(cache) == 1
+        # ...and it equals a fresh build bit-for-bit.
+        np.testing.assert_array_equal(a, _polynomial_sieve_basis(X, 2))
+        # A different degree adds a second, distinct entry.
+        c = _sieve_basis_cached(X, 3, cache)
+        assert len(cache) == 2
+        assert c is not a
+        np.testing.assert_array_equal(c, _polynomial_sieve_basis(X, 3))
+
+    def test_cache_none_is_plain_passthrough(self):
+        from diff_diff.efficient_did_covariates import (
+            _polynomial_sieve_basis,
+            _sieve_basis_cached,
+        )
+
+        rng = np.random.default_rng(1)
+        X = rng.normal(size=(30, 2))
+        a = _sieve_basis_cached(X, 2, None)
+        b = _sieve_basis_cached(X, 2, None)
+        # No cache: distinct fresh arrays, each equal to a direct build.
+        assert a is not b
+        np.testing.assert_array_equal(a, b)
+        np.testing.assert_array_equal(a, _polynomial_sieve_basis(X, 2))
+
+    def test_reads_do_not_mutate_cached_basis(self):
+        from diff_diff.efficient_did_covariates import (
+            _polynomial_sieve_basis,
+            _sieve_basis_cached,
+        )
+
+        rng = np.random.default_rng(2)
+        X = rng.normal(size=(50, 2))
+        pristine = _polynomial_sieve_basis(X, 2)
+        cache: dict = {}
+        cached = _sieve_basis_cached(X, 2, cache)
+        # Representative read-only operations the helpers perform on basis_all.
+        mask = np.arange(50) % 2 == 0
+        _ = cached[mask]
+        _ = cached @ np.ones(cached.shape[1])
+        _ = (np.ones(50)[:, None] * cached).sum(axis=0)
+        _ = cached.sum(axis=0)
+        # Re-fetch: still the same object and still bit-identical to the pristine build.
+        again = _sieve_basis_cached(X, 2, cache)
+        assert again is cached
+        np.testing.assert_array_equal(again, pristine)
+
+    def test_fit_builds_each_degree_once_across_helpers(self, monkeypatch):
+        """End-to-end: a covariate DR fit requests the basis many times (3 helpers ×
+        multiple (g,t) cells) but builds each distinct degree exactly once, proving the
+        per-fit cache actually shares work."""
+        import diff_diff.efficient_did_covariates as cov
+
+        real_build = cov._polynomial_sieve_basis
+        real_cached = cov._sieve_basis_cached
+        build_keys: list = []  # one entry per ACTUAL _polynomial_sieve_basis build
+        request_keys: list = []  # one entry per _sieve_basis_cached request
+
+        def counting_build(X, degree):
+            build_keys.append((id(X), degree))
+            return real_build(X, degree)
+
+        def counting_cached(X, degree, cache):
+            request_keys.append((id(X), degree))
+            return real_cached(X, degree, cache)
+
+        monkeypatch.setattr(cov, "_polynomial_sieve_basis", counting_build)
+        monkeypatch.setattr(cov, "_sieve_basis_cached", counting_cached)
+
+        df = _make_covariate_panel(n_units=150)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            result = EfficientDiD(pt_assumption="post").fit(
+                df, "y", "unit", "time", "first_treat", covariates=["x1", "x2"]
+            )
+        assert np.isfinite(result.overall_att)
+        # The path was exercised through the cache.
+        assert request_keys, "covariate DR path did not run the sieve helpers"
+        # Each distinct (X, degree) was built exactly once (perfect dedup)...
+        assert len(build_keys) == len(set(build_keys))
+        assert len(build_keys) == len(set(request_keys))
+        # ...and there was genuine redundancy for the cache to eliminate.
+        assert len(request_keys) > len(build_keys)