"""Rank-weighted Average Treatment Effect (RATE) metrics.

Implements the Targeting Operator Characteristic (TOC) curve and the
rank-weighted scores built on top of it.  See Yadlowsky et al. (2021),
`Evaluating Treatment Prioritization Rules via Rank-Weighted Average
Treatment Effects`, https://arxiv.org/abs/2111.07966.
"""
import logging
from typing import Optional

import numpy as np
import pandas as pd

try:
    # matplotlib is only needed by plot_toc(); import defensively so the
    # metric functions (get_toc, rate_score) remain usable when no plotting
    # backend is installed.
    from matplotlib import pyplot as plt
except ImportError:  # pragma: no cover - plotting is optional
    plt = None

# NOTE(review): the original module called plt.style.use("fivethirtyeight")
# at import time, which mutates matplotlib's global state for every
# application that merely imports causalml.metrics.  That global side effect
# is removed; callers who want the style can opt in explicitly.

RANDOM_COL = "Random"

logger = logging.getLogger("causalml")


def get_toc(
    df,
    outcome_col="y",
    treatment_col="w",
    treatment_effect_col="tau",
    normalize=False,
):
    """Get the Targeting Operator Characteristic (TOC) of model estimates.

    TOC(q) is the difference between the ATE among the top-q fraction of units
    ranked by the prioritization score and the overall ATE.  A positive TOC at
    low q indicates the model successfully identifies units with above-average
    treatment benefit.  By definition, TOC(0) = 0 and TOC(1) = 0 (the subset
    ATE equals the overall ATE when the entire population is selected).

    If the true treatment effect (``treatment_effect_col``) is present without
    nulls (e.g. in synthetic data), it is used directly to calculate TOC.
    Otherwise both ``outcome_col`` and ``treatment_col`` must be provided and
    the effect is estimated as the difference between the mean outcomes of the
    treatment and control groups within each top-q subset.

    Note: when using observed outcomes, if a top-q subset contains only
    treated or only control units, the code falls back to TOC(q) = 0 for that
    subset (i.e., the subset ATE is set to the overall ATE).  This is a
    conservative approximation and is logged as a warning.

    For details, see Yadlowsky et al. (2021), `Evaluating Treatment
    Prioritization Rules via Rank-Weighted Average Treatment Effects`.
    https://arxiv.org/abs/2111.07966

    Args:
        df (pandas.DataFrame): a data frame with model estimates and actual data as columns
        outcome_col (str, optional): the column name for the actual outcome
        treatment_col (str, optional): the column name for the treatment indicator (0 or 1)
        treatment_effect_col (str, optional): the column name for the true treatment effect
        normalize (bool, optional): whether to normalize the TOC curve by its maximum
            absolute value. Uses max(|TOC|) as the reference to avoid division by zero
            at q=1 where TOC is always zero by definition.

    Returns:
        (pandas.DataFrame): TOC values of model estimates in population, indexed by quantile q
    """
    assert (
        (outcome_col in df.columns and df[outcome_col].notnull().all())
        and (treatment_col in df.columns and df[treatment_col].notnull().all())
        or (
            treatment_effect_col in df.columns
            and df[treatment_effect_col].notnull().all()
        )
    ), "{outcome_col} and {treatment_col}, or {treatment_effect_col} should be present without null.".format(
        outcome_col=outcome_col,
        treatment_col=treatment_col,
        treatment_effect_col=treatment_effect_col,
    )

    df = df.copy()

    # Every column that is not outcome/treatment/true-effect is treated as a
    # model score to evaluate.
    model_names = [
        x
        for x in df.columns
        if x not in [outcome_col, treatment_col, treatment_effect_col]
    ]

    use_oracle = (
        treatment_effect_col in df.columns and df[treatment_effect_col].notnull().all()
    )

    if use_oracle:
        overall_ate = df[treatment_effect_col].mean()
    else:
        treated = df[treatment_col] == 1
        overall_ate = (
            df.loc[treated, outcome_col].mean() - df.loc[~treated, outcome_col].mean()
        )

    n_total = len(df)

    toc = []
    for col in model_names:
        sorted_df = df.sort_values(col, ascending=False).reset_index(drop=True)

        if use_oracle:
            # O(n) running mean of the true effects via cumulative sum.
            cumsum_tau = sorted_df[treatment_effect_col].cumsum().values
            counts = np.arange(1, n_total + 1)
            subset_ates = cumsum_tau / counts
        else:
            cumsum_tr = sorted_df[treatment_col].cumsum().values
            cumsum_ct = np.arange(1, n_total + 1) - cumsum_tr
            cumsum_y_tr = (
                (sorted_df[outcome_col] * sorted_df[treatment_col]).cumsum().values
            )
            cumsum_y_ct = (
                (sorted_df[outcome_col] * (1 - sorted_df[treatment_col]))
                .cumsum()
                .values
            )

            # Guard against division by zero when a band is all-treated or
            # all-control; fall back to overall_ate (TOC = 0) for those positions.
            with np.errstate(invalid="ignore", divide="ignore"):
                subset_ates = np.where(
                    (cumsum_tr == 0) | (cumsum_ct == 0),
                    overall_ate,
                    cumsum_y_tr / cumsum_tr - cumsum_y_ct / cumsum_ct,
                )

            if np.any((cumsum_tr == 0) | (cumsum_ct == 0)):
                logger.warning(
                    "Some quantile bands contain only treated or only control units "
                    "for column '%s'. TOC is set to 0 for those positions.",
                    col,
                )

        toc_values = subset_ates - overall_ate
        toc.append(pd.Series(toc_values, index=np.linspace(0, 1, n_total + 1)[1:]))

    toc = pd.concat(toc, join="inner", axis=1)
    # Anchor the curve at TOC(0) = 0 and restore ascending quantile order.
    toc.loc[0] = np.zeros((toc.shape[1],))
    toc = toc.sort_index().interpolate()
    toc.columns = model_names
    toc.index.name = "q"

    if normalize:
        # Normalize by max absolute value rather than the value at q=1, which is
        # always zero by definition and would cause division by zero.
        max_abs = toc.abs().max()
        max_abs = max_abs.replace(0, 1)  # guard for flat TOC curves
        toc = toc.div(max_abs, axis=1)

    return toc
def rate_score(
    df,
    outcome_col="y",
    treatment_col="w",
    treatment_effect_col="tau",
    weighting="autoc",
    normalize=False,
):
    """Calculate the Rank-weighted Average Treatment Effect (RATE) score.

    RATE is the weighted area under the Targeting Operator Characteristic (TOC) curve:

        RATE = integral_0^1 alpha(q) * TOC(q) dq

    Two standard weighting schemes are supported (Yadlowsky et al., 2021):

    - ``"autoc"``: alpha(q) = 1, i.e. the plain area under the TOC curve.
      Most powerful when treatment effects are concentrated in a small subgroup.

    - ``"qini"``: alpha(q) = q, which reduces to the Qini coefficient.
      More powerful when treatment effects are diffuse across the population.

    A positive RATE indicates the prioritization rule effectively identifies units with
    above-average treatment benefit. A RATE near zero suggests little heterogeneity or
    a poor prioritization rule.

    Note: the integral is approximated via a weighted mean over the discrete quantile
    grid using midpoint values, with weights normalized to sum to 1
    (``weights / weights.sum()``).  For ``"autoc"`` this equals the midpoint-rule
    integral; for ``"qini"`` the normalization rescales the integral by a constant
    factor (2), so absolute values differ slightly from the paper's continuous
    definition while model rankings are preserved.

    For details, see Yadlowsky et al. (2021), `Evaluating Treatment Prioritization Rules
    via Rank-Weighted Average Treatment Effects`. https://arxiv.org/abs/2111.07966

    If the true treatment effect is available, `treatment_effect_col` should be
    provided; otherwise both `outcome_col` and `treatment_col` should be provided.

    Args:
        df (pandas.DataFrame): a data frame with model estimates and actual data as columns
        outcome_col (str, optional): the column name for the actual outcome
        treatment_col (str, optional): the column name for the treatment indicator (0 or 1)
        treatment_effect_col (str, optional): the column name for the true treatment effect
        weighting (str, optional): the weighting scheme for the RATE integral.
            One of ``"autoc"`` (default) or ``"qini"``.
        normalize (bool, optional): whether to normalize the TOC curve before scoring

    Returns:
        (pandas.Series): RATE scores of model estimates
    """
    assert weighting in (
        "autoc",
        "qini",
    ), "{} weighting is not implemented. Select one of {}".format(
        weighting, ("autoc", "qini")
    )

    toc = get_toc(
        df,
        outcome_col=outcome_col,
        treatment_col=treatment_col,
        treatment_effect_col=treatment_effect_col,
        normalize=normalize,
    )

    quantiles = toc.index.values  # includes 0 and 1

    # Midpoint rule on the discrete quantile grid.
    q_mid = (quantiles[:-1] + quantiles[1:]) / 2
    toc_mid = (toc.iloc[:-1].values + toc.iloc[1:].values) / 2

    if weighting == "autoc":
        # BUGFIX: AUTOC is the *unweighted* integral of TOC (alpha(q) = 1) in
        # Yadlowsky et al. (2021) and in grf's rank_average_treatment_effect;
        # the previous alpha(q) = 1/q weighting matched neither AUTOC nor Qini
        # as defined in the cited paper.
        weights = np.ones_like(q_mid)
    else:
        weights = q_mid

    # Normalize weights so they sum to 1 over the integration domain.
    weights = weights / weights.sum()

    rate = pd.Series(
        np.average(toc_mid, axis=0, weights=weights),
        index=toc.columns,
    )
    rate.name = "RATE ({})".format(weighting)
    return rate
For the latter, both + `outcome_col` and `treatment_col` should be provided. + + Args: + df (pandas.DataFrame): a data frame with model estimates and actual data as columns + outcome_col (str, optional): the column name for the actual outcome + treatment_col (str, optional): the column name for the treatment indicator (0 or 1) + treatment_effect_col (str, optional): the column name for the true treatment effect + normalize (bool, optional): whether to normalize the TOC curve by its maximum + absolute value before plotting + n (int, optional): the number of samples to be used for plotting + figsize (tuple, optional): the size of the figure to plot + ax (plt.Axes, optional): an existing axes object to draw on + + Returns: + (plt.Axes): the matplotlib Axes with the TOC plot + """ + toc = get_toc( + df, + outcome_col=outcome_col, + treatment_col=treatment_col, + treatment_effect_col=treatment_effect_col, + normalize=normalize, + ) + + if (n is not None) and (n < toc.shape[0]): + toc = toc.iloc[np.linspace(0, len(toc) - 1, n, endpoint=True).astype(int)] + + if ax is None: + _, ax = plt.subplots(figsize=figsize) + + ax = toc.plot(ax=ax) + + # Random baseline (TOC = 0 everywhere) + ax.plot( + [toc.index[0], toc.index[-1]], + [0, 0], + label=RANDOM_COL, + color="k", + linestyle="--", + ) + ax.legend() + ax.set_xlabel("Fraction treated (q)") + ax.set_ylabel("TOC(q)") + ax.set_title("Targeting Operator Characteristic (TOC)") + + return ax diff --git a/tests/test_rate.py b/tests/test_rate.py new file mode 100644 index 00000000..30585be0 --- /dev/null +++ b/tests/test_rate.py @@ -0,0 +1,161 @@ +from matplotlib import pyplot as plt +import numpy as np +import pandas as pd +import pytest + +from causalml.metrics.rate import get_toc, rate_score, plot_toc + +# Use project-wide constant instead of hardcoded seed +try: + from tests.const import RANDOM_SEED +except ImportError: + RANDOM_SEED = 42 + + +@pytest.fixture +def synthetic_df(): + rng = np.random.default_rng(RANDOM_SEED) + n 
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pytest

from causalml.metrics.rate import get_toc, rate_score, plot_toc

# Use project-wide constant instead of hardcoded seed
try:
    from tests.const import RANDOM_SEED
except ImportError:
    RANDOM_SEED = 42


@pytest.fixture
def synthetic_df():
    """Synthetic data with a known effect column and two candidate scores."""
    gen = np.random.default_rng(RANDOM_SEED)
    size = 500
    # Draws happen in the same order as the RNG stream expects, so the
    # fixture data is reproducible across refactors.
    true_effect = gen.normal(1.0, 0.5, size)
    outcome = gen.normal(0, 1, size)
    assignment = gen.integers(0, 2, size)
    noise_score = gen.normal(0, 1, size)
    return pd.DataFrame(
        {
            "y": outcome,
            "w": assignment,
            "tau": true_effect,
            "perfect_model": true_effect,
            "random_model": noise_score,
        }
    )


@pytest.fixture
def rct_df():
    """Randomized-trial style data where only outcomes are observed."""
    gen = np.random.default_rng(RANDOM_SEED)
    size = 2000
    true_effect = gen.uniform(0, 2, size)
    assignment = gen.integers(0, 2, size)
    outcome = true_effect * assignment + gen.normal(0, 1, size)
    return pd.DataFrame(
        {
            "y": outcome,
            "w": assignment,
            "score": true_effect + gen.normal(0, 0.1, size),
        }
    )


def test_get_toc_errors_on_nan():
    frame = pd.DataFrame(
        [[0, np.nan, 0.5], [1, np.nan, 0.1], [1, 1, 0.4], [0, 1, 0.3], [1, 1, 0.2]],
        columns=["w", "y", "pred"],
    )
    with pytest.raises(AssertionError):
        get_toc(frame)


def test_get_toc_returns_dataframe(synthetic_df):
    result = get_toc(synthetic_df, treatment_effect_col="tau")
    assert isinstance(result, pd.DataFrame)


def test_get_toc_index_bounds(synthetic_df):
    result = get_toc(synthetic_df, treatment_effect_col="tau")
    assert result.index[0] == 0.0
    assert result.index[-1] == 1.0


def test_get_toc_starts_at_zero(synthetic_df):
    result = get_toc(synthetic_df, treatment_effect_col="tau")
    assert np.allclose(result.iloc[0].values, 0.0)


def test_get_toc_ends_at_zero(synthetic_df):
    # TOC(1) = 0 by definition: subset ATE over full population equals overall ATE
    result = get_toc(synthetic_df, treatment_effect_col="tau")
    assert np.allclose(result.iloc[-1].values, 0.0, atol=1e-10)


def test_get_toc_model_cols_present(synthetic_df):
    result = get_toc(synthetic_df, treatment_effect_col="tau")
    for model in ("perfect_model", "random_model"):
        assert model in result.columns


def test_get_toc_perfect_model_positive_at_low_q(synthetic_df):
    result = get_toc(synthetic_df, treatment_effect_col="tau")
    top_slice = result.loc[result.index <= 0.15, "perfect_model"]
    assert top_slice.mean() > 0


def test_get_toc_normalize_no_inf_or_nan(synthetic_df):
    # normalize=True previously divided by TOC(1) which is always zero —
    # now uses max(|TOC|) to avoid inf/NaN
    result = get_toc(synthetic_df, treatment_effect_col="tau", normalize=True)
    assert not result.isnull().any().any()
    assert np.isfinite(result.values).all()


def test_get_toc_normalize_max_abs_is_one(synthetic_df):
    result = get_toc(synthetic_df, treatment_effect_col="tau", normalize=True)
    assert np.allclose(result.abs().max(), 1.0)


def test_get_toc_observed_outcomes(rct_df):
    result = get_toc(rct_df, outcome_col="y", treatment_col="w")
    assert isinstance(result, pd.DataFrame)
    assert "score" in result.columns


def test_rate_score_returns_series(synthetic_df):
    assert isinstance(rate_score(synthetic_df, treatment_effect_col="tau"), pd.Series)


def test_rate_score_autoc_perfect_model_positive(synthetic_df):
    scores = rate_score(synthetic_df, treatment_effect_col="tau", weighting="autoc")
    assert scores["perfect_model"] > 0


def test_rate_score_qini_perfect_model_positive(synthetic_df):
    scores = rate_score(synthetic_df, treatment_effect_col="tau", weighting="qini")
    assert scores["perfect_model"] > 0


def test_rate_score_perfect_beats_random(synthetic_df):
    scores = rate_score(synthetic_df, treatment_effect_col="tau", weighting="autoc")
    assert scores["perfect_model"] > scores["random_model"]


def test_rate_score_invalid_weighting_raises(synthetic_df):
    with pytest.raises(AssertionError):
        rate_score(synthetic_df, treatment_effect_col="tau", weighting="invalid")


def test_rate_score_constant_tau_near_zero():
    gen = np.random.default_rng(RANDOM_SEED)
    size = 2000
    frame = pd.DataFrame(
        {
            "tau": np.ones(size),
            "random_score": gen.normal(0, 1, size),
        }
    )
    scores = rate_score(frame, treatment_effect_col="tau", weighting="autoc")
    assert abs(scores["random_score"]) < 0.2


def test_rate_score_all_finite(synthetic_df):
    scores = rate_score(synthetic_df, treatment_effect_col="tau")
    assert np.isfinite(scores.values).all()


def test_rate_score_observed_outcomes(rct_df):
    scores = rate_score(rct_df, outcome_col="y", treatment_col="w")
    assert isinstance(scores, pd.Series)
    assert np.isfinite(scores.values).all()


def test_plot_toc(synthetic_df, monkeypatch):
    monkeypatch.setattr(plt, "show", lambda: None)
    axis = plot_toc(synthetic_df, treatment_effect_col="tau")
    assert isinstance(axis, plt.Axes)