Marketing Case Study: Email Campaign Uplift¶
Scenario: An e-commerce company sends promotional emails to customers. Not all customers respond equally — some are "persuadables" (they buy only when emailed) while others would buy anyway. We want to identify and target the persuadables in real time as customers arrive.
Problem: Treatment (email = 1 / no email = 0) is partially confounded by customer engagement score. We use OnlineCML to estimate individual uplift and rank customers by predicted CATE.
Methods compared: OnlineIPW, OnlineAIPW, OnlineTLearner, OnlineRLearner
In [1]:
Copied!
import math
import matplotlib
import matplotlib.pyplot as plt
from river.linear_model import LinearRegression
from onlinecml.datasets import HeterogeneousCausalStream
from onlinecml.reweighting import OnlineIPW, OnlineAIPW
from onlinecml.metalearners import OnlineTLearner, OnlineRLearner
from onlinecml.diagnostics import OnlineSMD, OverlapChecker
from onlinecml.evaluation import progressive_causal_score
from onlinecml.evaluation.metrics import ATEError, UpliftAUC
import math
import matplotlib
import matplotlib.pyplot as plt
from river.linear_model import LinearRegression
from onlinecml.datasets import HeterogeneousCausalStream
from onlinecml.reweighting import OnlineIPW, OnlineAIPW
from onlinecml.metalearners import OnlineTLearner, OnlineRLearner
from onlinecml.diagnostics import OnlineSMD, OverlapChecker
from onlinecml.evaluation import progressive_causal_score
from onlinecml.evaluation.metrics import ATEError, UpliftAUC
1. Simulate the campaign stream¶
We use HeterogeneousCausalStream to simulate customers with varying
engagement features and heterogeneous treatment effects.
In [2]:
Copied!
# Simulate 5000 customers arriving over time.
# true_ate = 0.08 (8% average lift in conversion);
# heterogeneity = "nonlinear" means some customers respond far more than others.
# seed is fixed so every stream below replays the identical customer sequence.
STREAM_PARAMS = dict(
    n=5000,
    n_features=5,
    true_ate=0.08,
    heterogeneity="nonlinear",
    confounding_strength=0.6,
    seed=42,
)


def _run_progressive(model):
    """Progressively score *model* on a fresh copy of the campaign stream."""
    return progressive_causal_score(
        stream=HeterogeneousCausalStream(**STREAM_PARAMS),
        model=model,
        metrics=[ATEError(), UpliftAUC()],
        step=500,
    )


# Progressive evaluation: how well do the models rank customers by uplift?
results_t = _run_progressive(
    OnlineTLearner(treated_model=LinearRegression(), control_model=LinearRegression())
)
results_r = _run_progressive(OnlineRLearner(cate_model=LinearRegression()))

# Report the final (end-of-stream) value of each metric.
print(f"{'Method':>12} | {'Final ATEError':>14} | {'Final UpliftAUC':>15}")
print("-" * 48)
print(f"{'T-Learner':>12} | {results_t['ATEError'][-1]:>14.4f} | {results_t['UpliftAUC'][-1]:>15.4f}")
print(f"{'R-Learner':>12} | {results_r['ATEError'][-1]:>14.4f} | {results_r['UpliftAUC'][-1]:>15.4f}")
# Simulate 5000 customers arriving over time
# true_ate = 0.08 (8% average lift in conversion)
# heterogeneity = nonlinear (some customers are much more responsive)
# seed=42 is fixed, so both evaluations below replay the identical customer
# sequence — any difference in the results is due to the learners alone.
STREAM_PARAMS = dict(n=5000, n_features=5, true_ate=0.08,
                     heterogeneity="nonlinear", confounding_strength=0.6, seed=42)
# Progressive evaluation: how well do the models rank customers by uplift?
# T-Learner: separate outcome models for the treated and control arms.
results_t = progressive_causal_score(
    stream = HeterogeneousCausalStream(**STREAM_PARAMS),
    model = OnlineTLearner(treated_model=LinearRegression(), control_model=LinearRegression()),
    metrics = [ATEError(), UpliftAUC()],
    step = 500,  # record a metric snapshot every 500 customers
)
# R-Learner: a single model fitted to the CATE directly.
results_r = progressive_causal_score(
    stream = HeterogeneousCausalStream(**STREAM_PARAMS),
    model = OnlineRLearner(cate_model=LinearRegression()),
    metrics = [ATEError(), UpliftAUC()],
    step = 500,
)
# Report the final (end-of-stream) value of each metric per method.
print(f"{'Method':>12} | {'Final ATEError':>14} | {'Final UpliftAUC':>15}")
print("-" * 48)
print(f"{'T-Learner':>12} | {results_t['ATEError'][-1]:>14.4f} | {results_t['UpliftAUC'][-1]:>15.4f}")
print(f"{'R-Learner':>12} | {results_r['ATEError'][-1]:>14.4f} | {results_r['UpliftAUC'][-1]:>15.4f}")
Method | Final ATEError | Final UpliftAUC ------------------------------------------------ T-Learner | 0.0101 | 0.1088 R-Learner | 0.0252 | 0.1099
/home/lucifer/pCloudDrive/onlinecml/onlinecml/evaluation/progressive.py:66: UserWarning: treated_model has not seen any data yet; CATE estimate may be biased. cate_hat = model.predict_one(x) /home/lucifer/pCloudDrive/onlinecml/onlinecml/evaluation/progressive.py:66: UserWarning: control_model has not seen any data yet; CATE estimate may be biased. cate_hat = model.predict_one(x)
2. Real-time CATE ranking¶
In [3]:
Copied!
# Train a final R-Learner and rank the last 100 arriving customers by their
# predicted individual uplift. Prequential order: predict BEFORE learning.
model = OnlineRLearner(cate_model=LinearRegression())
last_100 = []  # (predicted CATE, true CATE, features) for customers 4900..4999
for idx, (features, treated, outcome, true_tau) in enumerate(
    HeterogeneousCausalStream(**STREAM_PARAMS)
):
    predicted_tau = model.predict_one(features)
    if idx >= 4900:  # keep only the final 100 customers of the stream
        last_100.append((predicted_tau, true_tau, features))
    model.learn_one(features, treated, outcome)

# Highest predicted CATE first.
last_100.sort(key=lambda entry: entry[0], reverse=True)

print("Top-10 customers by predicted uplift:")
print(f"{'Rank':>4} | {'CATE_hat':>9} | {'True CATE':>10}")
print("-" * 30)
for rank, (predicted_tau, actual_tau, _) in enumerate(last_100[:10], 1):
    print(f"{rank:>4} | {predicted_tau:>9.4f} | {actual_tau:>10.4f}")
# Train a final model and rank the last 100 customers by predicted uplift
model = OnlineRLearner(cate_model=LinearRegression())
last_100 = []  # (predicted CATE, true CATE, features) for customers 4900..4999
for i, (x, w, y, tau) in enumerate(HeterogeneousCausalStream(**STREAM_PARAMS)):
    # Prequential order: predict this customer's uplift BEFORE learning from them.
    cate_hat = model.predict_one(x)
    if i >= 4900:
        last_100.append((cate_hat, tau, x))
    model.learn_one(x, w, y)
# Sort by predicted CATE descending
last_100.sort(key=lambda t: t[0], reverse=True)
print("Top-10 customers by predicted uplift:")
print(f"{'Rank':>4} | {'CATE_hat':>9} | {'True CATE':>10}")
print("-" * 30)
for rank, (cate_hat, true_cate, _) in enumerate(last_100[:10], 1):
    print(f"{rank:>4} | {cate_hat:>9.4f} | {true_cate:>10.4f}")
Top-10 customers by predicted uplift: Rank | CATE_hat | True CATE ------------------------------ 1 | 2.6157 | 2.6548 2 | 2.2389 | 2.2985 3 | 1.9188 | 2.3004 4 | 1.9085 | 2.4400 5 | 1.7700 | 1.6502 6 | 1.7223 | 1.9445 7 | 1.4334 | 1.5938 8 | 1.3852 | 1.6978 9 | 1.2568 | 1.5612 10 | 1.2492 | 1.7730
3. Balance diagnostics¶
In [4]:
Copied!
# Fit an online IPW estimator while monitoring covariate balance (weighted
# SMD) and propensity-score overlap on the same single pass over the stream.
ipw = OnlineIPW()
smd = OnlineSMD(covariates=[f"x{i}" for i in range(5)])
overlap = OverlapChecker(ps_min=0.05, ps_max=0.95)
for x, w, y, _ in HeterogeneousCausalStream(**STREAM_PARAMS):
    ps = ipw.ps_model.predict_one(x)
    # Clip the propensity only for weighting: before the ps model has seen
    # data (or at extreme scores) ps can hit 0 or 1, which would divide by
    # zero / explode the IPW weights below.
    ps_w = min(max(ps, 1e-3), 1.0 - 1e-3)
    weight = 1.0 / ps_w if w == 1 else 1.0 / (1.0 - ps_w)
    smd.update(x, treatment=w, weight=weight)
    # The overlap diagnostic sees the raw (unclipped) propensity so that
    # extreme scores are still flagged.
    overlap.update(ps, treatment=w)
    ipw.learn_one(x, w, y)
print(f"Overlap adequate : {overlap.is_overlap_adequate()}")
print(f"Flag rate : {overlap.report()['flag_rate']:.1%}")
print(f"Balance adequate : {smd.is_balanced()}")
print(f"\nIPW ATE : {ipw.predict_ate():.4f}")
print("True ATE : ~0.0800")  # simulated ground truth (true_ate in STREAM_PARAMS)
# Fit an online IPW estimator while monitoring covariate balance (weighted
# SMD) and propensity-score overlap on the same single pass over the stream.
ipw = OnlineIPW()
smd = OnlineSMD(covariates=[f"x{i}" for i in range(5)])
overlap = OverlapChecker(ps_min=0.05, ps_max=0.95)
for x, w, y, _ in HeterogeneousCausalStream(**STREAM_PARAMS):
    ps = ipw.ps_model.predict_one(x)
    # Clip the propensity only for weighting: before the ps model has seen
    # data (or at extreme scores) ps can hit 0 or 1, which would divide by
    # zero / explode the IPW weights below.
    ps_w = min(max(ps, 1e-3), 1.0 - 1e-3)
    weight = 1.0 / ps_w if w == 1 else 1.0 / (1.0 - ps_w)
    smd.update(x, treatment=w, weight=weight)
    # The overlap diagnostic sees the raw (unclipped) propensity so that
    # extreme scores are still flagged.
    overlap.update(ps, treatment=w)
    ipw.learn_one(x, w, y)
print(f"Overlap adequate : {overlap.is_overlap_adequate()}")
print(f"Flag rate : {overlap.report()['flag_rate']:.1%}")
print(f"Balance adequate : {smd.is_balanced()}")
print(f"\nIPW ATE : {ipw.predict_ate():.4f}")
print("True ATE : ~0.0800")  # simulated ground truth (true_ate in STREAM_PARAMS)
Overlap adequate : True Flag rate : 0.0% Balance adequate : True IPW ATE : 0.1685 True ATE : ~0.0800