# Page 2 (page-break artifact from document extraction; kept as a comment)
"""
Comprehensive Stress Testing Framework for Oaxaca-Blinder Decomposition
This module provides targeted stress tests for specific scenarios, with organized output
for debugging, critical data points, and business narratives.
Stress Test Categories:
======================
1. SIMPSON'S PARADOX DETECTION
- True Positive: Clear paradox cases that should be detected
- True Negative: No paradox cases that should not trigger false alarms
- Edge Cases: Borderline cases to test sensitivity
2. BUSINESS CONCLUSION LOGIC
- Pure Composition-driven cases
- Pure Performance-driven cases
- Mixed/Balanced cases
- Threshold edge cases
3. MATHEMATICAL ACCURACY
- Perfect decomposition validation
- Edge cases (zero denominators, extreme values)
- Different baseline types consistency
4. NARRATIVE GENERATION
- Positive vs negative performance gaps
- Different gap magnitudes
- Various driver combinations
5. BASELINE ROBUSTNESS
- rest_of_world vs global_average vs top_performer
- Limited data scenarios
"""
import os
import sys
import pandas as pd
sys.path.append(os.path.join(os.path.dirname(__file__), "docs"))
from oaxaca_blinder import (
determine_business_focus,
detect_aggregation_bias,
enhanced_rca_analysis,
oaxaca_blinder_decomposition,
)
class StressTestFramework:
"""Framework for organized stress testing with categorized outputs."""
def __init__(self):
    # Accumulator for per-test results; populated as individual tests run.
    self.test_results = {}
def run_all_stress_tests(self):
    """Execute every stress-test category in its documented order."""
    banner = "=" * 80
    print(banner)
    print("OAXACA-BLINDER STRESS TESTING FRAMEWORK")
    print(banner)
    # Categories 1-5, run sequentially so output sections stay grouped.
    for category in (
        self._test_simpson_paradox_detection,
        self._test_business_conclusion_logic,
        self._test_mathematical_accuracy,
        self._test_narrative_generation,
        self._test_baseline_robustness,
    ):
        category()
def _test_simpson_paradox_detection(self):
    """Test Simpson's paradox detection with targeted scenarios."""
    header = "=" * 60
    print("\n" + header)
    print("STRESS TEST 1: SIMPSON'S PARADOX DETECTION")
    print(header)
    # 1A: a clear paradox that the detector must flag.
    print("\n--- Test 1A: TRUE POSITIVE (Should detect paradox) ---")
    self._run_paradox_test(
        "TRUE_POSITIVE",
        self._create_true_positive_paradox_data(),
        expected_paradox=True,
    )
    # 1B: consistent data that must not raise a false alarm.
    print("\n--- Test 1B: TRUE NEGATIVE (Should NOT detect paradox) ---")
    self._run_paradox_test(
        "TRUE_NEGATIVE",
        self._create_true_negative_paradox_data(),
        expected_paradox=False,
    )
    # 1C: borderline data; no fixed expectation, just observe sensitivity.
    print("\n--- Test 1C: EDGE CASE (Borderline sensitivity test) ---")
    self._run_paradox_test(
        "EDGE_CASE", self._create_edge_case_paradox_data(), expected_paradox=None
    )
    # 1D: paradox at the region level rather than within subcategories.
    print("\n--- Test 1D: REGION-LEVEL PARADOX (Region vs Region comparison) ---")
    self._run_region_paradox_test(
        "REGION_LEVEL_PARADOX",
        self._create_region_level_paradox_data(),
        expected_paradox=True,
    )
def _test_business_conclusion_logic(self):
    """Test business conclusion logic with clear cases."""
    header = "=" * 60
    print("\n" + header)
    print("STRESS TEST 2: BUSINESS CONCLUSION LOGIC")
    print(header)
    # 2A-2C: decomposition-driven conclusions with known expectations.
    # (2C now expects a decisive conclusion rather than "mixed".)
    cases = (
        (
            "\n--- Test 2A: PURE COMPOSITION-DRIVEN ---",
            "PURE_COMPOSITION",
            self._create_pure_composition_data,
            "composition_driven",
        ),
        (
            "\n--- Test 2B: PURE PERFORMANCE-DRIVEN ---",
            "PURE_PERFORMANCE",
            self._create_pure_performance_data,
            "performance_driven",
        ),
        (
            "\n--- Test 2C: MIXED/BALANCED ---",
            "MIXED_BALANCED",
            self._create_mixed_balanced_data,
            "performance_driven",
        ),
    )
    for banner, name, factory, expected in cases:
        print(banner)
        self._run_conclusion_test(name, factory(), expected_conclusion=expected)
    # 2D: identical subcategory rates (real-world cancellation edge case).
    print("\n--- Test 2D: IDENTICAL RATES SCENARIO ---")
    self._run_identical_rates_test(
        "IDENTICAL_RATES", self._create_identical_rates_data()
    )
    # 2E: sensitivity of the decisive-conclusion threshold logic.
    print("\n--- Test 2E: THRESHOLD SENSITIVITY ANALYSIS ---")
    self._run_threshold_sensitivity_analysis()
def _test_mathematical_accuracy(self):
    """Test mathematical accuracy and validation."""
    divider = "=" * 60
    print("\n" + divider)
    print("STRESS TEST 3: MATHEMATICAL ACCURACY")
    print(divider)
    # 3A: the decomposition must reproduce the actual gaps exactly.
    print("\n--- Test 3A: PERFECT DECOMPOSITION VALIDATION ---")
    self._run_validation_test(
        "PERFECT_VALIDATION", self._create_perfect_validation_data()
    )
    # 3B: near-0%/near-100% rates probe numerical edge cases.
    print("\n--- Test 3B: EXTREME VALUES EDGE CASES ---")
    self._run_validation_test("EXTREME_VALUES", self._create_extreme_values_data())
def _test_narrative_generation(self):
    """Test narrative generation quality."""
    divider = "=" * 60
    print("\n" + divider)
    print("STRESS TEST 4: NARRATIVE GENERATION")
    print(divider)
    # 4A: narrative wording for a region far above its baseline.
    print("\n--- Test 4A: LARGE POSITIVE GAP NARRATIVE ---")
    self._run_narrative_test("LARGE_POSITIVE", self._create_large_positive_gap_data())
    # 4B: narrative wording for a region far below its baseline.
    print("\n--- Test 4B: LARGE NEGATIVE GAP NARRATIVE ---")
    self._run_narrative_test("LARGE_NEGATIVE", self._create_large_negative_gap_data())
def _test_baseline_robustness(self):
    """Test different baseline types for consistency."""
    divider = "=" * 60
    print("\n" + divider)
    print("STRESS TEST 5: BASELINE ROBUSTNESS")
    print(divider)
    # 5A: the same data run under each supported baseline strategy.
    print("\n--- Test 5A: BASELINE CONSISTENCY TEST ---")
    self._run_baseline_consistency_test(
        "BASELINE_CONSISTENCY", self._create_baseline_consistency_data()
    )
# =================================================================
# DATA CREATION METHODS - Each creates targeted test data
# =================================================================
def _create_true_positive_paradox_data(self):
"""Create data that SHOULD trigger Simpson's paradox detection."""
# Region X underperforms overall but outperforms in every subcategory
# This is a classic Simpson's paradox case
data = [
# Region X: 30% overall (bad) but beats baseline in both subcategories
{
"region": "X",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 45,
"denominator": 100,
}, # 45% vs baseline 40%
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 225,
"denominator": 900,
}, # 25% vs baseline 20%
# Baseline regions: 35% overall (good) but worse in each subcategory
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 360,
"denominator": 900,
}, # 40%
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 20,
"denominator": 100,
}, # 20%
{
"region": "Z",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 360,
"denominator": 900,
}, # 40%
{
"region": "Z",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 20,
"denominator": 100,
}, # 20%
]
return pd.DataFrame(data)
def _create_true_negative_paradox_data(self):
"""Create data that should NOT trigger paradox detection."""
# Consistent performance across all levels
data = [
# Region X: Consistently underperforms at all levels
{
"region": "X",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 35,
"denominator": 100,
}, # 35% vs 40%
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 150,
"denominator": 900,
}, # 17% vs 20%
# Baseline regions: Consistently better
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 400,
"denominator": 1000,
}, # 40%
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 200,
"denominator": 1000,
}, # 20%
{
"region": "Z",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 400,
"denominator": 1000,
}, # 40%
{
"region": "Z",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 200,
"denominator": 1000,
}, # 20%
]
return pd.DataFrame(data)
def _create_edge_case_paradox_data(self):
"""Create borderline case to test sensitivity."""
# Marginal differences that might or might not trigger detection
data = [
# Region X: Slightly mixed signals
{
"region": "X",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 41,
"denominator": 100,
}, # 41% vs 40%
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 190,
"denominator": 900,
}, # 21% vs 20%
# Baseline regions
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 360,
"denominator": 900,
}, # 40%
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 20,
"denominator": 100,
}, # 20%
{
"region": "Z",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 360,
"denominator": 900,
}, # 40%
{
"region": "Z",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 20,
"denominator": 100,
}, # 20%
]
return pd.DataFrame(data)
def _create_pure_composition_data(self):
"""Create data that is clearly composition-driven."""
# Same rates, very different mix - but rates must be different to avoid cancellation
data = [
# Region X: Bad mix (over-allocated to low-performing segment)
{
"region": "X",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 60,
"denominator": 100,
}, # 60% rate, 10% mix
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 270,
"denominator": 900,
}, # 30% rate, 90% mix
# Baseline: Good mix (balanced allocation) with same rates
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 300,
"denominator": 500,
}, # 60% rate, 50% mix
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 150,
"denominator": 500,
}, # 30% rate, 50% mix
]
return pd.DataFrame(data)
def _create_pure_performance_data(self):
"""Create data that is clearly performance-driven."""
# Same mix, very different rates
data = [
# Region X: Bad execution across all segments
{
"region": "X",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 300,
"denominator": 1000,
}, # 30% rate, 50% mix
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 100,
"denominator": 1000,
}, # 10% rate, 50% mix
# Baseline: Good execution
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 500,
"denominator": 1000,
}, # 50% rate, 50% mix
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 300,
"denominator": 1000,
}, # 30% rate, 50% mix
]
return pd.DataFrame(data)
def _create_mixed_balanced_data(self):
"""Create data with balanced composition and performance effects."""
data = [
# Region X: Both mix and rate differences that are roughly equal in magnitude
{
"region": "X",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 280,
"denominator": 700,
}, # 40% rate, 70% mix (same rate as baseline, different mix)
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 60,
"denominator": 300,
}, # 20% rate, 30% mix (worse rate than baseline, different mix)
# Baseline: Balanced mix, different rates
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Perf",
"numerator": 200,
"denominator": 500,
}, # 40% rate, 50% mix
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Perf",
"numerator": 150,
"denominator": 500,
}, # 30% rate, 50% mix
]
return pd.DataFrame(data)
def _create_perfect_validation_data(self):
"""Create data for perfect mathematical validation."""
# Simple, clean data for validation testing
data = [
{
"region": "A",
"product": "P1",
"vertical": "V1",
"numerator": 400,
"denominator": 1000,
},
{
"region": "A",
"product": "P1",
"vertical": "V2",
"numerator": 300,
"denominator": 1000,
},
{
"region": "B",
"product": "P1",
"vertical": "V1",
"numerator": 350,
"denominator": 1000,
},
{
"region": "B",
"product": "P1",
"vertical": "V2",
"numerator": 250,
"denominator": 1000,
},
]
return pd.DataFrame(data)
def _create_extreme_values_data(self):
"""Create data with extreme values to test edge cases."""
data = [
# Very high rates
{
"region": "A",
"product": "P1",
"vertical": "V1",
"numerator": 950,
"denominator": 1000,
},
{
"region": "A",
"product": "P1",
"vertical": "V2",
"numerator": 10,
"denominator": 1000,
},
# Very low rates
{
"region": "B",
"product": "P1",
"vertical": "V1",
"numerator": 50,
"denominator": 1000,
},
{
"region": "B",
"product": "P1",
"vertical": "V2",
"numerator": 990,
"denominator": 1000,
},
]
return pd.DataFrame(data)
def _create_large_positive_gap_data(self):
"""Create data with large positive performance gap."""
data = [
{
"region": "WINNER",
"product": "P1",
"vertical": "V1",
"numerator": 800,
"denominator": 1000,
},
{
"region": "WINNER",
"product": "P1",
"vertical": "V2",
"numerator": 700,
"denominator": 1000,
},
{
"region": "BASELINE",
"product": "P1",
"vertical": "V1",
"numerator": 300,
"denominator": 1000,
},
{
"region": "BASELINE",
"product": "P1",
"vertical": "V2",
"numerator": 200,
"denominator": 1000,
},
]
return pd.DataFrame(data)
def _create_large_negative_gap_data(self):
"""Create data with large negative performance gap."""
data = [
{
"region": "UNDERPERFORMER",
"product": "P1",
"vertical": "V1",
"numerator": 100,
"denominator": 1000,
},
{
"region": "UNDERPERFORMER",
"product": "P1",
"vertical": "V2",
"numerator": 50,
"denominator": 1000,
},
{
"region": "BASELINE",
"product": "P1",
"vertical": "V1",
"numerator": 500,
"denominator": 1000,
},
{
"region": "BASELINE",
"product": "P1",
"vertical": "V2",
"numerator": 400,
"denominator": 1000,
},
]
return pd.DataFrame(data)
def _create_baseline_consistency_data(self):
"""Create data for testing baseline consistency."""
data = [
{
"region": "A",
"product": "P1",
"vertical": "V1",
"numerator": 400,
"denominator": 1000,
},
{
"region": "A",
"product": "P1",
"vertical": "V2",
"numerator": 300,
"denominator": 1000,
},
{
"region": "B",
"product": "P1",
"vertical": "V1",
"numerator": 350,
"denominator": 1000,
},
{
"region": "B",
"product": "P1",
"vertical": "V2",
"numerator": 250,
"denominator": 1000,
},
{
"region": "C",
"product": "P1",
"vertical": "V1",
"numerator": 300,
"denominator": 1000,
},
{
"region": "C",
"product": "P1",
"vertical": "V2",
"numerator": 200,
"denominator": 1000,
},
]
return pd.DataFrame(data)
def _create_identical_rates_data(self):
"""Create data where subcategories have identical rates but different mix."""
# This tests the real-world scenario where construct gaps cancel out mathematically
data = [
# Region X: Bad mix (10% high-value, 90% low-value) but identical rates
{
"region": "X",
"product": "Product_A",
"vertical": "High_Value",
"numerator": 50,
"denominator": 100,
}, # 50% rate, 10% mix
{
"region": "X",
"product": "Product_A",
"vertical": "Low_Value",
"numerator": 450,
"denominator": 900,
}, # 50% rate, 90% mix
# Region Y: Good mix (50% each) with identical rates
{
"region": "Y",
"product": "Product_A",
"vertical": "High_Value",
"numerator": 250,
"denominator": 500,
}, # 50% rate, 50% mix
{
"region": "Y",
"product": "Product_A",
"vertical": "Low_Value",
"numerator": 250,
"denominator": 500,
}, # 50% rate, 50% mix
]
return pd.DataFrame(data)
def _create_region_level_paradox_data(self):
"""Create data for region-level Simpson's paradox test case.
Region X appears to underperform overall (13.6% vs 20.4%) but actually
outperforms in every product due to unfortunate product mix.
"""
data = [
# Region X: 13.6% overall but outperforms in both products
# Product_High: 24.0% rate, 20% mix (200 volume)
{
"region": "X",
"product": "Product_High",
"numerator": 48, # 24.0% rate
"denominator": 200, # 20% of 1000 total
},
# Product_Low: 11.0% rate, 80% mix (800 volume)
{
"region": "X",
"product": "Product_Low",
"numerator": 88, # 11.0% rate
"denominator": 800, # 80% of 1000 total
},
# Region Y: 20.4% overall but underperforms in both products
# Product_High: 23.0% rate, 80% mix (800 volume)
{
"region": "Y",
"product": "Product_High",
"numerator": 184, # 23.0% rate
"denominator": 800, # 80% of 1000 total
},
# Product_Low: 10.0% rate, 20% mix (200 volume)
{
"region": "Y",
"product": "Product_Low",
"numerator": 20, # 10.0% rate
"denominator": 200, # 20% of 1000 total
},
]
return pd.DataFrame(data)
def _run_identical_rates_test(self, test_name, df):
    """Test the identical rates scenario and its business implications.

    Args:
        test_name: Label used to tag every printed output section.
        df: Data with region/product/vertical/numerator/denominator columns
            where segment rates match but the volume mix differs.
    """
    # === DEBUG DATA ===
    print(f"\n[DEBUG] {test_name} - Raw Data:")
    for _, row in df.iterrows():
        rate = row["numerator"] / row["denominator"]
        # Mix share = this row's volume over its region's total volume.
        total_denom = df[df["region"] == row["region"]]["denominator"].sum()
        mix_pct = row["denominator"] / total_denom
        print(
            f" {row['region']}-{row['vertical']}: {rate:.1%} rate, {mix_pct:.1%} mix ({row['numerator']}/{row['denominator']})"
        )
    # Calculate overall regional performance
    print(f"\n[DEBUG] Overall Regional Performance:")
    for region in df["region"].unique():
        region_data = df[df["region"] == region]
        total_num = region_data["numerator"].sum()
        total_denom = region_data["denominator"].sum()
        overall_rate = total_num / total_denom
        print(f" {region}: {overall_rate:.1%} overall ({total_num}/{total_denom})")
    # Run decomposition
    results = oaxaca_blinder_decomposition(
        df=df,
        region_column="region",
        numerator_column="numerator",
        denominator_column="denominator",
        category_columns=["product", "vertical"],
    )
    # === CRITICAL DATA POINTS ===
    print(f"\n[CRITICAL] {test_name} Results:")
    for region in df["region"].unique():
        region_results = results[results["region"] == region]
        if len(region_results) == 0:
            continue
        construct_gap = region_results["construct_gap_contribution"].sum()
        performance_gap = region_results["performance_gap_contribution"].sum()
        net_gap = region_results["net_gap"].sum()
        print(f" {region}:")
        print(
            f" Construct Gap: {construct_gap:+.4f} ({construct_gap*100:+.1f}pp)"
        )
        print(
            f" Performance Gap: {performance_gap:+.4f} ({performance_gap*100:+.1f}pp)"
        )
        print(f" Net Gap: {net_gap:+.4f} ({net_gap*100:+.1f}pp)")
        # Check for identical rates scenario: both components ~0 means the
        # effects cancelled mathematically despite real mix differences.
        if abs(construct_gap) < 0.001 and abs(performance_gap) < 0.001:
            print(f" ⚠️ IDENTICAL RATES DETECTED: Mathematical cancellation")
            print(f" 📊 Business Reality: Strategic mix differences exist")
            print(f" 💡 Recommendation: Consider segment value weighting")
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] {test_name}:")
    print(
        f" Identical rates scenario demonstrates mathematical limitations of standard decomposition"
    )
    print(
        f" Business insight: Mix differences may still have strategic importance"
    )
    print(
        f" Recommendation: Supplement with volume-based or value-weighted analysis"
    )
def _run_threshold_sensitivity_analysis(self):
    """Analyze decisive business conclusion logic with various gap scenarios."""
    # Known construct/performance pairs and the conclusion each should yield.
    test_scenarios = [
        {"construct": 0.02, "performance": 0.03, "expected": "performance_driven"},    # 0.67 ratio
        {"construct": 0.03, "performance": 0.02, "expected": "composition_driven"},    # 1.5 ratio
        {"construct": 0.025, "performance": 0.025, "expected": "performance_driven"},  # tie -> performance
        {"construct": 0.02, "performance": 0.025, "expected": "performance_driven"},   # 0.8 ratio
    ]
    print(f"\n[DEBUG] THRESHOLD_SENSITIVITY - Test Scenarios:")
    for idx, scenario in enumerate(test_scenarios, start=1):
        ratio = abs(scenario["construct"]) / abs(scenario["performance"])
        print(f" Scenario {idx}: Construct={scenario['construct']:.3f}, Performance={scenario['performance']:.3f}, Ratio={ratio:.2f}")
    print(f"\n[CRITICAL] THRESHOLD_SENSITIVITY Results:")
    print(f" {'Scenario':<12} {'Ratio':<8} {'Expected':<20} {'Actual':<20} {'Result':<10}")
    print(f" {'-'*12} {'-'*8} {'-'*20} {'-'*20} {'-'*10}")
    all_passed = True
    for idx, scenario in enumerate(test_scenarios, start=1):
        construct = scenario["construct"]
        performance = scenario["performance"]
        ratio = abs(construct) / abs(performance)
        # Exercise the real production conclusion function.
        actual = determine_business_focus(construct, performance)["business_conclusion"]
        passed = actual == scenario["expected"]
        all_passed = all_passed and passed
        verdict = "✅ PASS" if passed else "❌ FAIL"
        print(f" Scenario {idx:<4} {ratio:<8.2f} {scenario['expected']:<20} {actual:<20} {verdict:<10}")
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] THRESHOLD_SENSITIVITY:")
    print(f" Decisive logic provides clear, actionable business guidance")
    print(f" Always focuses on the larger absolute contributor (construct vs performance)")
    print(f" Eliminates ambiguous 'mixed' recommendations for better decision-making")
    print(f" Test Result: {'✅ ALL PASS' if all_passed else '❌ SOME FAILURES'}")
# =================================================================
# TEST EXECUTION METHODS - Organized output structure
# =================================================================
def _run_paradox_test(self, test_name, df, expected_paradox):
    """Run paradox detection test with organized output."""
    # === DEBUG DATA (for reasoning) ===
    print(f"\n[DEBUG] {test_name} - Raw Data:")
    for _, row in df.iterrows():
        rate = row["numerator"] / row["denominator"]
        print(
            f" {row['region']}-{row['product']}-{row['vertical']}: {rate:.1%} ({row['numerator']}/{row['denominator']})"
        )
    # Run paradox detection
    paradox_results = detect_aggregation_bias(
        df=df,
        region_column="region",
        numerator_column="numerator",
        denominator_column="denominator",
        category_columns=["product"],
        subcategory_columns=["vertical"],
        top_n_focus=1,
    )
    # === CRITICAL DATA POINTS (for conclusion) ===
    paradox_detected = len(paradox_results) > 0
    # expected_paradox=None means "observe only"; any outcome passes.
    passed = expected_paradox is None or paradox_detected == expected_paradox
    print(f"\n[CRITICAL] {test_name} Results:")
    print(f" Expected Paradox: {expected_paradox}")
    print(f" Detected Paradox: {paradox_detected}")
    print(f" Test Result: {'✅ PASS' if passed else '❌ FAIL'}")
    if paradox_detected:
        for _, hit in paradox_results.iterrows():
            print(f" Paradox Details: {hit['region']}-{hit['category']}")
            print(
                f" Aggregate → Detailed: {hit['aggregate_recommendation']} → {hit['subcategory_recommendation']}"
            )
            print(f" Business Risk: {hit['gap_significance']}")
    # === NARRATIVE (for business users) ===
    print(f"\n[NARRATIVE] {test_name}:")
    if paradox_detected:
        print(
            f" Simpson's paradox detected - aggregate analysis would mislead business decisions"
        )
    else:
        print(
            f" No paradox detected - analysis is reliable for business decisions"
        )
def _run_conclusion_test(self, test_name, df, expected_conclusion):
    """Run business conclusion test with organized output."""
    # Decompose the gap, then compare the derived conclusion to expectation.
    results = oaxaca_blinder_decomposition(
        df=df,
        region_column="region",
        numerator_column="numerator",
        denominator_column="denominator",
        category_columns=["product", "vertical"],
    )
    # Only the first region's decomposition is evaluated.
    focus_region = results["region"].iloc[0]
    focus_rows = results[results["region"] == focus_region]
    construct_gap = focus_rows["construct_gap_contribution"].sum()
    performance_gap = focus_rows["performance_gap_contribution"].sum()
    # === DEBUG DATA ===
    print(f"\n[DEBUG] {test_name} - Gap Components:")
    print(f" Construct Gap: {construct_gap:+.4f} ({construct_gap*100:+.1f}pp)")
    print(
        f" Performance Gap: {performance_gap:+.4f} ({performance_gap*100:+.1f}pp)"
    )
    # Guard against a zero performance component before dividing.
    if performance_gap == 0:
        ratio_str = "inf"
    else:
        ratio_value = abs(construct_gap) / abs(performance_gap)
        ratio_str = "inf" if ratio_value == float("inf") else f"{ratio_value:.2f}"
    print(f" Ratio: |Construct|/|Performance| = {ratio_str}")
    # === CRITICAL DATA POINTS ===
    verdict = determine_business_focus(construct_gap, performance_gap)
    actual_conclusion = verdict["business_conclusion"]
    passed = actual_conclusion == expected_conclusion
    print(f"\n[CRITICAL] {test_name} Results:")
    print(f" Expected Conclusion: {expected_conclusion}")
    print(f" Actual Conclusion: {actual_conclusion}")
    print(f" Test Result: {'✅ PASS' if passed else '❌ FAIL'}")
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] {test_name}:")
    if actual_conclusion == "composition_driven":
        print(
            f" Focus on strategic allocation adjustments - mix optimization needed"
        )
    elif actual_conclusion == "performance_driven":
        print(f" Focus on operational improvements - execution enhancement needed")
    else:
        print(
            f" Balanced approach needed - both allocation and execution improvements"
        )
def _run_validation_test(self, test_name, df):
    """Run mathematical validation test."""
    # Ground truth: each region's rate gap versus the rest of the world.
    actual_gaps = {}
    for region in df["region"].unique():
        in_region = df[df["region"] == region]
        out_region = df[df["region"] != region]
        own_rate = in_region["numerator"].sum() / in_region["denominator"].sum()
        rest_rate = out_region["numerator"].sum() / out_region["denominator"].sum()
        actual_gaps[region] = own_rate - rest_rate
    # Run decomposition
    results = oaxaca_blinder_decomposition(
        df=df,
        region_column="region",
        numerator_column="numerator",
        denominator_column="denominator",
        category_columns=["product", "vertical"],
    )
    # === DEBUG DATA ===
    print(f"\n[DEBUG] {test_name} - Validation Details:")
    # === CRITICAL DATA POINTS ===
    print(f"\n[CRITICAL] {test_name} Validation:")
    errors = []
    for region, truth in actual_gaps.items():
        decomposed_gap = results[results["region"] == region]["net_gap"].sum()
        error = abs(truth - decomposed_gap)
        errors.append(error)
        print(
            f" {region}: Actual={truth:+.4f}, Decomposed={decomposed_gap:+.4f}, Error={error:.6f}"
        )
    max_error = max(errors, default=0)
    print(f" Maximum Error: {max_error:.6f}")
    print(f" Test Result: {'✅ PASS' if max_error < 1e-10 else '❌ FAIL'}")
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] {test_name}:")
    if max_error < 1e-10:
        print(f" Mathematical decomposition is perfectly accurate")
    else:
        print(f" Mathematical accuracy issue detected - needs investigation")
def _run_narrative_test(self, test_name, df):
    """Run narrative generation test.

    Prints each region's gap, conclusion, and generated business narrative
    from the enhanced RCA output.

    Args:
        test_name: Label used to tag every printed output section.
        df: Data with region/product/vertical/numerator/denominator columns.
    """
    # Run enhanced RCA
    results = enhanced_rca_analysis(
        df=df,
        region_column="region",
        numerator_column="numerator",
        denominator_column="denominator",
        category_columns=["product"],
        subcategory_columns=["vertical"],
    )
    # === DEBUG DATA ===
    print(f"\n[DEBUG] {test_name} - Analysis Components:")
    # === CRITICAL DATA POINTS ===
    # Fix: removed the unused `gap_magnitude` local that was computed per
    # region but never referenced.
    print(f"\n[CRITICAL] {test_name} Narrative Quality:")
    for region, analysis in results["regional_analysis"].items():
        print(
            f" {region}: Gap={analysis['total_gap']:+.1%}, Conclusion={analysis['business_conclusion']}"
        )
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] {test_name}:")
    for region, analysis in results["regional_analysis"].items():
        print(f" {region}: {analysis['business_narrative']}")
def _run_region_paradox_test(self, test_name, df, expected_paradox):
    """Run region-level Simpson's paradox test with organized output.

    Args:
        test_name: Label used to tag every printed output section.
        df: Region/product-level data (no vertical column).
        expected_paradox: True/False expectation, or None for observe-only.
    """
    # === DEBUG DATA (validate the test case setup) ===
    print(f"\n[DEBUG] {test_name} - Raw Data:")
    # Calculate overall regional rates
    overall_rates = {}
    for region in df["region"].unique():
        region_data = df[df["region"] == region]
        total_num = region_data["numerator"].sum()
        total_denom = region_data["denominator"].sum()
        overall_rate = total_num / total_denom
        overall_rates[region] = overall_rate
        print(f" {region}: {overall_rate:.1%} overall ({total_num}/{total_denom})")
    # Calculate product-level rates
    print(f"\n[DEBUG] Product-Level Rates:")
    product_rates = {}
    for region in df["region"].unique():
        product_rates[region] = {}
        for product in df["product"].unique():
            product_data = df[(df["region"] == region) & (df["product"] == product)]
            if len(product_data) > 0:
                # NOTE(review): assumes one row per (region, product) pair --
                # only the first matching row is used; confirm upstream data.
                rate = product_data["numerator"].iloc[0] / product_data["denominator"].iloc[0]
                mix = product_data["denominator"].iloc[0] / df[df["region"] == region]["denominator"].sum()
                product_rates[region][product] = rate
                print(f" {region}-{product}: {rate:.1%} rate, {mix:.0%} mix")
    # Run region-level paradox detection
    paradox_results = detect_aggregation_bias(
        df=df,
        region_column="region",
        numerator_column="numerator",
        denominator_column="denominator",
        category_columns=None,  # Region-level detection
        subcategory_columns=["product"],
    )
    # === CRITICAL DATA POINTS ===
    paradox_detected = len(paradox_results) > 0
    print(f"\n[CRITICAL] {test_name} Results:")
    print(f" Expected Paradox: {expected_paradox}")
    print(f" Detected Paradox: {paradox_detected}")
    # Validate the paradox logic: independent sanity check of the fixture --
    # X must look worse overall while beating Y in every single product.
    if "X" in overall_rates and "Y" in overall_rates:
        overall_gap = overall_rates["X"] - overall_rates["Y"]
        print(f" Overall Gap: X vs Y = {overall_gap:+.1%} (X appears {'better' if overall_gap > 0 else 'worse'})")
        # Check product-level comparisons
        product_comparisons = []
        if "X" in product_rates and "Y" in product_rates:
            for product in product_rates["X"]:
                if product in product_rates["Y"]:
                    x_rate = product_rates["X"][product]
                    y_rate = product_rates["Y"][product]
                    x_wins = x_rate > y_rate
                    product_comparisons.append(x_wins)
                    print(f" {product}: X ({x_rate:.1%}) vs Y ({y_rate:.1%}) = X {'wins' if x_wins else 'loses'}")
        all_products_favor_x = all(product_comparisons) if product_comparisons else False
        paradox_exists = (overall_gap < 0) and all_products_favor_x
        print(f" Paradox Logic: Overall X<Y ({overall_gap < 0}) + All Products X>Y ({all_products_favor_x}) = Paradox Exists ({paradox_exists})")
    print(f" Test Result: {'✅ PASS' if (expected_paradox is None or paradox_detected == expected_paradox) else '❌ FAIL'}")
    if paradox_detected:
        for _, row in paradox_results.iterrows():
            print(f" Paradox Details: {row['region']}")
            # Detector output schema may omit these fields; fall back to N/A.
            contradiction_rate = row.get('contradiction_rate', 'N/A')
            if isinstance(contradiction_rate, (int, float)):
                print(f" Contradiction Rate: {contradiction_rate:.1%}")
            else:
                print(f" Contradiction Rate: {contradiction_rate}")
            print(f" Severity: {row.get('severity', 'N/A')}")
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] {test_name}:")
    if paradox_detected:
        print(f" Region-level Simpson's paradox detected - overall regional comparison misleads")
        print(f" Business insight: Product mix differences mask true performance capabilities")
        print(f" Recommendation: Focus on product-level analysis for accurate performance assessment")
    else:
        print(f" No region-level paradox detected - overall regional comparison is reliable")
def _run_baseline_consistency_test(self, test_name, df):
    """Run baseline consistency test.

    Runs the decomposition once per baseline strategy and reports each
    region's total net gap so the strategies can be compared side by side.

    Args:
        test_name: Label used to tag every printed output section.
        df: Data with region/product/numerator/denominator columns.
    """
    baselines = ["rest_of_world", "global_average", "top_performer"]
    baseline_results = {}
    for baseline in baselines:
        try:
            results = oaxaca_blinder_decomposition(
                df=df,
                region_column="region",
                numerator_column="numerator",
                denominator_column="denominator",
                category_columns=["product"],
                baseline_region=baseline,
            )
            baseline_results[baseline] = results
        except Exception as e:
            # Fix: record the failure so the reporting loop can skip it --
            # previously nothing was stored and the None-check below was dead.
            baseline_results[baseline] = None
            print(f"[DEBUG] {baseline} failed: {e}")
    # === CRITICAL DATA POINTS ===
    print(f"\n[CRITICAL] {test_name} Baseline Consistency:")
    for baseline, results in baseline_results.items():
        if results is None:
            continue
        # One total net gap per region, summed over its product rows.
        total_gaps = {
            region: results[results["region"] == region]["net_gap"].sum()
            for region in results["region"].unique()
        }
        print(f" {baseline}: {total_gaps}")
    # === NARRATIVE ===
    print(f"\n[NARRATIVE] {test_name}:")
    print(
        f" Baseline consistency validated across {len(baseline_results)} methods"
    )
if __name__ == "__main__":
    # Entry point: build the framework and run every stress-test category.
    StressTestFramework().run_all_stress_tests()
# Last updated (trailing artifact from document extraction; kept as a comment)