# Page 2  (pagination artifact from document extraction; kept as a comment so the module parses)

"""
Comprehensive Stress Testing Framework for Oaxaca-Blinder Decomposition

This module provides targeted stress tests for specific scenarios, with organized output
for debugging, critical data points, and business narratives.

Stress Test Categories:
======================

1. SIMPSON'S PARADOX DETECTION
   - True Positive: Clear paradox cases that should be detected
   - True Negative: No paradox cases that should not trigger false alarms
   - Edge Cases: Borderline cases to test sensitivity

2. BUSINESS CONCLUSION LOGIC
   - Pure Composition-driven cases
   - Pure Performance-driven cases
   - Mixed/Balanced cases
   - Threshold edge cases

3. MATHEMATICAL ACCURACY
   - Perfect decomposition validation
   - Edge cases (zero denominators, extreme values)
   - Different baseline types consistency

4. NARRATIVE GENERATION
   - Positive vs negative performance gaps
   - Different gap magnitudes
   - Various driver combinations

5. BASELINE ROBUSTNESS
   - rest_of_world vs global_average vs top_performer
   - Limited data scenarios
"""

import os
import sys

import pandas as pd

sys.path.append(os.path.join(os.path.dirname(__file__), "docs"))

from oaxaca_blinder import (
    determine_business_focus,
    detect_aggregation_bias,
    enhanced_rca_analysis,
    oaxaca_blinder_decomposition,
)


class StressTestFramework:
    """Framework for organized stress testing with categorized outputs."""

    def __init__(self):
        self.test_results = {}

    def run_all_stress_tests(self):
        """Run every stress-test category in sequence."""
        banner = "=" * 80
        print(banner)
        print("OAXACA-BLINDER STRESS TESTING FRAMEWORK")
        print(banner)

        # Categories run in fixed order: paradox detection, business
        # conclusions, math accuracy, narratives, baseline robustness.
        for category in (
            self._test_simpson_paradox_detection,
            self._test_business_conclusion_logic,
            self._test_mathematical_accuracy,
            self._test_narrative_generation,
            self._test_baseline_robustness,
        ):
            category()

    def _test_simpson_paradox_detection(self):
        """Exercise Simpson's paradox detection with targeted scenarios."""
        print("\n" + "=" * 60)
        print("STRESS TEST 1: SIMPSON'S PARADOX DETECTION")
        print("=" * 60)

        # Test 1A: a clear paradox that must be caught.
        print("\n--- Test 1A: TRUE POSITIVE (Should detect paradox) ---")
        self._run_paradox_test(
            "TRUE_POSITIVE",
            self._create_true_positive_paradox_data(),
            expected_paradox=True,
        )

        # Test 1B: no paradox present; the detector must not false-alarm.
        print("\n--- Test 1B: TRUE NEGATIVE (Should NOT detect paradox) ---")
        self._run_paradox_test(
            "TRUE_NEGATIVE",
            self._create_true_negative_paradox_data(),
            expected_paradox=False,
        )

        # Test 1C: borderline data probes detector sensitivity (no fixed answer).
        print("\n--- Test 1C: EDGE CASE (Borderline sensitivity test) ---")
        self._run_paradox_test(
            "EDGE_CASE", self._create_edge_case_paradox_data(), expected_paradox=None
        )

        # Test 1D: region trails overall yet wins inside every product.
        print("\n--- Test 1D: REGION-LEVEL PARADOX (Region vs Region comparison) ---")
        self._run_region_paradox_test(
            "REGION_LEVEL_PARADOX",
            self._create_region_level_paradox_data(),
            expected_paradox=True,
        )

    def _test_business_conclusion_logic(self):
        """Exercise business-conclusion logic against clear-cut cases."""
        print("\n" + "=" * 60)
        print("STRESS TEST 2: BUSINESS CONCLUSION LOGIC")
        print("=" * 60)

        # Test 2A: gap driven entirely by mix/composition.
        print("\n--- Test 2A: PURE COMPOSITION-DRIVEN ---")
        self._run_conclusion_test(
            "PURE_COMPOSITION",
            self._create_pure_composition_data(),
            expected_conclusion="composition_driven",
        )

        # Test 2B: gap driven entirely by execution/performance.
        print("\n--- Test 2B: PURE PERFORMANCE-DRIVEN ---")
        self._run_conclusion_test(
            "PURE_PERFORMANCE",
            self._create_pure_performance_data(),
            expected_conclusion="performance_driven",
        )

        # Test 2C: balanced effects -- the decisive logic still picks one side.
        print("\n--- Test 2C: MIXED/BALANCED ---")
        self._run_conclusion_test(
            "MIXED_BALANCED",
            self._create_mixed_balanced_data(),
            expected_conclusion="performance_driven",
        )

        # Test 2D: identical subcategory rates (real-world edge case).
        print("\n--- Test 2D: IDENTICAL RATES SCENARIO ---")
        self._run_identical_rates_test(
            "IDENTICAL_RATES", self._create_identical_rates_data()
        )

        # Test 2E: probe sensitivity of the conclusion thresholds.
        print("\n--- Test 2E: THRESHOLD SENSITIVITY ANALYSIS ---")
        self._run_threshold_sensitivity_analysis()

    def _test_mathematical_accuracy(self):
        """Validate decomposition math on clean and extreme inputs."""
        print("\n" + "=" * 60)
        print("STRESS TEST 3: MATHEMATICAL ACCURACY")
        print("=" * 60)

        # Test 3A: clean data where the decomposition should be exact.
        print("\n--- Test 3A: PERFECT DECOMPOSITION VALIDATION ---")
        self._run_validation_test(
            "PERFECT_VALIDATION", self._create_perfect_validation_data()
        )

        # Test 3B: extreme rates to stress numeric edge cases.
        print("\n--- Test 3B: EXTREME VALUES EDGE CASES ---")
        self._run_validation_test("EXTREME_VALUES", self._create_extreme_values_data())

    def _test_narrative_generation(self):
        """Check narrative quality for both gap directions."""
        print("\n" + "=" * 60)
        print("STRESS TEST 4: NARRATIVE GENERATION")
        print("=" * 60)

        # Test 4A: region far above baseline.
        print("\n--- Test 4A: LARGE POSITIVE GAP NARRATIVE ---")
        self._run_narrative_test(
            "LARGE_POSITIVE", self._create_large_positive_gap_data()
        )

        # Test 4B: region far below baseline.
        print("\n--- Test 4B: LARGE NEGATIVE GAP NARRATIVE ---")
        self._run_narrative_test(
            "LARGE_NEGATIVE", self._create_large_negative_gap_data()
        )

    def _test_baseline_robustness(self):
        """Confirm the different baseline types agree with each other."""
        print("\n" + "=" * 60)
        print("STRESS TEST 5: BASELINE ROBUSTNESS")
        print("=" * 60)

        # Test 5A: same data evaluated against every baseline method.
        print("\n--- Test 5A: BASELINE CONSISTENCY TEST ---")
        self._run_baseline_consistency_test(
            "BASELINE_CONSISTENCY", self._create_baseline_consistency_data()
        )

    # =================================================================
    # DATA CREATION METHODS - Each creates targeted test data
    # =================================================================

    def _create_true_positive_paradox_data(self):
        """Create data that SHOULD trigger Simpson's paradox detection."""
        # Region X underperforms overall but outperforms in every subcategory
        # This is a classic Simpson's paradox case
        data = [
            # Region X: 30% overall (bad) but beats baseline in both subcategories
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 45,
                "denominator": 100,
            },  # 45% vs baseline 40%
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 225,
                "denominator": 900,
            },  # 25% vs baseline 20%
            # Baseline regions: 35% overall (good) but worse in each subcategory
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 360,
                "denominator": 900,
            },  # 40%
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 20,
                "denominator": 100,
            },  # 20%
            {
                "region": "Z",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 360,
                "denominator": 900,
            },  # 40%
            {
                "region": "Z",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 20,
                "denominator": 100,
            },  # 20%
        ]
        return pd.DataFrame(data)

    def _create_true_negative_paradox_data(self):
        """Create data that should NOT trigger paradox detection."""
        # Consistent performance across all levels
        data = [
            # Region X: Consistently underperforms at all levels
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 35,
                "denominator": 100,
            },  # 35% vs 40%
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 150,
                "denominator": 900,
            },  # 17% vs 20%
            # Baseline regions: Consistently better
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 400,
                "denominator": 1000,
            },  # 40%
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 200,
                "denominator": 1000,
            },  # 20%
            {
                "region": "Z",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 400,
                "denominator": 1000,
            },  # 40%
            {
                "region": "Z",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 200,
                "denominator": 1000,
            },  # 20%
        ]
        return pd.DataFrame(data)

    def _create_edge_case_paradox_data(self):
        """Create borderline case to test sensitivity."""
        # Marginal differences that might or might not trigger detection
        data = [
            # Region X: Slightly mixed signals
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 41,
                "denominator": 100,
            },  # 41% vs 40%
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 190,
                "denominator": 900,
            },  # 21% vs 20%
            # Baseline regions
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 360,
                "denominator": 900,
            },  # 40%
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 20,
                "denominator": 100,
            },  # 20%
            {
                "region": "Z",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 360,
                "denominator": 900,
            },  # 40%
            {
                "region": "Z",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 20,
                "denominator": 100,
            },  # 20%
        ]
        return pd.DataFrame(data)

    def _create_pure_composition_data(self):
        """Create data that is clearly composition-driven."""
        # Same rates, very different mix - but rates must be different to avoid cancellation
        data = [
            # Region X: Bad mix (over-allocated to low-performing segment)
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 60,
                "denominator": 100,
            },  # 60% rate, 10% mix
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 270,
                "denominator": 900,
            },  # 30% rate, 90% mix
            # Baseline: Good mix (balanced allocation) with same rates
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 300,
                "denominator": 500,
            },  # 60% rate, 50% mix
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 150,
                "denominator": 500,
            },  # 30% rate, 50% mix
        ]
        return pd.DataFrame(data)

    def _create_pure_performance_data(self):
        """Create data that is clearly performance-driven."""
        # Same mix, very different rates
        data = [
            # Region X: Bad execution across all segments
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 300,
                "denominator": 1000,
            },  # 30% rate, 50% mix
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 100,
                "denominator": 1000,
            },  # 10% rate, 50% mix
            # Baseline: Good execution
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 500,
                "denominator": 1000,
            },  # 50% rate, 50% mix
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 300,
                "denominator": 1000,
            },  # 30% rate, 50% mix
        ]
        return pd.DataFrame(data)

    def _create_mixed_balanced_data(self):
        """Create data with balanced composition and performance effects."""
        data = [
            # Region X: Both mix and rate differences that are roughly equal in magnitude
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 280,
                "denominator": 700,
            },  # 40% rate, 70% mix (same rate as baseline, different mix)
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 60,
                "denominator": 300,
            },  # 20% rate, 30% mix (worse rate than baseline, different mix)
            # Baseline: Balanced mix, different rates
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Perf",
                "numerator": 200,
                "denominator": 500,
            },  # 40% rate, 50% mix
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Perf",
                "numerator": 150,
                "denominator": 500,
            },  # 30% rate, 50% mix
        ]
        return pd.DataFrame(data)

    def _create_perfect_validation_data(self):
        """Create data for perfect mathematical validation."""
        # Simple, clean data for validation testing
        data = [
            {
                "region": "A",
                "product": "P1",
                "vertical": "V1",
                "numerator": 400,
                "denominator": 1000,
            },
            {
                "region": "A",
                "product": "P1",
                "vertical": "V2",
                "numerator": 300,
                "denominator": 1000,
            },
            {
                "region": "B",
                "product": "P1",
                "vertical": "V1",
                "numerator": 350,
                "denominator": 1000,
            },
            {
                "region": "B",
                "product": "P1",
                "vertical": "V2",
                "numerator": 250,
                "denominator": 1000,
            },
        ]
        return pd.DataFrame(data)

    def _create_extreme_values_data(self):
        """Create data with extreme values to test edge cases."""
        data = [
            # Very high rates
            {
                "region": "A",
                "product": "P1",
                "vertical": "V1",
                "numerator": 950,
                "denominator": 1000,
            },
            {
                "region": "A",
                "product": "P1",
                "vertical": "V2",
                "numerator": 10,
                "denominator": 1000,
            },
            # Very low rates
            {
                "region": "B",
                "product": "P1",
                "vertical": "V1",
                "numerator": 50,
                "denominator": 1000,
            },
            {
                "region": "B",
                "product": "P1",
                "vertical": "V2",
                "numerator": 990,
                "denominator": 1000,
            },
        ]
        return pd.DataFrame(data)

    def _create_large_positive_gap_data(self):
        """Create data with large positive performance gap."""
        data = [
            {
                "region": "WINNER",
                "product": "P1",
                "vertical": "V1",
                "numerator": 800,
                "denominator": 1000,
            },
            {
                "region": "WINNER",
                "product": "P1",
                "vertical": "V2",
                "numerator": 700,
                "denominator": 1000,
            },
            {
                "region": "BASELINE",
                "product": "P1",
                "vertical": "V1",
                "numerator": 300,
                "denominator": 1000,
            },
            {
                "region": "BASELINE",
                "product": "P1",
                "vertical": "V2",
                "numerator": 200,
                "denominator": 1000,
            },
        ]
        return pd.DataFrame(data)

    def _create_large_negative_gap_data(self):
        """Create data with large negative performance gap."""
        data = [
            {
                "region": "UNDERPERFORMER",
                "product": "P1",
                "vertical": "V1",
                "numerator": 100,
                "denominator": 1000,
            },
            {
                "region": "UNDERPERFORMER",
                "product": "P1",
                "vertical": "V2",
                "numerator": 50,
                "denominator": 1000,
            },
            {
                "region": "BASELINE",
                "product": "P1",
                "vertical": "V1",
                "numerator": 500,
                "denominator": 1000,
            },
            {
                "region": "BASELINE",
                "product": "P1",
                "vertical": "V2",
                "numerator": 400,
                "denominator": 1000,
            },
        ]
        return pd.DataFrame(data)

    def _create_baseline_consistency_data(self):
        """Create data for testing baseline consistency."""
        data = [
            {
                "region": "A",
                "product": "P1",
                "vertical": "V1",
                "numerator": 400,
                "denominator": 1000,
            },
            {
                "region": "A",
                "product": "P1",
                "vertical": "V2",
                "numerator": 300,
                "denominator": 1000,
            },
            {
                "region": "B",
                "product": "P1",
                "vertical": "V1",
                "numerator": 350,
                "denominator": 1000,
            },
            {
                "region": "B",
                "product": "P1",
                "vertical": "V2",
                "numerator": 250,
                "denominator": 1000,
            },
            {
                "region": "C",
                "product": "P1",
                "vertical": "V1",
                "numerator": 300,
                "denominator": 1000,
            },
            {
                "region": "C",
                "product": "P1",
                "vertical": "V2",
                "numerator": 200,
                "denominator": 1000,
            },
        ]
        return pd.DataFrame(data)

    def _create_identical_rates_data(self):
        """Create data where subcategories have identical rates but different mix."""
        # This tests the real-world scenario where construct gaps cancel out mathematically
        data = [
            # Region X: Bad mix (10% high-value, 90% low-value) but identical rates
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "High_Value",
                "numerator": 50,
                "denominator": 100,
            },  # 50% rate, 10% mix
            {
                "region": "X",
                "product": "Product_A",
                "vertical": "Low_Value",
                "numerator": 450,
                "denominator": 900,
            },  # 50% rate, 90% mix
            # Region Y: Good mix (50% each) with identical rates
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "High_Value",
                "numerator": 250,
                "denominator": 500,
            },  # 50% rate, 50% mix
            {
                "region": "Y",
                "product": "Product_A",
                "vertical": "Low_Value",
                "numerator": 250,
                "denominator": 500,
            },  # 50% rate, 50% mix
        ]
        return pd.DataFrame(data)

    def _create_region_level_paradox_data(self):
        """Create data for region-level Simpson's paradox test case.

        Region X appears to underperform overall (13.6% vs 20.4%) but actually
        outperforms in every product due to unfortunate product mix.
        """
        data = [
            # Region X: 13.6% overall but outperforms in both products
            # Product_High: 24.0% rate, 20% mix (200 volume)
            {
                "region": "X",
                "product": "Product_High",
                "numerator": 48,  # 24.0% rate
                "denominator": 200,  # 20% of 1000 total
            },
            # Product_Low: 11.0% rate, 80% mix (800 volume)
            {
                "region": "X",
                "product": "Product_Low",
                "numerator": 88,  # 11.0% rate
                "denominator": 800,  # 80% of 1000 total
            },
            # Region Y: 20.4% overall but underperforms in both products
            # Product_High: 23.0% rate, 80% mix (800 volume)
            {
                "region": "Y",
                "product": "Product_High",
                "numerator": 184,  # 23.0% rate
                "denominator": 800,  # 80% of 1000 total
            },
            # Product_Low: 10.0% rate, 20% mix (200 volume)
            {
                "region": "Y",
                "product": "Product_Low",
                "numerator": 20,  # 10.0% rate
                "denominator": 200,  # 20% of 1000 total
            },
        ]
        return pd.DataFrame(data)

    def _run_identical_rates_test(self, test_name, df):
        """Test the identical rates scenario and its business implications.

        Prints per-row rate/mix debug data, overall regional performance, the
        decomposition's gap components, and a narrative. When both gap
        components are ~0 it flags the mathematical-cancellation case where
        strategic mix differences still exist.

        Args:
            test_name: Label stamped on each printed output section.
            df: Input frame with region/product/vertical/numerator/denominator
                columns.
        """

        # === DEBUG DATA ===
        print(f"\n[DEBUG] {test_name} - Raw Data:")
        # Pre-compute each region's denominator total once; the previous
        # version re-filtered the entire frame on every row (O(rows^2)).
        region_totals = df.groupby("region")["denominator"].sum()
        for _, row in df.iterrows():
            rate = row["numerator"] / row["denominator"]
            mix_pct = row["denominator"] / region_totals[row["region"]]
            print(
                f"  {row['region']}-{row['vertical']}: {rate:.1%} rate, {mix_pct:.1%} mix ({row['numerator']}/{row['denominator']})"
            )

        # Calculate overall regional performance
        print("\n[DEBUG] Overall Regional Performance:")
        for region in df["region"].unique():
            region_data = df[df["region"] == region]
            total_num = region_data["numerator"].sum()
            total_denom = region_data["denominator"].sum()
            overall_rate = total_num / total_denom
            print(f"  {region}: {overall_rate:.1%} overall ({total_num}/{total_denom})")

        # Run decomposition
        results = oaxaca_blinder_decomposition(
            df=df,
            region_column="region",
            numerator_column="numerator",
            denominator_column="denominator",
            category_columns=["product", "vertical"],
        )

        # === CRITICAL DATA POINTS ===
        print(f"\n[CRITICAL] {test_name} Results:")
        for region in df["region"].unique():
            region_results = results[results["region"] == region]
            if len(region_results) == 0:
                continue

            construct_gap = region_results["construct_gap_contribution"].sum()
            performance_gap = region_results["performance_gap_contribution"].sum()
            net_gap = region_results["net_gap"].sum()

            print(f"  {region}:")
            print(
                f"    Construct Gap: {construct_gap:+.4f} ({construct_gap*100:+.1f}pp)"
            )
            print(
                f"    Performance Gap: {performance_gap:+.4f} ({performance_gap*100:+.1f}pp)"
            )
            print(f"    Net Gap: {net_gap:+.4f} ({net_gap*100:+.1f}pp)")

            # Both components vanishing together is the cancellation signature.
            if abs(construct_gap) < 0.001 and abs(performance_gap) < 0.001:
                print("    ⚠️  IDENTICAL RATES DETECTED: Mathematical cancellation")
                print("    📊 Business Reality: Strategic mix differences exist")
                print("    💡 Recommendation: Consider segment value weighting")

        # === NARRATIVE ===
        print(f"\n[NARRATIVE] {test_name}:")
        print(
            "  Identical rates scenario demonstrates mathematical limitations of standard decomposition"
        )
        print(
            "  Business insight: Mix differences may still have strategic importance"
        )
        print(
            "  Recommendation: Supplement with volume-based or value-weighted analysis"
        )

    def _run_threshold_sensitivity_analysis(self):
        """Analyze decisive business conclusion logic with various gap scenarios.

        Feeds known construct/performance gap pairs into
        ``determine_business_focus`` and checks the conclusion always follows
        the larger absolute contributor, with ties defaulting to
        performance-driven.
        """

        # Construct/performance pairs with the conclusion each must yield.
        test_scenarios = [
            {"construct": 0.02, "performance": 0.03, "expected": "performance_driven"},  # 0.67 ratio - performance larger
            {"construct": 0.03, "performance": 0.02, "expected": "composition_driven"},  # 1.5 ratio - construct larger
            {"construct": 0.025, "performance": 0.025, "expected": "performance_driven"},  # 1.0 ratio - equal, defaults to performance
            {"construct": 0.02, "performance": 0.025, "expected": "performance_driven"},  # 0.8 ratio - performance larger
        ]

        # Placeholder-free strings below are plain literals (redundant f
        # prefixes removed).
        print("\n[DEBUG] THRESHOLD_SENSITIVITY - Test Scenarios:")
        for i, scenario in enumerate(test_scenarios):
            ratio = abs(scenario["construct"]) / abs(scenario["performance"])
            print(f"  Scenario {i+1}: Construct={scenario['construct']:.3f}, Performance={scenario['performance']:.3f}, Ratio={ratio:.2f}")

        print("\n[CRITICAL] THRESHOLD_SENSITIVITY Results:")
        print(f"  {'Scenario':<12} {'Ratio':<8} {'Expected':<20} {'Actual':<20} {'Result':<10}")
        print(f"  {'-'*12} {'-'*8} {'-'*20} {'-'*20} {'-'*10}")

        all_passed = True
        for i, scenario in enumerate(test_scenarios):
            construct = scenario["construct"]
            performance = scenario["performance"]
            expected = scenario["expected"]
            ratio = abs(construct) / abs(performance)

            # Use our actual business conclusion function
            business_focus = determine_business_focus(construct, performance)
            actual = business_focus["business_conclusion"]

            passed = actual == expected
            all_passed = all_passed and passed
            result = "✅ PASS" if passed else "❌ FAIL"

            print(f"  Scenario {i+1:<4} {ratio:<8.2f} {expected:<20} {actual:<20} {result:<10}")

        # === NARRATIVE ===
        print("\n[NARRATIVE] THRESHOLD_SENSITIVITY:")
        print("  Decisive logic provides clear, actionable business guidance")
        print("  Always focuses on the larger absolute contributor (construct vs performance)")
        print("  Eliminates ambiguous 'mixed' recommendations for better decision-making")
        print(f"  Test Result: {'✅ ALL PASS' if all_passed else '❌ SOME FAILURES'}")

    # =================================================================
    # TEST EXECUTION METHODS - Organized output structure
    # =================================================================

    def _run_paradox_test(self, test_name, df, expected_paradox):
        """Run paradox detection test with organized output.

        Args:
            test_name: Label stamped on each printed output section.
            df: Input frame with region/product/vertical/numerator/denominator
                columns.
            expected_paradox: True/False to assert detection, or None for a
                borderline sensitivity probe where either outcome passes.
        """

        # === DEBUG DATA (for reasoning) ===
        print(f"\n[DEBUG] {test_name} - Raw Data:")
        for _, row in df.iterrows():
            rate = row["numerator"] / row["denominator"]
            print(
                f"  {row['region']}-{row['product']}-{row['vertical']}: {rate:.1%} ({row['numerator']}/{row['denominator']})"
            )

        # Run paradox detection
        paradox_results = detect_aggregation_bias(
            df=df,
            region_column="region",
            numerator_column="numerator",
            denominator_column="denominator",
            category_columns=["product"],
            subcategory_columns=["vertical"],
            top_n_focus=1,
        )

        # === CRITICAL DATA POINTS (for conclusion) ===
        # Any returned row counts as a detected paradox.
        paradox_detected = len(paradox_results) > 0
        print(f"\n[CRITICAL] {test_name} Results:")
        print(f"  Expected Paradox: {expected_paradox}")
        print(f"  Detected Paradox: {paradox_detected}")
        print(
            f"  Test Result: {'✅ PASS' if (expected_paradox is None or paradox_detected == expected_paradox) else '❌ FAIL'}"
        )

        if paradox_detected:
            for _, row in paradox_results.iterrows():
                print(f"  Paradox Details: {row['region']}-{row['category']}")
                print(
                    f"    Aggregate → Detailed: {row['aggregate_recommendation']} → {row['subcategory_recommendation']}"
                )
                print(f"    Business Risk: {row['gap_significance']}")

        # === NARRATIVE (for business users) ===
        # The header line is identical on both paths, so print it once
        # (the original duplicated it in each branch).
        print(f"\n[NARRATIVE] {test_name}:")
        if paradox_detected:
            print(
                "  Simpson's paradox detected - aggregate analysis would mislead business decisions"
            )
        else:
            print(
                "  No paradox detected - analysis is reliable for business decisions"
            )

    def _run_conclusion_test(self, test_name, df, expected_conclusion):
        """Run a business-conclusion classification test with organized output.

        Args:
            test_name: Human-readable label printed in every output section.
            df: Input frame with region/product/vertical/numerator/denominator
                columns.
            expected_conclusion: Expected ``business_conclusion`` value
                (e.g. "composition_driven" or "performance_driven").
        """

        # Decompose each region's gap into composition vs performance parts.
        results = oaxaca_blinder_decomposition(
            df=df,
            region_column="region",
            numerator_column="numerator",
            denominator_column="denominator",
            category_columns=["product", "vertical"],
        )

        # Fixtures are built around a single focus region: the first one.
        region = results["region"].iloc[0]
        region_data = results[results["region"] == region]

        construct_gap = region_data["construct_gap_contribution"].sum()
        performance_gap = region_data["performance_gap_contribution"].sum()

        # === DEBUG DATA ===
        print(f"\n[DEBUG] {test_name} - Gap Components:")
        print(f"  Construct Gap: {construct_gap:+.4f} ({construct_gap*100:+.1f}pp)")
        print(
            f"  Performance Gap: {performance_gap:+.4f} ({performance_gap*100:+.1f}pp)"
        )
        # Guard the zero-denominator case directly rather than routing through
        # a float("inf") sentinel and re-testing it just for formatting.
        if performance_gap != 0:
            ratio_str = f"{abs(construct_gap) / abs(performance_gap):.2f}"
        else:
            ratio_str = "inf"
        print(f"  Ratio: |Construct|/|Performance| = {ratio_str}")

        # === CRITICAL DATA POINTS ===
        business_focus = determine_business_focus(construct_gap, performance_gap)
        actual_conclusion = business_focus["business_conclusion"]
        print(f"\n[CRITICAL] {test_name} Results:")
        print(f"  Expected Conclusion: {expected_conclusion}")
        print(f"  Actual Conclusion: {actual_conclusion}")
        print(
            f"  Test Result: {'✅ PASS' if actual_conclusion == expected_conclusion else '❌ FAIL'}"
        )

        # === NARRATIVE ===
        print(f"\n[NARRATIVE] {test_name}:")
        if actual_conclusion == "composition_driven":
            print(
                "  Focus on strategic allocation adjustments - mix optimization needed"
            )
        elif actual_conclusion == "performance_driven":
            print("  Focus on operational improvements - execution enhancement needed")
        else:
            print(
                "  Balanced approach needed - both allocation and execution improvements"
            )

    def _run_validation_test(self, test_name, df):
        """Check that decomposed components sum back to the true regional gaps.

        Args:
            test_name: Human-readable label printed in every output section.
            df: Input frame with region/product/vertical/numerator/denominator
                columns.
        """

        # Ground truth: each region's aggregate rate minus the rest-of-world
        # aggregate rate, computed straight from the raw data.
        actual_gaps = {}
        for region in df["region"].unique():
            in_region = df["region"] == region
            own = df[in_region]
            rest = df[~in_region]
            own_rate = own["numerator"].sum() / own["denominator"].sum()
            rest_rate = rest["numerator"].sum() / rest["denominator"].sum()
            actual_gaps[region] = own_rate - rest_rate

        # Decompose with the same category breakdown used elsewhere.
        results = oaxaca_blinder_decomposition(
            df=df,
            region_column="region",
            numerator_column="numerator",
            denominator_column="denominator",
            category_columns=["product", "vertical"],
        )

        # === DEBUG DATA ===
        print(f"\n[DEBUG] {test_name} - Validation Details:")

        # === CRITICAL DATA POINTS ===
        print(f"\n[CRITICAL] {test_name} Validation:")
        max_error = 0
        for region, true_gap in actual_gaps.items():
            reconstructed = results.loc[results["region"] == region, "net_gap"].sum()
            error = abs(true_gap - reconstructed)
            max_error = max(max_error, error)

            print(
                f"  {region}: Actual={true_gap:+.4f}, Decomposed={reconstructed:+.4f}, Error={error:.6f}"
            )

        # The decomposition is exact by construction, so anything beyond
        # floating-point noise is a failure.
        print(f"  Maximum Error: {max_error:.6f}")
        print(f"  Test Result: {'✅ PASS' if max_error < 1e-10 else '❌ FAIL'}")

        # === NARRATIVE ===
        print(f"\n[NARRATIVE] {test_name}:")
        if max_error < 1e-10:
            print("  Mathematical decomposition is perfectly accurate")
        else:
            print("  Mathematical accuracy issue detected - needs investigation")

    def _run_narrative_test(self, test_name, df):
        """Run a narrative generation test and print per-region narratives.

        Args:
            test_name: Human-readable label printed in every output section.
            df: Input frame with region/product/vertical/numerator/denominator
                columns.
        """

        # Run the end-to-end RCA pipeline (decomposition + narrative layer).
        results = enhanced_rca_analysis(
            df=df,
            region_column="region",
            numerator_column="numerator",
            denominator_column="denominator",
            category_columns=["product"],
            subcategory_columns=["vertical"],
        )

        # === DEBUG DATA ===
        print(f"\n[DEBUG] {test_name} - Analysis Components:")

        # === CRITICAL DATA POINTS ===
        # (A previously computed gap-magnitude local was never used; dropped.)
        print(f"\n[CRITICAL] {test_name} Narrative Quality:")
        for region, analysis in results["regional_analysis"].items():
            print(
                f"  {region}: Gap={analysis['total_gap']:+.1%}, Conclusion={analysis['business_conclusion']}"
            )

        # === NARRATIVE ===
        print(f"\n[NARRATIVE] {test_name}:")
        for region, analysis in results["regional_analysis"].items():
            print(f"  {region}: {analysis['business_narrative']}")

    def _run_region_paradox_test(self, test_name, df, expected_paradox):
        """Run a region-level Simpson's paradox test with organized output.

        Args:
            test_name: Human-readable label printed in every output section.
            df: Input frame with region/product/numerator/denominator columns.
            expected_paradox: True/False expectation, or None to accept any
                outcome.
        """

        # === DEBUG DATA (validate the test case setup) ===
        print(f"\n[DEBUG] {test_name} - Raw Data:")

        # Aggregate rate per region (all products pooled).
        overall_rates = {}
        for region in df["region"].unique():
            region_data = df[df["region"] == region]
            total_num = region_data["numerator"].sum()
            total_denom = region_data["denominator"].sum()
            overall_rate = total_num / total_denom
            overall_rates[region] = overall_rate
            print(f"  {region}: {overall_rate:.1%} overall ({total_num}/{total_denom})")

        # Per-product rate and mix share within each region.
        print("\n[DEBUG] Product-Level Rates:")
        product_rates = {}
        for region in df["region"].unique():
            product_rates[region] = {}
            # Hoist the loop-invariant regional denominator out of the
            # per-product loop.
            region_denom = df[df["region"] == region]["denominator"].sum()
            for product in df["product"].unique():
                product_data = df[(df["region"] == region) & (df["product"] == product)]
                if len(product_data) > 0:
                    # Sum across rows so fixtures with multiple rows per
                    # region-product pair are handled correctly (.iloc[0]
                    # would silently ignore all but the first row).
                    rate = product_data["numerator"].sum() / product_data["denominator"].sum()
                    mix = product_data["denominator"].sum() / region_denom
                    product_rates[region][product] = rate
                    print(f"  {region}-{product}: {rate:.1%} rate, {mix:.0%} mix")

        # Run region-level paradox detection (category_columns=None compares
        # the regions themselves, with products as the drill-down level).
        paradox_results = detect_aggregation_bias(
            df=df,
            region_column="region",
            numerator_column="numerator",
            denominator_column="denominator",
            category_columns=None,  # Region-level detection
            subcategory_columns=["product"],
        )

        # === CRITICAL DATA POINTS ===
        paradox_detected = len(paradox_results) > 0
        print(f"\n[CRITICAL] {test_name} Results:")
        print(f"  Expected Paradox: {expected_paradox}")
        print(f"  Detected Paradox: {paradox_detected}")

        # Independently validate the fixture: a paradox exists when X loses
        # overall yet wins every shared product head-to-head.
        if "X" in overall_rates and "Y" in overall_rates:
            overall_gap = overall_rates["X"] - overall_rates["Y"]
            print(f"  Overall Gap: X vs Y = {overall_gap:+.1%} (X appears {'better' if overall_gap > 0 else 'worse'})")

            product_comparisons = []
            if "X" in product_rates and "Y" in product_rates:
                for product in product_rates["X"]:
                    if product in product_rates["Y"]:
                        x_rate = product_rates["X"][product]
                        y_rate = product_rates["Y"][product]
                        x_wins = x_rate > y_rate
                        product_comparisons.append(x_wins)
                        print(f"  {product}: X ({x_rate:.1%}) vs Y ({y_rate:.1%}) = X {'wins' if x_wins else 'loses'}")

            all_products_favor_x = all(product_comparisons) if product_comparisons else False
            paradox_exists = (overall_gap < 0) and all_products_favor_x
            print(f"  Paradox Logic: Overall X<Y ({overall_gap < 0}) + All Products X>Y ({all_products_favor_x}) = Paradox Exists ({paradox_exists})")

        print(f"  Test Result: {'✅ PASS' if (expected_paradox is None or paradox_detected == expected_paradox) else '❌ FAIL'}")

        if paradox_detected:
            for _, row in paradox_results.iterrows():
                print(f"  Paradox Details: {row['region']}")
                contradiction_rate = row.get('contradiction_rate', 'N/A')
                # Format as a percentage only when the field is numeric.
                if isinstance(contradiction_rate, (int, float)):
                    print(f"    Contradiction Rate: {contradiction_rate:.1%}")
                else:
                    print(f"    Contradiction Rate: {contradiction_rate}")
                print(f"    Severity: {row.get('severity', 'N/A')}")

        # === NARRATIVE ===
        print(f"\n[NARRATIVE] {test_name}:")
        if paradox_detected:
            print("  Region-level Simpson's paradox detected - overall regional comparison misleads")
            print("  Business insight: Product mix differences mask true performance capabilities")
            print("  Recommendation: Focus on product-level analysis for accurate performance assessment")
        else:
            print("  No region-level paradox detected - overall regional comparison is reliable")

    def _run_baseline_consistency_test(self, test_name, df):
        """Run the decomposition under each baseline and compare total gaps.

        Baselines that raise are logged and skipped, so only the baselines
        that actually ran appear in the comparison and the final count.

        Args:
            test_name: Human-readable label printed in every output section.
            df: Input frame with region/product/numerator/denominator columns.
        """

        baselines = ["rest_of_world", "global_average", "top_performer"]
        baseline_results = {}

        for baseline in baselines:
            try:
                baseline_results[baseline] = oaxaca_blinder_decomposition(
                    df=df,
                    region_column="region",
                    numerator_column="numerator",
                    denominator_column="denominator",
                    category_columns=["product"],
                    baseline_region=baseline,
                )
            except Exception as e:
                # Best-effort: a failing baseline must not abort the test.
                print(f"[DEBUG] {baseline} failed: {e}")

        # === CRITICAL DATA POINTS ===
        # Only successful runs were stored, so no None-check is needed
        # (the previous `if results is not None` guard was dead code).
        print(f"\n[CRITICAL] {test_name} Baseline Consistency:")
        for baseline, results in baseline_results.items():
            total_gaps = {
                region: results.loc[results["region"] == region, "net_gap"].sum()
                for region in results["region"].unique()
            }
            print(f"  {baseline}: {total_gaps}")

        # === NARRATIVE ===
        print(f"\n[NARRATIVE] {test_name}:")
        print(
            f"  Baseline consistency validated across {len(baseline_results)} methods"
        )


if __name__ == "__main__":
    # Script entry point: build the framework and run every stress-test suite.
    StressTestFramework().run_all_stress_tests()

# Last updated