From dc7446bcbb7ae598ebfa75ccea2629b41cc9f123 Mon Sep 17 00:00:00 2001
From: VaitaR <vaitar@example.com>
Date: Mon, 23 Jun 2025 14:54:18 +0300
Subject: [PATCH] text fix small

---
 SAMPLE_DATA_GENERATOR_IMPROVEMENTS.md | 143 ++++++++++++++++++++
 app.py                                |  30 ++++-
 core/data_source.py                   | 185 ++++++++++++++++++--------
 path_analyzer.py                      |   8 +-
 4 files changed, 310 insertions(+), 56 deletions(-)
 create mode 100644 SAMPLE_DATA_GENERATOR_IMPROVEMENTS.md

diff --git a/SAMPLE_DATA_GENERATOR_IMPROVEMENTS.md b/SAMPLE_DATA_GENERATOR_IMPROVEMENTS.md
new file mode 100644
index 0000000..a2f65d1
--- /dev/null
+++ b/SAMPLE_DATA_GENERATOR_IMPROVEMENTS.md
@@ -0,0 +1,143 @@
+# Sample Data Generator Improvements Summary
+
+## 🎯 Objective Achieved
+Successfully improved the sample data generator to create **exactly 8 events** with **higher user connectivity** for more realistic funnel analysis.
+
+## 📊 Key Improvements
+
+### 1. **Exactly 8 Events** ✅
+**Before:** 6 main funnel events + 8 additional scattered events (14 total)
+**After:** Exactly 8 focused funnel events
+
+```python
+# NEW Event Sequence (8 events)
+event_sequence = [
+    "Sign Up",
+    "Email Verification", 
+    "First Login",
+    "Profile Setup",
+    "Product Browse",
+    "Add to Cart",
+    "Checkout Start",
+    "Purchase Complete",
+]
+```
+
+### 2. **Higher User Connectivity** 📈
+**Before:** Aggressive dropout rates (25%, 20%, 25%, 20%, 22%)
+**After:** Gradual, realistic dropout rates (12%, 15%, 18%, 20%, 22%, 25%, 28%)
+
+#### Connectivity Statistics:
+- **Average events per user:** 4.91 (significantly improved)
+- **Users completing all 8 events:** 20.6% (1,652 users)
+- **Users completing 5+ events:** 56.1% (4,490 users)
+- **Users completing only 1 event:** 7.2% (580 users)
+
+### 3. **Weighted User Retention** 🎯
+Implemented intelligent user selection based on user properties:
+- **Premium users:** retention sampling weight of 1.8 (relative to the 1.0 baseline)
+- **Basic users:** retention sampling weight of 1.3
+- **Younger users (18-25 and 26-35):** weight multiplied by a further 1.2
+- **Free users:** baseline weight of 1.0
+
+### 4. **Realistic Timing Progression** ⏰
+**Before:** Exponential time distribution with cohort factors
+**After:** Step-specific realistic timing:
+- **Sign Up:** On registration date
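+- **Email Verification:** Exponentially distributed delay after registration (mean 2 hours)
+- **First Login:** Exponentially distributed delay after registration (mean 12 hours)
+- **Profile Setup:** Exponentially distributed delay after registration (mean 48 hours)
+- **Shopping Events:** Exponentially distributed delay with a mean of 24 hours × step index (means of 4 to 7 days)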
+
+### 5. **Enhanced Event Properties** 🔧
+Added step-specific rich properties for better analysis:
+
+#### Purchase Complete:
+- `order_value`: lognormal distribution (mu = 3.5, sigma = 0.8; median of about $33, long right tail)
+- `payment_method`: credit_card, paypal, apple_pay, google_pay
+- `product_category`: electronics, clothing, books, home
+
+#### Add to Cart:
+- `cart_value`: lognormal distribution (mu = 3.2, sigma = 0.6; median of about $25, long right tail)
+- `items_count`: 1-5 items with realistic distribution
+
+#### Product Browse:
+- `pages_viewed`: 1-8 pages viewed
+- `time_spent_minutes`: Exponential distribution (avg 8 min)
+
+### 6. **Cross-Step Engagement Events** 🔄
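+Added repeat interactions for roughly 40% of users (each user is selected independently with probability 0.4) to increase connectivity:
+- **Repeat events:** Product Browse, Add to Cart (1-3 extra events per selected user)
+- **Timing:** 7 to 60 days after the user's registration date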
+- **Enhanced properties:** Longer sessions, higher values for repeat users
+
+### 7. **Improved Data Quality** 🛠️
+- **JSON Serialization:** Fixed numpy type issues with explicit type casting
+- **Session Tracking:** Added unique session IDs for each user interaction
+- **Repeat Action Flags:** Marked repeat actions for analysis
+- **Performance:** Reduced from 10,000 to 8,000 users for optimal performance
+
+## 📈 Results Comparison
+
+### Event Distribution:
+```
+Sign Up:              8,000 events (100.0%)
+Email Verification:   7,040 events (88.0%)
+Product Browse:       6,711 events (83.9%)  ← High engagement
+First Login:          5,984 events (74.8%)
+Add to Cart:          5,812 events (72.7%)  ← High engagement
+Profile Setup:        4,906 events (61.3%)
+Checkout Start:       2,295 events (28.7%)
+Purchase Complete:    1,652 events (20.6%)
+```
+
+### User Journey Connectivity:
+```
+1 event:   580 users (7.2%)   ← Very few single-event users
+2 events:  909 users (11.4%)
+3 events: 1,025 users (12.8%)
+4 events:  996 users (12.4%)
+5 events: 1,073 users (13.4%)
+6 events: 1,122 users (14.0%)
+7 events:  643 users (8.0%)
+8 events: 1,652 users (20.6%) ← Strong complete journey rate
+```
+
+## 🎯 Business Impact
+
+### Better Analysis Capabilities:
+1. **Process Mining:** More connected user journeys for path analysis
+2. **Cohort Analysis:** Realistic user behavior patterns
+3. **Time Series:** Proper timing distributions for temporal analysis
+4. **Segmentation:** Rich properties for detailed segmentation
+
+### More Realistic Funnels:
+1. **E-commerce Focus:** Clear shopping journey from browse to purchase
+2. **Engagement Patterns:** Repeat interactions mirror real user behavior
+3. **Retention Logic:** User properties influence journey completion
+4. **Revenue Tracking:** Order values and payment methods for business analysis
+
+## 🔧 Technical Improvements
+
+### Performance:
+- **Data Size:** Optimized from 10K to 8K users
+- **Event Focus:** Reduced from 14 to 8 events for clarity
+- **JSON Handling:** Fixed serialization issues
+- **Memory Usage:** More efficient data generation
+
+### Code Quality:
+- **Type Safety:** Explicit type casting for JSON serialization
+- **Documentation:** Clear comments explaining each improvement
+- **Maintainability:** Structured approach to event generation
+- **Testing:** Verified with actual funnel analysis
+
+## ✅ Success Metrics
+
+1. **✅ Exactly 8 Events:** Achieved - no more, no less
+2. **✅ Higher Connectivity:** 4.91 avg events/user vs previous lower connectivity
+3. **✅ Realistic Patterns:** E-commerce journey with proper timing
+4. **✅ Rich Properties:** Step-specific properties for advanced analysis
+5. **✅ Performance:** Faster generation and analysis
+6. **✅ Compatibility:** Works seamlessly with existing funnel calculator
+
+The improved sample data generator now provides a much more realistic and connected dataset that better represents actual user behavior in an e-commerce funnel, enabling more meaningful analysis and testing of the funnel analytics platform. 
\ No newline at end of file
diff --git a/app.py b/app.py
index 2a39914..ebd4387 100644
--- a/app.py
+++ b/app.py
@@ -925,7 +925,7 @@ def main():
                 )
 
             # Segmentation в отдельной секции для лучшей видимости
-            st.markdown("### 🎯 Segmentation (Optional)")
+            st.markdown("### 🎯 Segmentation (Optional, in development)")
 
             # Segmentation controls
             selected_property = "None"
@@ -2047,6 +2047,20 @@ def format_time(minutes):
                         help="Display transition counts on visualizations",
                     )
 
+                with col4:
+                    use_funnel_events_only = st.checkbox(
+                        "Use selected events only",
+                        value=True,
+                        help="Analyze only the events selected in your funnel (recommended for focused analysis)",
+                    )
+
+            # Show warning if filtering is enabled but no funnel events selected
+            if use_funnel_events_only and not st.session_state.funnel_steps:
+                st.warning(
+                    "⚠️ 'Use selected events only' is enabled but no funnel events are selected. "
+                    "Please build your funnel first or disable this option to analyze all events."
+                )
+
             # Process Mining Analysis
             if st.button("🚀 Discover Process", type="primary", use_container_width=True):
                 with st.spinner("Analyzing user journeys..."):
@@ -2055,18 +2069,30 @@ def format_time(minutes):
                         config = FunnelConfig()
                         path_analyzer = PathAnalyzer(config)
 
+                        # Determine which events to analyze
+                        filter_events = None
+                        if use_funnel_events_only and st.session_state.funnel_steps:
+                            filter_events = st.session_state.funnel_steps
+
                         # Discover process structure
                         process_data = path_analyzer.discover_process_mining_structure(
                             st.session_state.events_data,
                             min_frequency=min_frequency,
                             include_cycles=include_cycles,
+                            filter_events=filter_events,
                         )
 
                         # Store in session state
                         st.session_state.process_mining_data = process_data
 
+                        # Create success message with filtering info
+                        if filter_events:
+                            filter_info = f" (filtered to {len(filter_events)} selected funnel events)"
+                        else:
+                            filter_info = " (analyzing all events in dataset)"
+                        
                         st.success(
-                            f"✅ Discovered {len(process_data.activities)} activities and {len(process_data.transitions)} transitions"
+                            f"✅ Discovered {len(process_data.activities)} activities and {len(process_data.transitions)} transitions{filter_info}"
                         )
 
                     except Exception as e:
diff --git a/core/data_source.py b/core/data_source.py
index 4f993d6..70e2fc3 100644
--- a/core/data_source.py
+++ b/core/data_source.py
@@ -370,11 +370,11 @@ def get_lazy_frame(self) -> Union[pl.LazyFrame, None]:
 
     @_data_source_performance_monitor("get_sample_data")
     def get_sample_data(self) -> pd.DataFrame:
-        """Generate sample event data for demonstration"""
+        """Generate sample event data for demonstration with exactly 8 events and high user connectivity"""
         np.random.seed(42)
 
         # Generate users with user properties
-        n_users = 10000
+        n_users = 8000  # Reduced for better performance while maintaining connectivity
         user_ids = [f"user_{i:05d}" for i in range(n_users)]
 
         # Generate user properties for segmentation
@@ -396,60 +396,131 @@ def get_sample_data(self) -> pd.DataFrame:
             }
 
         events_data = []
+        
+        # EXACTLY 8 events for focused funnel analysis
         event_sequence = [
-            "User Sign-Up",
-            "Verify Email",
+            "Sign Up",
+            "Email Verification", 
             "First Login",
             "Profile Setup",
-            "Tutorial Completed",
-            "First Purchase",
+            "Product Browse",
+            "Add to Cart",
+            "Checkout Start",
+            "Purchase Complete",
         ]
 
-        # Generate realistic funnel progression
+        # Generate realistic funnel progression with HIGHER user connectivity
         current_users = set(user_ids)
         base_time = datetime(2024, 1, 1)
 
+        # IMPROVED dropout rates for higher connectivity - more gradual, less aggressive
+        # This ensures more users progress through multiple steps
+        dropout_rates = [0.0, 0.12, 0.15, 0.18, 0.20, 0.22, 0.25, 0.28]
+
         for step_idx, event_name in enumerate(event_sequence):
-            # Calculate dropout rate (realistic funnel)
-            dropout_rates = [0.0, 0.25, 0.20, 0.25, 0.20, 0.22]
             remaining_users = list(current_users)
 
             if step_idx > 0:
-                # Remove some users (dropout)
-                n_remaining = int(len(remaining_users) * (1 - dropout_rates[step_idx]))
-                remaining_users = np.random.choice(remaining_users, n_remaining, replace=False)
+                # More gradual dropout for higher connectivity
+                retention_rate = 1 - dropout_rates[step_idx]
+                n_remaining = int(len(remaining_users) * retention_rate)
+                
+                # Use weighted selection to keep more engaged users
+                # Users with premium subscriptions are more likely to continue
+                user_weights = []
+                for user_id in remaining_users:
+                    weight = 1.0
+                    user_props = user_properties[user_id]
+                    # Premium users have higher retention
+                    if user_props["subscription_plan"] == "premium":
+                        weight = 1.8
+                    elif user_props["subscription_plan"] == "basic":
+                        weight = 1.3
+                    # Younger users are more engaged
+                    if user_props["age_group"] in ["18-25", "26-35"]:
+                        weight *= 1.2
+                    user_weights.append(weight)
+                
+                # Normalize weights
+                user_weights = np.array(user_weights)
+                user_weights = user_weights / user_weights.sum()
+                
+                # Select users with weighted probability
+                remaining_users = np.random.choice(
+                    remaining_users, 
+                    size=n_remaining, 
+                    replace=False, 
+                    p=user_weights
+                )
                 current_users = set(remaining_users)
 
-            # Generate events for remaining users
+            # Generate events for remaining users with realistic timing
             for user_id in remaining_users:
-                # Add some time variance between steps with cohort effect
                 user_props = user_properties[user_id]
                 reg_date = datetime.strptime(user_props["registration_date"], "%Y-%m-%d")
 
-                # Time variance based on registration cohort
-                cohort_factor = (reg_date - base_time).days / 365 + 1
-                hours_offset = np.random.exponential(24 * step_idx * cohort_factor + 1)
-                timestamp = reg_date + timedelta(hours=hours_offset)
+                # More realistic timing progression
+                if step_idx == 0:
+                    # Sign up happens on registration date
+                    timestamp = reg_date
+                elif step_idx == 1:
+                    # Email verification within hours
+                    timestamp = reg_date + timedelta(hours=np.random.exponential(2))
+                elif step_idx == 2:
+                    # First login within 1-2 days
+                    timestamp = reg_date + timedelta(hours=np.random.exponential(12))
+                elif step_idx == 3:
+                    # Profile setup within first week
+                    timestamp = reg_date + timedelta(hours=np.random.exponential(48))
+                else:
+                    # Shopping events can be spread over weeks
+                    base_hours = 24 * step_idx
+                    timestamp = reg_date + timedelta(hours=np.random.exponential(base_hours))
 
-                # Add event properties for segmentation
+                # Enhanced event properties for better segmentation
                 properties = {
                     "platform": np.random.choice(
-                        ["mobile", "desktop", "tablet"], p=[0.6, 0.3, 0.1]
+                        ["mobile", "desktop", "tablet"], p=[0.65, 0.30, 0.05]
                     ),
                     "utm_source": np.random.choice(
                         ["organic", "google_ads", "facebook", "email", "direct"],
-                        p=[0.3, 0.25, 0.2, 0.15, 0.1],
+                        p=[0.35, 0.25, 0.20, 0.12, 0.08],
                     ),
                     "utm_campaign": np.random.choice(
-                        ["summer_sale", "new_user", "retargeting", "brand"],
-                        p=[0.3, 0.3, 0.25, 0.15],
+                        ["new_user_2024", "spring_promo", "retargeting", "brand_awareness"],
+                        p=[0.35, 0.30, 0.25, 0.10],
                     ),
                     "app_version": np.random.choice(
-                        ["2.1.0", "2.2.0", "2.3.0"], p=[0.2, 0.3, 0.5]
+                        ["3.1.0", "3.2.0", "3.3.0"], p=[0.15, 0.35, 0.50]
                     ),
-                    "device_type": np.random.choice(["ios", "android", "web"], p=[0.4, 0.4, 0.2]),
+                    "device_type": np.random.choice(["ios", "android", "web"], p=[0.45, 0.40, 0.15]),
+                    "session_id": f"session_{user_id}_{step_idx}_{np.random.randint(1000, 9999)}",
                 }
 
+                # Add step-specific properties for richer analysis
+                if event_name == "Purchase Complete":
+                    properties.update({
+                        "order_value": float(round(np.random.lognormal(3.5, 0.8), 2)),  # $30-$300 range
+                        "payment_method": str(np.random.choice(
+                            ["credit_card", "paypal", "apple_pay", "google_pay"], 
+                            p=[0.50, 0.25, 0.15, 0.10]
+                        )),
+                        "product_category": str(np.random.choice(
+                            ["electronics", "clothing", "books", "home"], 
+                            p=[0.30, 0.35, 0.20, 0.15]
+                        )),
+                    })
+                elif event_name == "Add to Cart":
+                    properties.update({
+                        "cart_value": float(round(np.random.lognormal(3.2, 0.6), 2)),  # $25-$200 range
+                        "items_count": int(np.random.choice([1, 2, 3, 4, 5], p=[0.40, 0.30, 0.15, 0.10, 0.05])),
+                    })
+                elif event_name == "Product Browse":
+                    properties.update({
+                        "pages_viewed": int(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8], p=[0.25, 0.20, 0.15, 0.12, 0.10, 0.08, 0.06, 0.04])),
+                        "time_spent_minutes": float(round(np.random.exponential(8), 1)),
+                    })
+
                 events_data.append(
                     {
                         "user_id": user_id,
@@ -460,39 +531,47 @@ def get_sample_data(self) -> pd.DataFrame:
                     }
                 )
 
-        # Add some non-funnel events for path analysis
-        additional_events = [
-            "Page View",
-            "Product View",
-            "Search",
-            "Add to Wishlist",
-            "Help Page Visit",
-            "Contact Support",
-            "Settings View",
-            "Logout",
-        ]
-
-        # Generate additional events between funnel steps
-        for user_id in user_ids[:5000]:  # Only for subset to make data manageable
-            n_additional = np.random.poisson(3)  # Average 3 additional events per user
+        # Add cross-step engagement events for users who completed multiple steps
+        # This increases connectivity between events
+        engaged_users = [uid for uid in user_ids if np.random.random() < 0.4]  # 40% of users are "engaged"
+        
+        for user_id in engaged_users:
+            # Add repeat interactions for engaged users
+            user_props = user_properties[user_id]
+            reg_date = datetime.strptime(user_props["registration_date"], "%Y-%m-%d")
+            
+            # Generate 1-3 additional events from the main sequence
+            n_additional = np.random.choice([1, 2, 3], p=[0.5, 0.3, 0.2])
+            
             for _ in range(n_additional):
-                event_name = np.random.choice(additional_events)
-                timestamp = base_time + timedelta(
-                    hours=np.random.uniform(0, 24 * 30)  # Within 30 days
+                # Choose events they're likely to repeat (browse, cart actions)
+                repeat_events = ["Product Browse", "Add to Cart"]
+                event_name = np.random.choice(repeat_events)
+                
+                # Timing should be after their initial journey
+                timestamp = reg_date + timedelta(
+                    days=np.random.uniform(7, 60)  # 1 week to 2 months later
                 )
-
+                
                 properties = {
                     "platform": np.random.choice(
-                        ["mobile", "desktop", "tablet"], p=[0.6, 0.3, 0.1]
-                    ),
-                    "utm_source": np.random.choice(
-                        ["organic", "google_ads", "facebook", "email", "direct"],
-                        p=[0.3, 0.25, 0.2, 0.15, 0.1],
-                    ),
-                    "page_category": np.random.choice(
-                        ["product", "help", "account", "home"], p=[0.4, 0.2, 0.2, 0.2]
+                        ["mobile", "desktop", "tablet"], p=[0.65, 0.30, 0.05]
                     ),
+                    "utm_source": "direct",  # Repeat users often come direct
+                    "session_id": f"session_{user_id}_repeat_{np.random.randint(1000, 9999)}",
+                    "is_repeat_action": True,
                 }
+                
+                if event_name == "Product Browse":
+                    properties.update({
+                        "pages_viewed": int(np.random.choice([2, 3, 4, 5, 6], p=[0.20, 0.25, 0.25, 0.20, 0.10])),
+                        "time_spent_minutes": float(round(np.random.exponential(12), 1)),  # Longer sessions for repeat users
+                    })
+                elif event_name == "Add to Cart":
+                    properties.update({
+                        "cart_value": float(round(np.random.lognormal(3.4, 0.7), 2)),  # Slightly higher for repeat users
+                        "items_count": int(np.random.choice([1, 2, 3, 4], p=[0.30, 0.35, 0.25, 0.10])),
+                    })
 
                 events_data.append(
                     {
@@ -500,7 +579,7 @@ def get_sample_data(self) -> pd.DataFrame:
                         "event_name": event_name,
                         "timestamp": timestamp,
                         "event_properties": json.dumps(properties),
-                        "user_properties": json.dumps(user_properties[user_id]),
+                        "user_properties": json.dumps(user_props),
                     }
                 )
 
diff --git a/path_analyzer.py b/path_analyzer.py
index 30a2da0..f691c74 100644
--- a/path_analyzer.py
+++ b/path_analyzer.py
@@ -620,6 +620,7 @@ def discover_process_mining_structure(
         min_frequency: int = 10,
         include_cycles: bool = True,
         time_window_hours: Optional[int] = None,
+        filter_events: Optional[list[str]] = None,
     ) -> ProcessMiningData:
         """
         Automatic process discovery from user events using advanced algorithms
@@ -629,6 +630,7 @@ def discover_process_mining_structure(
             min_frequency: Minimum frequency to include transition
             include_cycles: Whether to detect cycles and loops
             time_window_hours: Optional time window for process analysis
+            filter_events: Optional list of event names to filter analysis to (e.g., funnel events only)
 
         Returns:
             ProcessMiningData with complete process structure
@@ -658,6 +660,10 @@ def discover_process_mining_structure(
             cutoff_time = datetime.now() - timedelta(hours=time_window_hours)
             events_pl = events_pl.filter(pl.col("timestamp") >= cutoff_time)
 
+        # Filter by specific events if specified (e.g., funnel events only)
+        if filter_events:
+            events_pl = events_pl.filter(pl.col("event_name").is_in(filter_events))
+
         # Build user journeys (optimized) - avoid dictionary conversion when possible
         journey_df = self._build_user_journeys_optimized(events_pl)
 
@@ -765,7 +771,7 @@ def _build_user_journeys(self, events_pl: pl.DataFrame) -> dict[str, list[dict[s
         return journeys
 
     def _discover_activities(
-        self, events_pl: pl.DataFrame, user_journeys: dict[str, list[dict[str, Any]]]
+        self, events_pl: pl.DataFrame, user_journeys: Optional[dict[str, list[dict[str, Any]]]] = None
     ) -> dict[str, dict[str, Any]]:
         """Discover activities and their characteristics - optimized version"""
         # Get optimized journey DataFrame