diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
new file mode 100644
index 0000000..ea6a2f6
--- /dev/null
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,112 @@
+name: Linting
+
+on:
+  push:
+    branches:
+      - main
+      - feat/ci-pipeline-4
+  pull_request:
+    branches:
+      - main
+
+permissions:
+  checks: write
+  contents: write
+
+jobs:
+  lint:
+    name: Lint ${{ matrix.language }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - language: Python
+            setup: python
+          - language: Rust
+            setup: rust
+    steps:
+      - name: Check out Git repository
+        uses: actions/checkout@v4
+
+      # Python Setup
+      - name: Set up Python
+        if: matrix.setup == 'python'
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install Python dependencies
+        if: matrix.setup == 'python'
+        run: |
+          python -m pip install --upgrade pip
+          pip install black flake8 isort
+
+      - name: Auto-format with black
+        if: matrix.setup == 'python'
+        run: black .
+
+      - name: Sort imports with isort
+        if: matrix.setup == 'python'
+        run: isort .
+
+      - name: Lint with flake8
+        if: matrix.setup == 'python'
+        run: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+
+      # Rust Setup
+      - name: Install Rust Toolchain
+        if: matrix.setup == 'rust'
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: clippy, rustfmt
+
+      - name: Auto-format with rustfmt
+        if: matrix.setup == 'rust'
+        run: cargo fmt --all
+        working-directory: ./priors
+
+      - name: Run clippy with auto-fix
+        if: matrix.setup == 'rust'
+        run: cargo clippy --all-targets --all-features --fix --allow-dirty -- -D warnings
+        working-directory: ./priors
+
+  commit:
+    name: Commit all changes
+    needs: lint
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out Git repository
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install black flake8 isort
+
+      - name: Install Rust Toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: clippy, rustfmt
+
+      - name: Run all linters
+        run: |
+          black .
+          isort .
+          cd priors
+          cargo fmt --all
+          cargo clippy --all-targets --all-features --fix --allow-dirty --allow-staged -- -D warnings
+
+      - name: Commit changes
+        uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          commit_message: "style: auto-format with linters"
+          commit_user_name: "github-actions[bot]"
+          commit_user_email: "github-actions[bot]@users.noreply.github.com"
+          skip_dirty_check: false
diff --git a/benchmark.py b/benchmark.py
index b09fb6d..337a085 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -1,20 +1,23 @@
 #!/usr/bin/env python3
 
+import os
 import time
+
 import numpy as np
 import psutil
-import os
 
 try:
     import priors
+
     HAS_PRIORS = True
 except ImportError:
     print("❌ priors not installed")
     exit(1)
 
 try:
-    from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth
     import pandas as pd
+    from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth
+
     HAS_MLXTEND = True
 except ImportError:
     HAS_MLXTEND = False
@@ -22,6 +25,7 @@
 try:
     from efficient_apriori import apriori as efficient_apriori
+
     HAS_EFFICIENT_APRIORI = True
 except ImportError:
     HAS_EFFICIENT_APRIORI = False
 
@@ -33,7 +37,7 @@ def get_memory_mb():
 
 
 def generate_data(num_tx, num_items, avg_size, density):
-    print(f" Generating {num_tx:,} × {num_items} transactions...", end='', flush=True)
+    print(f" Generating {num_tx:,} × {num_items} transactions...", end="", flush=True)
 
     np.random.seed(42)
     data = np.zeros((num_tx, num_items), dtype=np.int32)
@@ -48,7 +52,7 @@ def generate_data(num_tx, num_items, avg_size, density):
 
 
 def benchmark(name, func, data, min_sup):
-    print(f" {name:25s}", end='', flush=True)
+    print(f" {name:25s}", end="", flush=True)
     mem_start = get_memory_mb()
 
     try:
@@ -83,12 +87,12 @@ def test_priors_lazy(data, sup):
     chunk_size = 5000
 
     for i in range(0, data.shape[0], chunk_size):
-        priors.lazy_count_pass(pid, data[i:i+chunk_size])
+        priors.lazy_count_pass(pid, data[i : i + chunk_size])
 
     priors.lazy_finalize_counts(pid, sup)
 
     for i in range(0, data.shape[0], chunk_size):
-        priors.lazy_build_pass(pid, data[i:i+chunk_size])
+        priors.lazy_build_pass(pid, data[i : i + chunk_size])
 
     result = priors.lazy_mine_patterns(pid, sup)
     priors.lazy_cleanup(pid)
@@ -96,7 +100,9 @@ def test_priors_lazy(data, sup):
 
 
 def test_mlxtend(data, sup):
-    df = pd.DataFrame(data.astype(bool), columns=[f"i{i}" for i in range(data.shape[1])])
+    df = pd.DataFrame(
+        data.astype(bool), columns=[f"i{i}" for i in range(data.shape[1])]
+    )
     return mlxtend_fpgrowth(df, min_support=sup, use_colnames=True)
 
 
@@ -106,17 +112,47 @@ def test_efficient_apriori(data, sup):
 
 
 configs = [
-    {'name': '10K × 50', 'tx': 10_000, 'items': 50, 'size': 20, 'dens': 0.7, 'sup': 0.02},
-    {'name': '30K × 80', 'tx': 30_000, 'items': 80, 'size': 35, 'dens': 0.75, 'sup': 0.01},
-    {'name': '60K × 100', 'tx': 60_000, 'items': 100, 'size': 50, 'dens': 0.8, 'sup': 0.008},
-    {'name': '100K × 120', 'tx': 100_000, 'items': 120, 'size': 60, 'dens': 0.85, 'sup': 0.005},
+    {
+        "name": "10K × 50",
+        "tx": 10_000,
+        "items": 50,
+        "size": 20,
+        "dens": 0.7,
+        "sup": 0.02,
+    },
+    {
+        "name": "30K × 80",
+        "tx": 30_000,
+        "items": 80,
+        "size": 35,
+        "dens": 0.75,
+        "sup": 0.01,
+    },
+    {
+        "name": "60K × 100",
+        "tx": 60_000,
+        "items": 100,
+        "size": 50,
+        "dens": 0.8,
+        "sup": 0.008,
+    },
+    {
+        "name": "100K × 120",
+        "tx": 100_000,
+        "items": 120,
+        "size": 60,
+        "dens": 0.85,
+        "sup": 0.005,
+    },
 ]
 
-print("\n" + "="*80)
 print("⚡ FP-Growth Benchmark")
-print("="*80)
-print(f"System RAM: {psutil.virtual_memory().total/1024**3:.1f}GB | Available: 
{psutil.virtual_memory().available/1024**3:.1f}GB") -print("="*80) +print("=" * 80) +print( + f"System RAM: {psutil.virtual_memory().total/1024**3:.1f}GB | Available: {psutil.virtual_memory().available/1024**3:.1f}GB" +) +print("=" * 80) results = [] @@ -124,23 +160,27 @@ def test_efficient_apriori(data, sup): print(f"\n📊 {cfg['name']} (density={cfg['dens']}, support={cfg['sup']})") print("-" * 80) - data = generate_data(cfg['tx'], cfg['items'], cfg['size'], cfg['dens']) + data = generate_data(cfg["tx"], cfg["items"], cfg["size"], cfg["dens"]) - t1, p1, m1 = benchmark('priors (regular)', test_priors_regular, data, cfg['sup']) - t2, p2, m2 = benchmark('priors (lazy)', test_priors_lazy, data, cfg['sup']) + t1, p1, m1 = benchmark("priors (regular)", test_priors_regular, data, cfg["sup"]) + t2, p2, m2 = benchmark("priors (lazy)", test_priors_lazy, data, cfg["sup"]) if HAS_MLXTEND: - t3, p3, m3 = benchmark('mlxtend', test_mlxtend, data, cfg['sup']) + t3, p3, m3 = benchmark("mlxtend", test_mlxtend, data, cfg["sup"]) else: t3, p3, m3 = None, None, None if HAS_EFFICIENT_APRIORI: - t4, p4, m4 = benchmark('efficient-apriori', test_efficient_apriori, data, cfg['sup']) + t4, p4, m4 = benchmark( + "efficient-apriori", test_efficient_apriori, data, cfg["sup"] + ) else: t4, p4, m4 = None, None, None if t1 and t2: - print(f"\n 💡 Lazy vs Regular: {((t2/t1-1)*100):+.1f}% time | {((1-m2/m1)*100):+.1f}% memory savings") + print( + f"\n 💡 Lazy vs Regular: {((t2/t1-1)*100):+.1f}% time | {((1-m2/m1)*100):+.1f}% memory savings" + ) if t1 and t3: print(f" 💡 priors vs mlxtend: {(t3/t1):.1f}x faster") @@ -148,31 +188,41 @@ def test_efficient_apriori(data, sup): if t1 and t4: print(f" 💡 priors vs efficient-apriori: {(t4/t1):.1f}x faster") - results.append({ - 'dataset': cfg['name'], - 'priors_time': t1, - 'lazy_time': t2, - 'mlxtend_time': t3, - 'efficient_time': t4, - 'patterns': p1, - }) - -print("\n" + "="*80) + results.append( + { + "dataset": cfg["name"], + "priors_time": t1, + "lazy_time": t2, + "mlxtend_time": t3, + "efficient_time": t4, + "patterns": p1, + } + ) + +print("\n" + "=" * 80) print("📈 Summary") -print("="*80) +print("=" * 80) for r in results: print(f"\n{r['dataset']}:") - print(f" Patterns: {r['patterns']:,}" if r['patterns'] else " Patterns: N/A") - if r['priors_time']: + print(f" Patterns: {r['patterns']:,}" if r["patterns"] else " Patterns: N/A") + if r["priors_time"]: print(f" priors (regular): {r['priors_time']:.3f}s") - if r['lazy_time']: + if r["lazy_time"]: print(f" priors (lazy): {r['lazy_time']:.3f}s") - if r['mlxtend_time']: - print(f" mlxtend: {r['mlxtend_time']:.3f}s ({r['mlxtend_time']/r['priors_time']:.1f}x slower)" if r['priors_time'] else f" mlxtend: {r['mlxtend_time']:.3f}s") - if r['efficient_time']: - print(f" efficient-apriori: {r['efficient_time']:.3f}s ({r['efficient_time']/r['priors_time']:.1f}x slower)" if r['priors_time'] else f" efficient-apriori: {r['efficient_time']:.3f}s") - -print("\n" + "="*80) + if r["mlxtend_time"]: + print( + f" mlxtend: {r['mlxtend_time']:.3f}s ({r['mlxtend_time']/r['priors_time']:.1f}x slower)" + if r["priors_time"] + else f" mlxtend: {r['mlxtend_time']:.3f}s" + ) + if r["efficient_time"]: + print( + f" efficient-apriori: {r['efficient_time']:.3f}s ({r['efficient_time']/r['priors_time']:.1f}x slower)" + if r["priors_time"] + else f" efficient-apriori: {r['efficient_time']:.3f}s" + ) + +print("\n" + "=" * 80) print("✓ Benchmark Complete") -print("="*80) +print("=" * 80) diff --git a/priors/benchmarks/benchmark_apriori.py 
b/priors/benchmarks/benchmark_apriori.py index cd9013e..e695bd2 100644 --- a/priors/benchmarks/benchmark_apriori.py +++ b/priors/benchmarks/benchmark_apriori.py @@ -1,55 +1,55 @@ """ Benchmark comparison between different Apriori implementations: - priors (our Rust implementation) -- mlxtend +- mlxtend - efficient-apriori """ +import random +import time +from typing import Any, Dict, List, Tuple + import numpy as np import pandas as pd import pytest -import time -from typing import List, Tuple, Any, Dict -import random +from efficient_apriori import apriori as efficient_apriori +from mlxtend.frequent_patterns import apriori as mlxtend_apriori +from mlxtend.preprocessing import TransactionEncoder # Import the implementations import priors -from mlxtend.frequent_patterns import apriori as mlxtend_apriori -from mlxtend.preprocessing import TransactionEncoder -from efficient_apriori import apriori as efficient_apriori def generate_random_transactions( - num_transactions: int, - num_items: int, - avg_transaction_size: int, - seed: int = 42 + num_transactions: int, num_items: int, avg_transaction_size: int, seed: int = 42 ) -> Tuple[np.ndarray, List[List[int]]]: """ Generate random transaction data for benchmarking. - + Returns: tuple: (binary_matrix, transaction_lists) """ random.seed(seed) np.random.seed(seed) - + transactions_lists = [] binary_matrix = np.zeros((num_transactions, num_items), dtype=np.int32) - + for i in range(num_transactions): # Vary transaction size around the average - size = max(1, int(np.random.normal(avg_transaction_size, avg_transaction_size * 0.3))) + size = max( + 1, int(np.random.normal(avg_transaction_size, avg_transaction_size * 0.3)) + ) size = min(size, num_items) - + # Select random items for this transaction items = random.sample(range(num_items), size) transactions_lists.append(items) - + # Set binary matrix for item in items: binary_matrix[i, item] = 1 - + return binary_matrix, transactions_lists @@ -57,20 +57,20 @@ def generate_correlated_transactions( num_transactions: int, num_items: int, correlation_groups: List[List[int]], - seed: int = 42 + seed: int = 42, ) -> Tuple[np.ndarray, List[List[int]]]: """ Generate transaction data with correlated items for more interesting patterns. 
""" random.seed(seed) np.random.seed(seed) - + transactions_lists = [] binary_matrix = np.zeros((num_transactions, num_items), dtype=np.int32) - + for i in range(num_transactions): transaction = [] - + # For each correlation group, decide if we include it for group in correlation_groups: if random.random() < 0.3: # 30% chance to include a group @@ -78,21 +78,27 @@ def generate_correlated_transactions( for item in group: if random.random() < 0.8: # 80% chance for each item in the group transaction.append(item) - + # Add some random items - remaining_items = [i for i in range(num_items) if i not in [item for group in correlation_groups for item in group]] + remaining_items = [ + i + for i in range(num_items) + if i not in [item for group in correlation_groups for item in group] + ] num_random = random.randint(0, 3) if remaining_items: - transaction.extend(random.sample(remaining_items, min(num_random, len(remaining_items)))) - + transaction.extend( + random.sample(remaining_items, min(num_random, len(remaining_items))) + ) + # Remove duplicates and sort transaction = sorted(list(set(transaction))) transactions_lists.append(transaction) - + # Set binary matrix for item in transaction: binary_matrix[i, item] = 1 - + return binary_matrix, transactions_lists @@ -105,19 +111,23 @@ def run_priors_apriori(binary_matrix: np.ndarray, min_support: float) -> List[An def run_mlxtend_apriori(binary_matrix: np.ndarray, min_support: float) -> pd.DataFrame: """Run mlxtend implementation""" # Convert binary matrix to DataFrame - df = pd.DataFrame(binary_matrix, columns=[f'item_{i}' for i in range(binary_matrix.shape[1])]) + df = pd.DataFrame( + binary_matrix, columns=[f"item_{i}" for i in range(binary_matrix.shape[1])] + ) df = df.astype(bool) - + # Run apriori frequent_itemsets = mlxtend_apriori(df, min_support=min_support, use_colnames=True) return frequent_itemsets -def run_efficient_apriori(transaction_lists: List[List[int]], min_support: float, num_transactions: int) -> Tuple[Any, Any]: +def run_efficient_apriori( + transaction_lists: List[List[int]], min_support: float, num_transactions: int +) -> Tuple[Any, Any]: """Run efficient-apriori implementation""" # Convert to the format expected by efficient-apriori transactions_tuples = [tuple(transaction) for transaction in transaction_lists] - + # efficient-apriori expects relative support between 0 and 1 itemsets, rules = efficient_apriori(transactions_tuples, min_support=min_support) return itemsets, rules @@ -125,30 +135,30 @@ def run_efficient_apriori(transaction_lists: List[List[int]], min_support: float class BenchmarkDatasets: """Pre-generated datasets for consistent benchmarking""" - + @staticmethod def small_dataset() -> Tuple[np.ndarray, List[List[int]]]: """Small dataset: 1000 transactions, 20 items""" return generate_random_transactions(1000, 20, 5, seed=42) - - @staticmethod + + @staticmethod def medium_dataset() -> Tuple[np.ndarray, List[List[int]]]: """Medium dataset: 5000 transactions, 50 items""" return generate_random_transactions(5000, 50, 8, seed=42) - + @staticmethod def large_dataset() -> Tuple[np.ndarray, List[List[int]]]: """Large dataset: 10000 transactions, 100 items""" return generate_random_transactions(10000, 100, 12, seed=42) - + @staticmethod def correlated_dataset() -> Tuple[np.ndarray, List[List[int]]]: """Dataset with correlated items for interesting patterns""" correlation_groups = [ - [0, 1, 2], # Group 1: items 0, 1, 2 often appear together - [5, 6, 7, 8], # Group 2: items 5, 6, 7, 8 often appear together - [10, 11], # 
Group 3: items 10, 11 often appear together - [15, 16, 17] # Group 4: items 15, 16, 17 often appear together + [0, 1, 2], # Group 1: items 0, 1, 2 often appear together + [5, 6, 7, 8], # Group 2: items 5, 6, 7, 8 often appear together + [10, 11], # Group 3: items 10, 11 often appear together + [15, 16, 17], # Group 4: items 15, 16, 17 often appear together ] return generate_correlated_transactions(3000, 25, correlation_groups, seed=42) @@ -156,85 +166,107 @@ def correlated_dataset() -> Tuple[np.ndarray, List[List[int]]]: # Benchmark tests class TestBenchmarkSmall: """Benchmarks for small dataset""" - + def setup_method(self): self.binary_matrix, self.transaction_lists = BenchmarkDatasets.small_dataset() self.min_support = 0.05 # 5% support self.num_transactions = len(self.transaction_lists) - + def test_priors_small(self, benchmark): result = benchmark(run_priors_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_mlxtend_small(self, benchmark): result = benchmark(run_mlxtend_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_efficient_apriori_small(self, benchmark): - itemsets, rules = benchmark(run_efficient_apriori, self.transaction_lists, self.min_support, self.num_transactions) + itemsets, rules = benchmark( + run_efficient_apriori, + self.transaction_lists, + self.min_support, + self.num_transactions, + ) assert len(itemsets) > 0 class TestBenchmarkMedium: """Benchmarks for medium dataset""" - + def setup_method(self): self.binary_matrix, self.transaction_lists = BenchmarkDatasets.medium_dataset() self.min_support = 0.03 # 3% support self.num_transactions = len(self.transaction_lists) - + def test_priors_medium(self, benchmark): result = benchmark(run_priors_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_mlxtend_medium(self, benchmark): result = benchmark(run_mlxtend_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_efficient_apriori_medium(self, benchmark): - itemsets, rules = benchmark(run_efficient_apriori, self.transaction_lists, self.min_support, self.num_transactions) + itemsets, rules = benchmark( + run_efficient_apriori, + self.transaction_lists, + self.min_support, + self.num_transactions, + ) assert len(itemsets) > 0 class TestBenchmarkLarge: """Benchmarks for large dataset""" - + def setup_method(self): self.binary_matrix, self.transaction_lists = BenchmarkDatasets.large_dataset() self.min_support = 0.02 # 2% support self.num_transactions = len(self.transaction_lists) - + def test_priors_large(self, benchmark): result = benchmark(run_priors_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_mlxtend_large(self, benchmark): result = benchmark(run_mlxtend_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_efficient_apriori_large(self, benchmark): - itemsets, rules = benchmark(run_efficient_apriori, self.transaction_lists, self.min_support, self.num_transactions) + itemsets, rules = benchmark( + run_efficient_apriori, + self.transaction_lists, + self.min_support, + self.num_transactions, + ) assert len(itemsets) > 0 class TestBenchmarkCorrelated: """Benchmarks for correlated dataset (more interesting patterns)""" - + def setup_method(self): - self.binary_matrix, self.transaction_lists = BenchmarkDatasets.correlated_dataset() + self.binary_matrix, self.transaction_lists = ( + BenchmarkDatasets.correlated_dataset() + ) self.min_support = 0.05 # 5% support self.num_transactions = 
len(self.transaction_lists) - + def test_priors_correlated(self, benchmark): result = benchmark(run_priors_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_mlxtend_correlated(self, benchmark): result = benchmark(run_mlxtend_apriori, self.binary_matrix, self.min_support) assert len(result) > 0 - + def test_efficient_apriori_correlated(self, benchmark): - itemsets, rules = benchmark(run_efficient_apriori, self.transaction_lists, self.min_support, self.num_transactions) + itemsets, rules = benchmark( + run_efficient_apriori, + self.transaction_lists, + self.min_support, + self.num_transactions, + ) assert len(itemsets) > 0 @@ -244,35 +276,41 @@ def test_correctness_comparison(): binary_matrix, transaction_lists = BenchmarkDatasets.small_dataset() min_support = 0.1 num_transactions = len(transaction_lists) - + # Run all implementations priors_result = run_priors_apriori(binary_matrix, min_support) mlxtend_result = run_mlxtend_apriori(binary_matrix, min_support) - efficient_result, _ = run_efficient_apriori(transaction_lists, min_support, num_transactions) - + efficient_result, _ = run_efficient_apriori( + transaction_lists, min_support, num_transactions + ) + # Extract 1-itemsets for comparison priors_1_itemsets = set() if len(priors_result) > 0: level_1 = priors_result[0] for i in range(level_1.shape[0]): priors_1_itemsets.add(tuple([level_1[i, 0]])) - + mlxtend_1_itemsets = set() for _, row in mlxtend_result.iterrows(): - itemset = tuple(sorted([int(item.split('_')[1]) for item in row['itemsets'] if row['itemsets']])) + itemset = tuple( + sorted( + [int(item.split("_")[1]) for item in row["itemsets"] if row["itemsets"]] + ) + ) if len(itemset) == 1: mlxtend_1_itemsets.add(itemset) - + efficient_1_itemsets = set() if 1 in efficient_result: for itemset in efficient_result[1]: efficient_1_itemsets.add(itemset) - + # Check that we have some overlap (exact match might not be possible due to different thresholds/rounding) print(f"Priors 1-itemsets: {len(priors_1_itemsets)}") print(f"MLxtend 1-itemsets: {len(mlxtend_1_itemsets)}") print(f"Efficient-apriori 1-itemsets: {len(efficient_1_itemsets)}") - + # At least some itemsets should be found by all assert len(priors_1_itemsets) > 0 assert len(mlxtend_1_itemsets) > 0 @@ -283,32 +321,34 @@ def test_correctness_comparison(): # Run correctness test test_correctness_comparison() print("Correctness test passed!") - + # Example of manual timing comparison print("\nManual timing comparison:") binary_matrix, transaction_lists = BenchmarkDatasets.medium_dataset() min_support = 0.03 num_transactions = len(transaction_lists) - + # Time priors start = time.time() priors_result = run_priors_apriori(binary_matrix, min_support) priors_time = time.time() - start - + # Time mlxtend start = time.time() mlxtend_result = run_mlxtend_apriori(binary_matrix, min_support) mlxtend_time = time.time() - start - + # Time efficient-apriori start = time.time() - efficient_result, _ = run_efficient_apriori(transaction_lists, min_support, num_transactions) + efficient_result, _ = run_efficient_apriori( + transaction_lists, min_support, num_transactions + ) efficient_time = time.time() - start - + print(f"Priors (Rust): {priors_time:.4f}s") - print(f"MLxtend: {mlxtend_time:.4f}s") + print(f"MLxtend: {mlxtend_time:.4f}s") print(f"Efficient-apriori: {efficient_time:.4f}s") - + if priors_time > 0: print(f"MLxtend vs Priors: {mlxtend_time/priors_time:.2f}x") - print(f"Efficient-apriori vs Priors: {efficient_time/priors_time:.2f}x") \ No newline at end 
of file + print(f"Efficient-apriori vs Priors: {efficient_time/priors_time:.2f}x") diff --git a/priors/benchmarks/benchmark_memory.py b/priors/benchmarks/benchmark_memory.py index 267a465..750c013 100644 --- a/priors/benchmarks/benchmark_memory.py +++ b/priors/benchmarks/benchmark_memory.py @@ -3,13 +3,15 @@ Run with pytest-benchmark or as standalone script. """ +import gc +import time +from typing import Dict, List, Tuple + import numpy as np import pandas as pd import pytest + import priors -import time -import gc -from typing import Dict, List, Tuple try: import psutil @@ -23,6 +25,7 @@ # Fallback for when running without package installation import sys from pathlib import Path + parent_dir = Path(__file__).parent.parent sys.path.insert(0, str(parent_dir)) from utils import count_itemsets, generate_transactions @@ -32,6 +35,7 @@ # Helper Functions # ============================================================================ + def get_memory_usage(): """Get current memory usage in MB.""" if psutil: @@ -56,7 +60,7 @@ def run_streaming_fp_growth(transactions, min_support, chunk_size=None): chunk_size = max(1, len(transactions) // 2) # Check if lazy functions exist - if not hasattr(priors, 'create_lazy_fp_growth'): + if not hasattr(priors, "create_lazy_fp_growth"): pytest.skip("Lazy FP-Growth functions not available") pid = priors.create_lazy_fp_growth() @@ -64,7 +68,7 @@ def run_streaming_fp_growth(transactions, min_support, chunk_size=None): try: # Counting pass for i in range(0, len(transactions), chunk_size): - chunk = transactions[i:i + chunk_size] + chunk = transactions[i : i + chunk_size] priors.lazy_count_pass(pid, chunk) # Finalize counts @@ -72,7 +76,7 @@ def run_streaming_fp_growth(transactions, min_support, chunk_size=None): # Building pass for i in range(0, len(transactions), chunk_size): - chunk = transactions[i:i + chunk_size] + chunk = transactions[i : i + chunk_size] priors.lazy_build_pass(pid, chunk) priors.lazy_finalize_building(pid) @@ -92,16 +96,23 @@ def run_streaming_fp_growth(transactions, min_support, chunk_size=None): memory_benchmark_results = [] -def add_memory_benchmark(dataset_size: str, regular_memory: float, lazy_memory: float, - memory_savings: float, time_overhead: float): +def add_memory_benchmark( + dataset_size: str, + regular_memory: float, + lazy_memory: float, + memory_savings: float, + time_overhead: float, +): """Add memory benchmark result.""" - memory_benchmark_results.append({ - 'Dataset Size': dataset_size, - 'Regular Memory': f"{regular_memory:.0f} MB", - 'Lazy Memory': f"{lazy_memory:.0f} MB", - 'Memory Savings': f"{memory_savings:.1f}x", - 'Time Overhead': f"{time_overhead:.1f}x" - }) + memory_benchmark_results.append( + { + "Dataset Size": dataset_size, + "Regular Memory": f"{regular_memory:.0f} MB", + "Lazy Memory": f"{lazy_memory:.0f} MB", + "Memory Savings": f"{memory_savings:.1f}x", + "Time Overhead": f"{time_overhead:.1f}x", + } + ) class TestMemoryEfficiency: @@ -144,17 +155,20 @@ def test_memory_large_dense(self): class TestMemoryBenchmarks: """Benchmarks for memory efficiency: Lazy vs Regular FP-Growth.""" - @pytest.mark.parametrize("num_trans,num_items,avg_size,min_support", [ - (50000, 100, 15, 0.03), - (200000, 150, 20, 0.02), - (500000, 200, 25, 0.01), - ]) + @pytest.mark.parametrize( + "num_trans,num_items,avg_size,min_support", + [ + (50000, 100, 15, 0.03), + (200000, 150, 20, 0.02), + (500000, 200, 25, 0.01), + ], + ) def test_memory_comparison(self, num_trans, num_items, avg_size, min_support): """Compare memory usage and 
time overhead.""" if not psutil: pytest.skip("psutil not available for memory benchmarking") - if not hasattr(priors, 'create_lazy_fp_growth'): + if not hasattr(priors, "create_lazy_fp_growth"): pytest.skip("Lazy FP-Growth functions not available") transactions = generate_transactions(num_trans, num_items, avg_size, seed=42) @@ -189,8 +203,12 @@ def test_memory_comparison(self, num_trans, num_items, avg_size, min_support): lazy_mem = regular_mem * 0.4 # Estimate 40% of regular memory_savings = regular_mem / lazy_mem - add_memory_benchmark(dataset_size, regular_mem, lazy_mem, memory_savings, time_overhead) - print(f"{dataset_size}: Regular={regular_mem:.0f}MB, Lazy={lazy_mem:.0f}MB, Savings={memory_savings:.1f}x, Overhead={time_overhead:.1f}x") + add_memory_benchmark( + dataset_size, regular_mem, lazy_mem, memory_savings, time_overhead + ) + print( + f"{dataset_size}: Regular={regular_mem:.0f}MB, Lazy={lazy_mem:.0f}MB, Savings={memory_savings:.1f}x, Overhead={time_overhead:.1f}x" + ) def print_memory_benchmarks(): @@ -205,11 +223,15 @@ def print_memory_benchmarks(): df = pd.DataFrame(memory_benchmark_results) # Print formatted table - print(f"{'Dataset Size':<15} {'Regular Memory':<15} {'Lazy Memory':<15} {'Memory Savings':<15} {'Time Overhead':<15}") + print( + f"{'Dataset Size':<15} {'Regular Memory':<15} {'Lazy Memory':<15} {'Memory Savings':<15} {'Time Overhead':<15}" + ) print("-" * 80) for _, row in df.iterrows(): - print(f"{row['Dataset Size']:<15} {row['Regular Memory']:<15} {row['Lazy Memory']:<15} {row['Memory Savings']:<15} {row['Time Overhead']:<15}") + print( + f"{row['Dataset Size']:<15} {row['Regular Memory']:<15} {row['Lazy Memory']:<15} {row['Memory Savings']:<15} {row['Time Overhead']:<15}" + ) print("=" * 80) @@ -217,6 +239,7 @@ def print_memory_benchmarks(): @pytest.fixture(scope="session", autouse=True) def print_summary_on_exit(request): """Print summary table after all benchmarks complete.""" + def finalize(): print_memory_benchmarks() print("=" * 80 + "\n") diff --git a/priors/benchmarks/benchmark_performance.py b/priors/benchmarks/benchmark_performance.py index c365b0e..39734e8 100644 --- a/priors/benchmarks/benchmark_performance.py +++ b/priors/benchmarks/benchmark_performance.py @@ -3,24 +3,26 @@ Run with pytest-benchmark or as standalone script. 
""" +# Import shared utilities +import sys +import time +from pathlib import Path +from typing import Dict, List, Tuple + import numpy as np import pandas as pd import pytest + import priors -import time -from typing import Dict, List, Tuple -# Import shared utilities -import sys -from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from utils import count_itemsets, generate_transactions - # ============================================================================ # Performance Benchmarks # ============================================================================ + class TestPerformanceBenchmarks: """Performance benchmarks comparing different libraries.""" @@ -43,11 +45,15 @@ def test_performance_small(self): mlxtend = pytest.importorskip("mlxtend") from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth - df = pd.DataFrame(transactions.astype(bool), - columns=[f'item_{i}' for i in range(transactions.shape[1])]) + df = pd.DataFrame( + transactions.astype(bool), + columns=[f"item_{i}" for i in range(transactions.shape[1])], + ) start_time = time.time() - mlxtend_result = mlxtend_fpgrowth(df, min_support=min_support, use_colnames=False) + mlxtend_result = mlxtend_fpgrowth( + df, min_support=min_support, use_colnames=False + ) mlxtend_time = time.time() - start_time mlxtend_count = len(mlxtend_result) @@ -110,9 +116,11 @@ def test_scaling_transactions(self): # Report scaling for i in range(1, len(times)): - scale_factor = sizes[i] / sizes[i-1] - time_factor = times[i] / times[i-1] if times[i-1] > 0 else 1 - print(f"Scale {sizes[i-1]} -> {sizes[i]}: {scale_factor}x size took {time_factor:.2f}x time") + scale_factor = sizes[i] / sizes[i - 1] + time_factor = times[i] / times[i - 1] if times[i - 1] > 0 else 1 + print( + f"Scale {sizes[i-1]} -> {sizes[i]}: {scale_factor}x size took {time_factor:.2f}x time" + ) @pytest.mark.slow def test_scaling_items(self): @@ -124,7 +132,9 @@ def test_scaling_items(self): item_counts = [20, 50, 100] for items in item_counts: - transactions = generate_transactions(base_transactions, items, avg_size, seed=42) + transactions = generate_transactions( + base_transactions, items, avg_size, seed=42 + ) start_time = time.time() result = priors.fp_growth(transactions, min_support) @@ -142,31 +152,43 @@ def test_scaling_items(self): speed_benchmark_results = [] -def add_speed_benchmark(dataset_size: str, mlxtend_time: str, efficient_apriori_time: str, - priors_time: float, speedup: str): +def add_speed_benchmark( + dataset_size: str, + mlxtend_time: str, + efficient_apriori_time: str, + priors_time: float, + speedup: str, +): """Add speed benchmark result.""" - speed_benchmark_results.append({ - 'Dataset Size': dataset_size, - 'MLxtend': mlxtend_time, - 'Efficient-Apriori': efficient_apriori_time, - 'Priors FP-Growth': f"{priors_time:.2f}s", - 'Speedup': speedup - }) + speed_benchmark_results.append( + { + "Dataset Size": dataset_size, + "MLxtend": mlxtend_time, + "Efficient-Apriori": efficient_apriori_time, + "Priors FP-Growth": f"{priors_time:.2f}s", + "Speedup": speedup, + } + ) class TestSpeedBenchmarks: """Benchmarks for speed comparison: Regular FP-Growth vs others.""" - @pytest.mark.parametrize("num_trans,num_items,avg_size,min_support", [ - (10000, 50, 10, 0.05), - (50000, 80, 15, 0.03), - (100000, 100, 20, 0.02), - (200000, 100, 25, 0.01), - ]) + @pytest.mark.parametrize( + "num_trans,num_items,avg_size,min_support", + [ + (10000, 50, 10, 0.05), + (50000, 80, 15, 0.03), + (100000, 100, 20, 0.02), + (200000, 100, 25, 
0.01), + ], + ) def test_speed_comparison(self, num_trans, num_items, avg_size, min_support): """Compare execution times across libraries.""" transactions = generate_transactions(num_trans, num_items, avg_size, seed=42) - df = pd.DataFrame(transactions.astype(bool), columns=[f'i{i}' for i in range(num_items)]) + df = pd.DataFrame( + transactions.astype(bool), columns=[f"i{i}" for i in range(num_items)] + ) dataset_size = f"{num_trans//1000}K × {num_items}" # Priors FP-Growth @@ -179,9 +201,13 @@ def test_speed_comparison(self, num_trans, num_items, avg_size, min_support): try: if num_trans <= 50000: # Avoid OOM on larger datasets mlxtend = pytest.importorskip("mlxtend") - from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth + from mlxtend.frequent_patterns import \ + fpgrowth as mlxtend_fpgrowth + start = time.time() - mlxtend_result = mlxtend_fpgrowth(df, min_support=min_support, use_colnames=False) + mlxtend_result = mlxtend_fpgrowth( + df, min_support=min_support, use_colnames=False + ) mlxtend_time = f"{time.time() - start:.2f}s" except (ImportError, MemoryError, Exception): mlxtend_time = "OOM" @@ -190,16 +216,19 @@ def test_speed_comparison(self, num_trans, num_items, avg_size, min_support): ea_time = "N/A" try: import efficient_apriori + transactions_list = [tuple(np.where(row)[0]) for row in transactions] start = time.time() - itemsets, rules = efficient_apriori.apriori(transactions_list, min_support=min_support) + itemsets, rules = efficient_apriori.apriori( + transactions_list, min_support=min_support + ) ea_time = f"{time.time() - start:.2f}s" except (ImportError, Exception): ea_time = "N/A" # Calculate speedup vs Efficient-Apriori if available speedup = "N/A" - if ea_time != "N/A" and isinstance(ea_time, str) and ea_time.endswith('s'): + if ea_time != "N/A" and isinstance(ea_time, str) and ea_time.endswith("s"): try: ea_float = float(ea_time[:-1]) if priors_time > 0: @@ -208,7 +237,9 @@ def test_speed_comparison(self, num_trans, num_items, avg_size, min_support): speedup = "N/A" add_speed_benchmark(dataset_size, mlxtend_time, ea_time, priors_time, speedup) - print(f"{dataset_size}: Priors={priors_time:.2f}s, MLxtend={mlxtend_time}, EA={ea_time}, Speedup={speedup}") + print( + f"{dataset_size}: Priors={priors_time:.2f}s, MLxtend={mlxtend_time}, EA={ea_time}, Speedup={speedup}" + ) def print_speed_benchmarks(): @@ -223,11 +254,15 @@ def print_speed_benchmarks(): df = pd.DataFrame(speed_benchmark_results) # Print formatted table - print(f"{'Dataset Size':<15} {'MLxtend':<12} {'Efficient-Apriori':<18} {'Priors FP-Growth':<18} {'Speedup':<10}") + print( + f"{'Dataset Size':<15} {'MLxtend':<12} {'Efficient-Apriori':<18} {'Priors FP-Growth':<18} {'Speedup':<10}" + ) print("-" * 80) for _, row in df.iterrows(): - print(f"{row['Dataset Size']:<15} {row['MLxtend']:<12} {row['Efficient-Apriori']:<18} {row['Priors FP-Growth']:<18} {row['Speedup']:<10}") + print( + f"{row['Dataset Size']:<15} {row['MLxtend']:<12} {row['Efficient-Apriori']:<18} {row['Priors FP-Growth']:<18} {row['Speedup']:<10}" + ) print("=" * 80) print("MLxtend fails with OOM (Out of Memory) on larger datasets") @@ -236,6 +271,7 @@ def print_speed_benchmarks(): @pytest.fixture(scope="session", autouse=True) def print_summary_on_exit(request): """Print summary table after all benchmarks complete.""" + def finalize(): print_speed_benchmarks() print("=" * 80 + "\n") diff --git a/priors/benchmarks/run_benchmarks.py b/priors/benchmarks/run_benchmarks.py index ca0491e..a833728 100644 --- 
a/priors/benchmarks/run_benchmarks.py +++ b/priors/benchmarks/run_benchmarks.py @@ -3,27 +3,30 @@ Script to run comprehensive benchmarks and generate reports """ +import json import subprocess import sys -import json -import pandas as pd from pathlib import Path +import pandas as pd + def run_pytest_benchmark(): """Run pytest-benchmark and save results""" print("Running pytest-benchmark...") - + # Run benchmark with JSON output cmd = [ - sys.executable, "-m", "pytest", + sys.executable, + "-m", + "pytest", "benchmarks/benchmark_apriori.py", "--benchmark-json=benchmark_results.json", "--benchmark-min-rounds=3", "--benchmark-sort=mean", - "-v" + "-v", ] - + try: result = subprocess.run(cmd, capture_output=True, text=True, check=True) print("Benchmark completed successfully!") @@ -39,86 +42,88 @@ def run_pytest_benchmark(): def analyze_benchmark_results(): """Analyze and pretty-print benchmark results""" results_file = Path("benchmark_results.json") - + if not results_file.exists(): print("No benchmark results found!") return - - with open(results_file, 'r') as f: + + with open(results_file, "r") as f: data = json.load(f) - - benchmarks = data['benchmarks'] - + + benchmarks = data["benchmarks"] + # Create a summary table summary_data = [] - + for bench in benchmarks: - name = bench['name'] - + name = bench["name"] + # Extract implementation and dataset size - if 'priors' in name: - implementation = 'Priors (Rust)' - elif 'mlxtend' in name: - implementation = 'MLxtend' - elif 'efficient_apriori' in name: - implementation = 'Efficient-Apriori' + if "priors" in name: + implementation = "Priors (Rust)" + elif "mlxtend" in name: + implementation = "MLxtend" + elif "efficient_apriori" in name: + implementation = "Efficient-Apriori" else: - implementation = 'Unknown' - - if 'small' in name: - dataset = 'Small (1K tx, 20 items)' - elif 'medium' in name: - dataset = 'Medium (5K tx, 50 items)' - elif 'large' in name: - dataset = 'Large (10K tx, 100 items)' - elif 'correlated' in name: - dataset = 'Correlated (3K tx, 25 items)' + implementation = "Unknown" + + if "small" in name: + dataset = "Small (1K tx, 20 items)" + elif "medium" in name: + dataset = "Medium (5K tx, 50 items)" + elif "large" in name: + dataset = "Large (10K tx, 100 items)" + elif "correlated" in name: + dataset = "Correlated (3K tx, 25 items)" else: - dataset = 'Unknown' - - stats = bench['stats'] - summary_data.append({ - 'Implementation': implementation, - 'Dataset': dataset, - 'Mean (s)': f"{stats['mean']:.6f}", - 'Std (s)': f"{stats['stddev']:.6f}", - 'Min (s)': f"{stats['min']:.6f}", - 'Max (s)': f"{stats['max']:.6f}", - 'Rounds': stats['rounds'] - }) - + dataset = "Unknown" + + stats = bench["stats"] + summary_data.append( + { + "Implementation": implementation, + "Dataset": dataset, + "Mean (s)": f"{stats['mean']:.6f}", + "Std (s)": f"{stats['stddev']:.6f}", + "Min (s)": f"{stats['min']:.6f}", + "Max (s)": f"{stats['max']:.6f}", + "Rounds": stats["rounds"], + } + ) + # Create DataFrame and display df = pd.DataFrame(summary_data) - - print("\n" + "="*80) + + print("\n" + "=" * 80) print("BENCHMARK RESULTS SUMMARY") - print("="*80) + print("=" * 80) print(df.to_string(index=False)) - + # Calculate speedups - print("\n" + "="*80) + print("\n" + "=" * 80) print("SPEEDUP ANALYSIS (relative to Priors)") - print("="*80) - - datasets = df['Dataset'].unique() + print("=" * 80) + + datasets = df["Dataset"].unique() for dataset in datasets: - dataset_df = df[df['Dataset'] == dataset].copy() - + dataset_df = df[df["Dataset"] == 
dataset].copy() + # Find Priors baseline - priors_row = dataset_df[dataset_df['Implementation'] == 'Priors (Rust)'] + priors_row = dataset_df[dataset_df["Implementation"] == "Priors (Rust)"] if len(priors_row) == 0: continue - - priors_time = float(priors_row['Mean (s)'].iloc[0]) - + + priors_time = float(priors_row["Mean (s)"].iloc[0]) + print(f"\nDataset: {dataset}") print("-" * 50) - + for _, row in dataset_df.iterrows(): - impl = row['Implementation'] - time_val = float(row['Mean (s)']) - - if impl == 'Priors (Rust)': + impl = row["Implementation"] + time_val = float(row["Mean (s)"]) + + if impl == "Priors (Rust)": speedup = 1.0 status = "(baseline)" else: @@ -127,24 +132,25 @@ def analyze_benchmark_results(): status = f"({speedup:.2f}x faster than Priors)" else: status = f"({speedup:.2f}x slower than Priors)" - + print(f"{impl:20s}: {time_val:.6f}s {status}") def main(): """Main function""" print("Starting comprehensive Apriori benchmark...") - + # First run the correctness test print("Running correctness validation...") try: from benchmark_apriori import test_correctness_comparison + test_correctness_comparison() print("✓ Correctness validation passed!") except Exception as e: print(f"✗ Correctness validation failed: {e}") return - + # Run benchmarks if run_pytest_benchmark(): analyze_benchmark_results() @@ -153,4 +159,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/priors/benchmarks/simple_benchmark.py b/priors/benchmarks/simple_benchmark.py index 2e82711..4c8637c 100644 --- a/priors/benchmarks/simple_benchmark.py +++ b/priors/benchmarks/simple_benchmark.py @@ -3,63 +3,64 @@ Simple manual benchmark to test all implementations quickly """ -import numpy as np import time from typing import List -# Import implementations -import priors -from mlxtend.frequent_patterns import apriori as mlxtend_apriori -from mlxtend.preprocessing import TransactionEncoder +import numpy as np import pandas as pd from efficient_apriori import apriori as efficient_apriori +from mlxtend.frequent_patterns import apriori as mlxtend_apriori +from mlxtend.preprocessing import TransactionEncoder + +# Import implementations +import priors def create_test_data(num_transactions=1000, num_items=20, avg_size=5): """Create simple test data""" np.random.seed(42) - + # Create binary matrix binary_matrix = np.zeros((num_transactions, num_items), dtype=np.int32) transaction_lists = [] - + for i in range(num_transactions): # Random transaction size size = max(1, min(num_items, int(np.random.normal(avg_size, 2)))) - + # Random items items = np.random.choice(num_items, size, replace=False) - + # Fill binary matrix binary_matrix[i, items] = 1 - + # Store as list transaction_lists.append(items.tolist()) - + return binary_matrix, transaction_lists def benchmark_implementation(name, func, *args, **kwargs): """Benchmark a single implementation""" print(f"\nTesting {name}...") - + try: start_time = time.time() result = func(*args, **kwargs) end_time = time.time() - + duration = end_time - start_time print(f"✓ {name}: {duration:.4f}s") - + # Try to get result size - if hasattr(result, '__len__'): + if hasattr(result, "__len__"): print(f" Result size: {len(result)}") elif isinstance(result, tuple) and len(result) >= 1: - if hasattr(result[0], '__len__'): + if hasattr(result[0], "__len__"): print(f" Result size: {len(result[0])}") - + return duration, result - + except Exception as e: print(f"✗ {name} failed: {e}") return None, None @@ -72,7 +73,9 @@ def run_priors(binary_matrix, 
min_support): def run_mlxtend(binary_matrix, min_support): """Run MLxtend implementation""" - df = pd.DataFrame(binary_matrix, columns=[f'item_{i}' for i in range(binary_matrix.shape[1])]) + df = pd.DataFrame( + binary_matrix, columns=[f"item_{i}" for i in range(binary_matrix.shape[1])] + ) df = df.astype(bool) return mlxtend_apriori(df, min_support=min_support, use_colnames=True) @@ -89,59 +92,59 @@ def main(): """Run simple benchmark""" print("Simple Apriori Benchmark") print("=" * 50) - + # Create test data binary_matrix, transaction_lists = create_test_data(1000, 20, 5) min_support = 0.05 # 5% num_transactions = len(transaction_lists) - + print(f"Dataset: {num_transactions} transactions, {binary_matrix.shape[1]} items") - print(f"Min support: {min_support} ({min_support * num_transactions:.0f} transactions)") - + print( + f"Min support: {min_support} ({min_support * num_transactions:.0f} transactions)" + ) + # Benchmark all implementations results = {} - + # Our Rust implementation duration, result = benchmark_implementation( - "Priors (Rust)", - run_priors, - binary_matrix, min_support + "Priors (Rust)", run_priors, binary_matrix, min_support ) if duration is not None: - results['priors'] = duration - + results["priors"] = duration + # MLxtend duration, result = benchmark_implementation( - "MLxtend", - run_mlxtend, - binary_matrix, min_support + "MLxtend", run_mlxtend, binary_matrix, min_support ) if duration is not None: - results['mlxtend'] = duration - + results["mlxtend"] = duration + # Efficient-apriori duration, result = benchmark_implementation( "Efficient-Apriori", run_efficient_apriori, - transaction_lists, min_support, num_transactions + transaction_lists, + min_support, + num_transactions, ) if duration is not None: - results['efficient_apriori'] = duration - + results["efficient_apriori"] = duration + # Summary print("\n" + "=" * 50) print("SUMMARY") print("=" * 50) - + if results: fastest = min(results.items(), key=lambda x: x[1]) print(f"Fastest: {fastest[0]} ({fastest[1]:.4f}s)") - - if 'priors' in results: - baseline = results['priors'] + + if "priors" in results: + baseline = results["priors"] print(f"\nSpeedup vs Priors (Rust):") for name, duration in results.items(): - if name != 'priors': + if name != "priors": speedup = duration / baseline if speedup > 1: print(f" {name}: {speedup:.2f}x slower") @@ -152,4 +155,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/priors/run_lib.py b/priors/run_lib.py index 0517e1f..bd528a8 100644 --- a/priors/run_lib.py +++ b/priors/run_lib.py @@ -3,13 +3,11 @@ import shap import sklearn from sklearn.tree import DecisionTreeClassifier - - from treeshap import score -if __name__ == '__main__': +if __name__ == "__main__": X, y = shap.datasets.adult(n_points=1000) - clf = DecisionTreeClassifier(max_depth=3, random_state=0) + clf = DecisionTreeClassifier(max_depth=3, random_state=0) clf.fit(X, y) n_nodes = clf.tree_.node_count @@ -20,10 +18,9 @@ weighted_node_samples = clf.tree_.weighted_n_node_samples values = clf.tree_.value - result = score(np.array(X), children_left, children_right, - feature, threshold, - values - ) + result = score( + np.array(X), children_left, children_right, feature, threshold, values + ) result_predict_proba = clf.predict_proba(X) np.testing.assert_allclose(np.squeeze(result), result_predict_proba, rtol=1e-9) print("In Python: ", result) diff --git a/priors/src/fp/growth/mining.rs b/priors/src/fp/growth/mining.rs index e80ac15..bea298d 100644 --- 
a/priors/src/fp/growth/mining.rs +++ b/priors/src/fp/growth/mining.rs @@ -1,13 +1,19 @@ use super::builder::{build_conditional_fp_tree, build_fp_tree, get_conditional_frequent_items}; -use crate::fp::utils::FrequentLevel; use super::tree::FPTree; +use crate::fp::utils::FrequentLevel; use numpy::ndarray::ArrayView2; use rayon::prelude::*; pub fn fp_growth_algorithm(transactions: ArrayView2, min_support: f64) -> Vec { let num_transactions = transactions.shape()[0]; let (fp_tree, frequent_items) = build_fp_tree(transactions, min_support); - fp_growth_recursive(&fp_tree, &frequent_items, &[], min_support, num_transactions) + fp_growth_recursive( + &fp_tree, + &frequent_items, + &[], + min_support, + num_transactions, + ) } fn fp_growth_recursive( @@ -32,7 +38,9 @@ fn fp_growth_recursive( .par_iter() .rev() .filter_map(|&item| { - let support = fp_tree.header_table.get(&item)? + let support = fp_tree + .header_table + .get(&item)? .iter() .map(|&idx| fp_tree.nodes[idx].count) .sum::(); @@ -52,7 +60,13 @@ fn fp_growth_recursive( let cond_items = get_conditional_frequent_items(&cond_tree, min_count); if !cond_items.is_empty() { - result.extend(fp_growth_recursive(&cond_tree, &cond_items, &new_pattern, min_support, num_transactions)); + result.extend(fp_growth_recursive( + &cond_tree, + &cond_items, + &new_pattern, + min_support, + num_transactions, + )); } } @@ -71,15 +85,24 @@ fn fp_growth_recursive( // Adjust offsets when merging let current_len = merged[size - 1].storage.items.len(); - merged[size - 1].storage.items.extend_from_slice(&level.storage.items); + merged[size - 1] + .storage + .items + .extend_from_slice(&level.storage.items); // Adjust each offset by the current length of the merged items array for (start, len) in level.storage.offsets { - merged[size - 1].storage.offsets.push((current_len + start, len)); + merged[size - 1] + .storage + .offsets + .push((current_len + start, len)); } // Copy support values - merged[size - 1].storage.supports.extend_from_slice(&level.storage.supports); + merged[size - 1] + .storage + .supports + .extend_from_slice(&level.storage.supports); } } merged diff --git a/priors/src/fp/growth/tree.rs b/priors/src/fp/growth/tree.rs index cfe69e6..d4347af 100644 --- a/priors/src/fp/growth/tree.rs +++ b/priors/src/fp/growth/tree.rs @@ -17,11 +17,27 @@ pub struct FPTree { impl FPNode { pub fn new_root() -> Self { - Self { item: None, count: 0, parent: None, children: HashMap::new() } + Self { + item: None, + count: 0, + parent: None, + children: HashMap::new(), + } } pub fn new_item(item: usize, count: usize, parent: Option) -> Self { - Self { item: Some(item), count, parent, children: HashMap::new() } + Self { + item: Some(item), + count, + parent, + children: HashMap::new(), + } + } +} + +impl Default for FPTree { + fn default() -> Self { + Self::new() } } @@ -29,7 +45,11 @@ impl FPTree { pub fn new() -> Self { let mut nodes = Vec::new(); nodes.push(FPNode::new_root()); - Self { nodes, header_table: HashMap::new(), root_index: 0 } + Self { + nodes, + header_table: HashMap::new(), + root_index: 0, + } } pub fn insert_transaction(&mut self, transaction: &[usize], counts: &[usize]) { @@ -41,9 +61,10 @@ impl FPTree { current_index = child_index; } else { let new_index = self.nodes.len(); - self.nodes.push(FPNode::new_item(item, count, Some(current_index))); + self.nodes + .push(FPNode::new_item(item, count, Some(current_index))); self.nodes[current_index].children.insert(item, new_index); - self.header_table.entry(item).or_insert_with(Vec::new).push(new_index); + 
self.header_table.entry(item).or_default().push(new_index); current_index = new_index; } } @@ -51,20 +72,23 @@ impl FPTree { pub fn get_prefix_paths(&self, item: usize) -> Vec<(Vec, usize)> { self.header_table.get(&item).map_or(Vec::new(), |nodes| { - nodes.iter().filter_map(|&idx| { - let mut path = Vec::new(); - let mut current = self.nodes[idx].parent; - - while let Some(i) = current { - if let Some(item) = self.nodes[i].item { - path.push(item); + nodes + .iter() + .filter_map(|&idx| { + let mut path = Vec::new(); + let mut current = self.nodes[idx].parent; + + while let Some(i) = current { + if let Some(item) = self.nodes[i].item { + path.push(item); + } + current = self.nodes[i].parent; } - current = self.nodes[i].parent; - } - path.reverse(); - (!path.is_empty()).then_some((path, self.nodes[idx].count)) - }).collect() + path.reverse(); + (!path.is_empty()).then_some((path, self.nodes[idx].count)) + }) + .collect() }) } diff --git a/priors/src/fp/mod.rs b/priors/src/fp/mod.rs index cdd2e94..34bd583 100644 --- a/priors/src/fp/mod.rs +++ b/priors/src/fp/mod.rs @@ -3,6 +3,8 @@ pub mod streaming_growth; pub mod utils; pub use growth::fp_growth_algorithm; -pub use streaming_growth::{StreamingState, count_pass, finalize_counts, build_pass, finalize_building, mine_patterns}; -pub use utils::{FrequentLevel, ItemsetStorage}; pub use growth::{FPNode, FPTree}; +pub use streaming_growth::{ + StreamingState, build_pass, count_pass, finalize_building, finalize_counts, mine_patterns, +}; +pub use utils::{FrequentLevel, ItemsetStorage}; diff --git a/priors/src/fp/streaming_growth/mod.rs b/priors/src/fp/streaming_growth/mod.rs index 13f0449..fb64562 100644 --- a/priors/src/fp/streaming_growth/mod.rs +++ b/priors/src/fp/streaming_growth/mod.rs @@ -1,5 +1,5 @@ -pub mod state; pub mod processor; +pub mod state; -pub use state::{StreamingState, ProcessingPhase}; -pub use processor::{count_pass, finalize_counts, build_pass, finalize_building, mine_patterns}; +pub use processor::{build_pass, count_pass, finalize_building, finalize_counts, mine_patterns}; +pub use state::{ProcessingPhase, StreamingState}; diff --git a/priors/src/fp/streaming_growth/processor.rs b/priors/src/fp/streaming_growth/processor.rs index 7839730..00dd835 100644 --- a/priors/src/fp/streaming_growth/processor.rs +++ b/priors/src/fp/streaming_growth/processor.rs @@ -1,7 +1,7 @@ -use super::state::{StreamingState, ProcessingPhase}; use super::super::growth::builder::{build_conditional_fp_tree, get_conditional_frequent_items}; use super::super::growth::tree::FPTree; use super::super::utils::FrequentLevel; +use super::state::{ProcessingPhase, StreamingState}; use numpy::ndarray::ArrayView2; use rayon::prelude::*; @@ -46,22 +46,20 @@ pub fn build_pass(state: &mut StreamingState, transactions: ArrayView2) -> let transaction_list = matrix_to_transactions(transactions); // Build a map of item ranks to avoid borrowing conflicts - let item_ranks: std::collections::HashMap = state.frequent_items + let item_ranks: std::collections::HashMap = state + .frequent_items .iter() .enumerate() .map(|(rank, &item)| (item, rank)) .collect(); - let fp_tree = state.fp_tree.as_mut() - .ok_or("FP-Tree not initialized")?; + let fp_tree = state.fp_tree.as_mut().ok_or("FP-Tree not initialized")?; for transaction in transaction_list { // Filter and sort transaction by frequency order let mut filtered: Vec<(usize, usize)> = transaction .iter() - .filter_map(|&item| { - item_ranks.get(&item).map(|&rank| (item, rank)) - }) + .filter_map(|&item| 
item_ranks.get(&item).map(|&rank| (item, rank))) .collect(); if filtered.is_empty() { @@ -84,20 +82,22 @@ pub fn finalize_building(state: &mut StreamingState) -> Result<(), String> { } /// Mine patterns from the built FP-Tree -pub fn mine_patterns(state: &StreamingState, min_support: f64) -> Result, String> { +pub fn mine_patterns( + state: &StreamingState, + min_support: f64, +) -> Result, String> { if state.phase != ProcessingPhase::ReadyToMine { return Err(format!("Cannot mine in phase {:?}", state.phase)); } - let fp_tree = state.fp_tree.as_ref() - .ok_or("FP-Tree not initialized")?; + let fp_tree = state.fp_tree.as_ref().ok_or("FP-Tree not initialized")?; let result = fp_growth_recursive( fp_tree, &state.frequent_items, &[], min_support, - state.num_transactions + state.num_transactions, ); Ok(result) @@ -126,7 +126,9 @@ fn fp_growth_recursive( .par_iter() .rev() .filter_map(|&item| { - let support = fp_tree.header_table.get(&item)? + let support = fp_tree + .header_table + .get(&item)? .iter() .map(|&idx| fp_tree.nodes[idx].count) .sum::(); @@ -146,7 +148,13 @@ fn fp_growth_recursive( let cond_items = get_conditional_frequent_items(&cond_tree, min_count); if !cond_items.is_empty() { - result.extend(fp_growth_recursive(&cond_tree, &cond_items, &new_pattern, min_support, num_transactions)); + result.extend(fp_growth_recursive( + &cond_tree, + &cond_items, + &new_pattern, + min_support, + num_transactions, + )); } } @@ -165,11 +173,17 @@ fn fp_growth_recursive( // Adjust offsets when merging let current_len = merged[size - 1].storage.items.len(); - merged[size - 1].storage.items.extend_from_slice(&level.storage.items); + merged[size - 1] + .storage + .items + .extend_from_slice(&level.storage.items); // Adjust each offset by the current length of the merged items array for (start, len) in level.storage.offsets { - merged[size - 1].storage.offsets.push((current_len + start, len)); + merged[size - 1] + .storage + .offsets + .push((current_len + start, len)); } } } diff --git a/priors/src/fp/streaming_growth/state.rs b/priors/src/fp/streaming_growth/state.rs index dd51fa6..9d0f56e 100644 --- a/priors/src/fp/streaming_growth/state.rs +++ b/priors/src/fp/streaming_growth/state.rs @@ -1,5 +1,5 @@ -use std::collections::HashMap; use super::super::growth::tree::FPTree; +use std::collections::HashMap; /// State for streaming FP-Growth processing #[derive(Debug)] @@ -26,6 +26,12 @@ pub enum ProcessingPhase { ReadyToMine, } +impl Default for StreamingState { + fn default() -> Self { + Self::new() + } +} + impl StreamingState { pub fn new() -> Self { Self { @@ -58,7 +64,8 @@ impl StreamingState { let min_count = (min_support * self.num_transactions as f64) as usize; // Filter frequent items and sort by frequency (descending) - let mut frequent: Vec<(usize, usize)> = self.item_counts + let mut frequent: Vec<(usize, usize)> = self + .item_counts .iter() .filter(|&(_, &count)| count >= min_count) .map(|(&item, &count)| (item, count)) @@ -95,7 +102,10 @@ impl StreamingState { /// Complete building phase pub fn finalize_building(&mut self) -> Result<(), String> { if self.phase != ProcessingPhase::Building { - return Err(format!("Cannot finalize building in phase {:?}", self.phase)); + return Err(format!( + "Cannot finalize building in phase {:?}", + self.phase + )); } self.phase = ProcessingPhase::ReadyToMine; diff --git a/priors/src/fp/utils/storage.rs b/priors/src/fp/utils/storage.rs index 32b60db..c949709 100644 --- a/priors/src/fp/utils/storage.rs +++ b/priors/src/fp/utils/storage.rs @@ -13,7 +13,11 
@@ pub struct FrequentLevel { impl ItemsetStorage { pub(crate) fn new() -> Self { - Self { items: Vec::new(), offsets: Vec::new(), supports: Vec::new() } + Self { + items: Vec::new(), + offsets: Vec::new(), + supports: Vec::new(), + } } pub(crate) fn add_itemset(&mut self, items: Vec) { @@ -41,7 +45,10 @@ impl ItemsetStorage { impl FrequentLevel { pub fn new(itemset_size: usize) -> Self { - Self { storage: ItemsetStorage::new(), itemset_size } + Self { + storage: ItemsetStorage::new(), + itemset_size, + } } pub fn add_itemset(&mut self, items: Vec) -> usize { diff --git a/priors/src/lib.rs b/priors/src/lib.rs index 3dc9085..98db08f 100644 --- a/priors/src/lib.rs +++ b/priors/src/lib.rs @@ -1,19 +1,20 @@ use numpy::ndarray::{Array2, s}; use numpy::{IntoPyArray, PyArray2, PyReadonlyArray2}; -use pyo3::{Bound, PyResult, Python, pymodule, types::PyModule}; -use std::sync::Mutex; use once_cell::sync::Lazy; +use pyo3::{Bound, PyResult, Python, pymodule, types::PyModule}; use std::collections::HashMap; +use std::sync::Mutex; pub mod fp; use fp::fp_growth_algorithm; -use fp::{StreamingState, count_pass, finalize_counts as fp_finalize_counts, - build_pass, finalize_building as fp_finalize_building, mine_patterns}; +use fp::{ + StreamingState, build_pass, count_pass, finalize_building as fp_finalize_building, + finalize_counts as fp_finalize_counts, mine_patterns, +}; // Global storage for streaming processors -static PROCESSORS: Lazy>> = Lazy::new(|| { - Mutex::new(HashMap::new()) -}); +static PROCESSORS: Lazy>> = + Lazy::new(|| Mutex::new(HashMap::new())); static NEXT_PID: Lazy> = Lazy::new(|| Mutex::new(0)); #[pymodule] @@ -60,13 +61,15 @@ fn priors<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> { #[pyfn(m)] #[pyo3(name = "create_lazy_fp_growth")] fn create_lazy_fp_growth_py() -> PyResult { - let mut pid_lock = NEXT_PID.lock() + let mut pid_lock = NEXT_PID + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; let pid = *pid_lock; *pid_lock += 1; drop(pid_lock); - let mut processors = PROCESSORS.lock() + let mut processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; processors.insert(pid, StreamingState::new()); @@ -75,56 +78,51 @@ fn priors<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> { #[pyfn(m)] #[pyo3(name = "lazy_count_pass")] - fn lazy_count_pass_py( - pid: usize, - transactions: PyReadonlyArray2, - ) -> PyResult<()> { + fn lazy_count_pass_py(pid: usize, transactions: PyReadonlyArray2) -> PyResult<()> { let transactions_view = transactions.as_array(); - let mut processors = PROCESSORS.lock() + let mut processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; - let state = processors.get_mut(&pid) + let state = processors + .get_mut(&pid) .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("Invalid processor ID"))?; - count_pass(state, transactions_view) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + count_pass(state, transactions_view).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; Ok(()) } #[pyfn(m)] #[pyo3(name = "lazy_finalize_counts")] - fn lazy_finalize_counts_py( - pid: usize, - min_support: f64, - ) -> PyResult<()> { - let mut processors = PROCESSORS.lock() + fn lazy_finalize_counts_py(pid: usize, min_support: f64) -> PyResult<()> { + let mut processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; - let state = 
processors.get_mut(&pid) + let state = processors + .get_mut(&pid) .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("Invalid processor ID"))?; fp_finalize_counts(state, min_support) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + .map_err(pyo3::exceptions::PyRuntimeError::new_err)?; Ok(()) } #[pyfn(m)] #[pyo3(name = "lazy_build_pass")] - fn lazy_build_pass_py( - pid: usize, - transactions: PyReadonlyArray2, - ) -> PyResult<()> { + fn lazy_build_pass_py(pid: usize, transactions: PyReadonlyArray2) -> PyResult<()> { let transactions_view = transactions.as_array(); - let mut processors = PROCESSORS.lock() + let mut processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; - let state = processors.get_mut(&pid) + let state = processors + .get_mut(&pid) .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("Invalid processor ID"))?; - build_pass(state, transactions_view) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + build_pass(state, transactions_view).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; Ok(()) } @@ -132,14 +130,15 @@ fn priors<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> { #[pyfn(m)] #[pyo3(name = "lazy_finalize_building")] fn lazy_finalize_building_py(pid: usize) -> PyResult<()> { - let mut processors = PROCESSORS.lock() + let mut processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; - let state = processors.get_mut(&pid) + let state = processors + .get_mut(&pid) .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("Invalid processor ID"))?; - fp_finalize_building(state) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + fp_finalize_building(state).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; Ok(()) } @@ -151,14 +150,16 @@ fn priors<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> { pid: usize, min_support: f64, ) -> PyResult>>> { - let processors = PROCESSORS.lock() + let processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; - let state = processors.get(&pid) + let state = processors + .get(&pid) .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("Invalid processor ID"))?; - let frequent_levels = mine_patterns(state, min_support) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + let frequent_levels = + mine_patterns(state, min_support).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; let mut result = Vec::new(); @@ -189,10 +190,12 @@ fn priors<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> { #[pyfn(m)] #[pyo3(name = "lazy_cleanup")] fn lazy_cleanup_py(pid: usize) -> PyResult<()> { - let mut processors = PROCESSORS.lock() + let mut processors = PROCESSORS + .lock() .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Lock error: {}", e)))?; - processors.remove(&pid) + processors + .remove(&pid) .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("Invalid processor ID"))?; Ok(()) @@ -211,9 +214,7 @@ fn priors<'py>(m: &Bound<'py, PyModule>) -> PyResult<()> { let num_transactions = transactions_view.nrows(); // Default chunk size: max(1000, num_transactions / 10) - let chunk_size = chunk_size.unwrap_or_else(|| { - std::cmp::max(1000, num_transactions / 10) - }); + let chunk_size = chunk_size.unwrap_or_else(|| std::cmp::max(1000, num_transactions / 10)); // Create a streaming state let mut state = StreamingState::new(); @@ -223,30 +224,27 @@ fn priors<'py>(m: &Bound<'py, PyModule>) 
-> PyResult<()> { let end = std::cmp::min(start + chunk_size, num_transactions); let chunk = transactions_view.slice(s![start..end, ..]); - count_pass(&mut state, chunk) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + count_pass(&mut state, chunk).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; } // Phase 2: Finalize counts fp_finalize_counts(&mut state, min_support) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + .map_err(pyo3::exceptions::PyRuntimeError::new_err)?; // Phase 3: Build pass (process in chunks) for start in (0..num_transactions).step_by(chunk_size) { let end = std::cmp::min(start + chunk_size, num_transactions); let chunk = transactions_view.slice(s![start..end, ..]); - build_pass(&mut state, chunk) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + build_pass(&mut state, chunk).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; } // Phase 4: Finalize building - fp_finalize_building(&mut state) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + fp_finalize_building(&mut state).map_err(pyo3::exceptions::PyRuntimeError::new_err)?; // Phase 5: Mine patterns let frequent_levels = mine_patterns(&state, min_support) - .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e))?; + .map_err(pyo3::exceptions::PyRuntimeError::new_err)?; // Convert to Python arrays (same as fp_growth) let mut result = Vec::new(); diff --git a/priors/tests/conftest.py b/priors/tests/conftest.py index de34555..e927df6 100644 --- a/priors/tests/conftest.py +++ b/priors/tests/conftest.py @@ -11,22 +11,17 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) # Re-export all utilities from utils module -from utils import ( - count_itemsets, - generate_transactions, - generate_all_ones_transactions, - extract_itemsets_from_mlxtend, - extract_itemsets_from_efficient_apriori, - extract_itemsets_from_priors, - extract_itemsets_from_result, -) +from utils import (count_itemsets, extract_itemsets_from_efficient_apriori, + extract_itemsets_from_mlxtend, extract_itemsets_from_priors, + extract_itemsets_from_result, + generate_all_ones_transactions, generate_transactions) __all__ = [ - 'count_itemsets', - 'generate_transactions', - 'generate_all_ones_transactions', - 'extract_itemsets_from_mlxtend', - 'extract_itemsets_from_efficient_apriori', - 'extract_itemsets_from_priors', - 'extract_itemsets_from_result', + "count_itemsets", + "generate_transactions", + "generate_all_ones_transactions", + "extract_itemsets_from_mlxtend", + "extract_itemsets_from_efficient_apriori", + "extract_itemsets_from_priors", + "extract_itemsets_from_result", ] diff --git a/priors/tests/test_benchmarks.py b/priors/tests/test_benchmarks.py index c279b97..8508930 100644 --- a/priors/tests/test_benchmarks.py +++ b/priors/tests/test_benchmarks.py @@ -3,12 +3,14 @@ Tests speed, memory efficiency, and scalability up to 500K transactions. 
""" +import gc +import time +from typing import Dict, List, Tuple + import numpy as np import pytest + import priors -import time -import gc -from typing import Dict, List, Tuple try: import psutil @@ -22,6 +24,7 @@ # Fallback for when running without package installation import sys from pathlib import Path + parent_dir = Path(__file__).parent.parent sys.path.insert(0, str(parent_dir)) from utils import count_itemsets, generate_transactions @@ -30,6 +33,7 @@ # Helper Functions # ============================================================================ + def get_memory_usage(): """Get current memory usage in MB.""" if psutil: @@ -37,6 +41,7 @@ def get_memory_usage(): return process.memory_info().rss / 1024 / 1024 return 0 + def format_memory(mb): """Format memory in MB to human readable format.""" if mb >= 1024: @@ -44,6 +49,7 @@ def format_memory(mb): else: return f"{mb:.0f} MB" + def format_time(seconds): """Format time in seconds to human readable format.""" if seconds >= 60: @@ -51,6 +57,7 @@ def format_time(seconds): else: return f"{seconds:.2f}s" + # ============================================================================ # Benchmark Results Storage # ============================================================================ @@ -58,212 +65,261 @@ def format_time(seconds): speed_results = [] memory_results = [] -def add_speed_result(dataset_size: str, num_trans: int, num_items: int, - itemsets_found: int, execution_time: float, - throughput: float, notes: str = ""): + +def add_speed_result( + dataset_size: str, + num_trans: int, + num_items: int, + itemsets_found: int, + execution_time: float, + throughput: float, + notes: str = "", +): """Add speed benchmark result.""" - speed_results.append({ - 'Dataset Size': dataset_size, - 'Transactions': f"{num_trans:,}", - 'Items': num_items, - 'Itemsets Found': f"{itemsets_found:,}", - 'Time': format_time(execution_time), - 'Throughput': f"{throughput:,.0f} trans/s", - 'Notes': notes - }) - -def add_memory_result(dataset_size: str, peak_memory: float, - dataset_memory: float, efficiency: float, - notes: str = ""): + speed_results.append( + { + "Dataset Size": dataset_size, + "Transactions": f"{num_trans:,}", + "Items": num_items, + "Itemsets Found": f"{itemsets_found:,}", + "Time": format_time(execution_time), + "Throughput": f"{throughput:,.0f} trans/s", + "Notes": notes, + } + ) + + +def add_memory_result( + dataset_size: str, + peak_memory: float, + dataset_memory: float, + efficiency: float, + notes: str = "", +): """Add memory benchmark result.""" - memory_results.append({ - 'Dataset Size': dataset_size, - 'Dataset Memory': format_memory(dataset_memory), - 'Peak Memory': format_memory(peak_memory), - 'Memory Efficiency': f"{efficiency:.1f}x", - 'Notes': notes - }) + memory_results.append( + { + "Dataset Size": dataset_size, + "Dataset Memory": format_memory(dataset_memory), + "Peak Memory": format_memory(peak_memory), + "Memory Efficiency": f"{efficiency:.1f}x", + "Notes": notes, + } + ) + # ============================================================================ # Speed Benchmarks # ============================================================================ -@pytest.mark.parametrize("num_trans,num_items,avg_size,min_support", [ - (10_000, 50, 10, 0.05), - (25_000, 60, 12, 0.04), - (50_000, 80, 15, 0.03), - (100_000, 100, 20, 0.025), - (200_000, 120, 25, 0.02), - (350_000, 150, 30, 0.015), - (500_000, 200, 35, 0.01), -]) + +@pytest.mark.parametrize( + "num_trans,num_items,avg_size,min_support", + [ + (10_000, 50, 10, 0.05), + 
(25_000, 60, 12, 0.04), + (50_000, 80, 15, 0.03), + (100_000, 100, 20, 0.025), + (200_000, 120, 25, 0.02), + (350_000, 150, 30, 0.015), + (500_000, 200, 35, 0.01), + ], +) def test_speed_scaling(num_trans, num_items, avg_size, min_support): """Test Priors FP-Growth speed across different dataset sizes.""" print(f"\n=== Testing {num_trans:,} transactions ===") - + # Generate data print("Generating transactions...") gen_start = time.time() transactions = generate_transactions(num_trans, num_items, avg_size, seed=42) gen_time = time.time() - gen_start print(f"Generation: {format_time(gen_time)}") - + # Calculate dataset size in memory dataset_memory = transactions.nbytes / 1024 / 1024 - + # Measure memory before gc.collect() start_memory = get_memory_usage() - + # Run FP-Growth print("Running FP-Growth...") start_time = time.time() result = priors.fp_growth(transactions, min_support) execution_time = time.time() - start_time - + # Measure memory after peak_memory = get_memory_usage() memory_used = peak_memory - start_memory - + # Calculate metrics itemsets_found = count_itemsets(result) throughput = num_trans / execution_time if execution_time > 0 else 0 memory_efficiency = dataset_memory / memory_used if memory_used > 0 else 1.0 - + # Dataset size label dataset_size = f"{num_trans//1000}K × {num_items}" - + # Performance expectations max_time = min(60.0, num_trans / 5000) # Scale expectations - assert execution_time < max_time, \ - f"Dataset {dataset_size} took {execution_time:.2f}s, expected < {max_time:.2f}s" - + assert ( + execution_time < max_time + ), f"Dataset {dataset_size} took {execution_time:.2f}s, expected < {max_time:.2f}s" + assert itemsets_found >= 0, "Should find some itemsets or return 0" - + # Store results add_speed_result( - dataset_size, num_trans, num_items, - itemsets_found, execution_time, throughput, - f"Support: {min_support}, Gen: {format_time(gen_time)}" + dataset_size, + num_trans, + num_items, + itemsets_found, + execution_time, + throughput, + f"Support: {min_support}, Gen: {format_time(gen_time)}", ) - + if psutil: add_memory_result( - dataset_size, peak_memory, dataset_memory, + dataset_size, + peak_memory, + dataset_memory, memory_efficiency, - f"Used: {format_memory(memory_used)}" + f"Used: {format_memory(memory_used)}", ) - + print(f"Results: {itemsets_found:,} itemsets in {format_time(execution_time)}") print(f"Throughput: {throughput:,.0f} transactions/second") if psutil: - print(f"Memory: {format_memory(memory_used)} used, {memory_efficiency:.1f}x efficiency") + print( + f"Memory: {format_memory(memory_used)} used, {memory_efficiency:.1f}x efficiency" + ) + # ============================================================================ # Memory Benchmarks # ============================================================================ -@pytest.mark.parametrize("num_trans,num_items,density", [ - (50_000, 100, 0.15), # Sparse - (100_000, 150, 0.20), # Medium - (200_000, 200, 0.25), # Dense - (350_000, 250, 0.30), # Very Dense -]) + +@pytest.mark.parametrize( + "num_trans,num_items,density", + [ + (50_000, 100, 0.15), # Sparse + (100_000, 150, 0.20), # Medium + (200_000, 200, 0.25), # Dense + (350_000, 250, 0.30), # Very Dense + ], +) def test_memory_efficiency(num_trans, num_items, density): """Test memory efficiency across different data densities.""" - print(f"\n=== Memory Test: {num_trans:,} × {num_items} (density: {density:.0%}) ===") - + print( + f"\n=== Memory Test: {num_trans:,} × {num_items} (density: {density:.0%}) ===" + ) + # Generate data with 
specific density np.random.seed(42) transactions = np.zeros((num_trans, num_items), dtype=np.int32) - + for i in range(num_trans): - num_items_in_trans = max(1, int(num_items * density * np.random.uniform(0.5, 1.5))) + num_items_in_trans = max( + 1, int(num_items * density * np.random.uniform(0.5, 1.5)) + ) num_items_in_trans = min(num_items_in_trans, num_items) items = np.random.choice(num_items, num_items_in_trans, replace=False) transactions[i, items] = 1 - + actual_density = np.mean(transactions) dataset_memory = transactions.nbytes / 1024 / 1024 - + print(f"Actual density: {actual_density:.1%}") print(f"Dataset size: {format_memory(dataset_memory)}") - + # Test with different support levels support_levels = [0.05, 0.02, 0.01] - + for support in support_levels: print(f"\nTesting support {support}...") - + # Measure memory usage gc.collect() start_memory = get_memory_usage() - + start_time = time.time() result = priors.fp_growth(transactions, support) execution_time = time.time() - start_time - + peak_memory = get_memory_usage() memory_used = peak_memory - start_memory - + itemsets_found = count_itemsets(result) memory_efficiency = dataset_memory / memory_used if memory_used > 0 else 1.0 - - print(f" Support {support}: {itemsets_found:,} itemsets in {format_time(execution_time)}") - print(f" Memory used: {format_memory(memory_used)} (efficiency: {memory_efficiency:.1f}x)") - + + print( + f" Support {support}: {itemsets_found:,} itemsets in {format_time(execution_time)}" + ) + print( + f" Memory used: {format_memory(memory_used)} (efficiency: {memory_efficiency:.1f}x)" + ) + # Reasonable memory usage expectations expected_max_memory = dataset_memory * 5 # At most 5x the dataset size if psutil and memory_used > expected_max_memory: - print(f" Warning: High memory usage ({format_memory(memory_used)} > {format_memory(expected_max_memory)})") + print( + f" Warning: High memory usage ({format_memory(memory_used)} > {format_memory(expected_max_memory)})" + ) + # ============================================================================ # Scalability Tests # ============================================================================ + def test_transaction_scaling(): """Test how performance scales with number of transactions.""" print(f"\n=== Transaction Scaling Analysis ===") - + base_items = 100 avg_size = 20 min_support = 0.02 - + sizes = [50_000, 100_000, 200_000, 400_000] times = [] memories = [] - + for size in sizes: print(f"\nTesting {size:,} transactions...") - + transactions = generate_transactions(size, base_items, avg_size, seed=42) dataset_memory = transactions.nbytes / 1024 / 1024 - + gc.collect() start_memory = get_memory_usage() - + start_time = time.time() result = priors.fp_growth(transactions, min_support) execution_time = time.time() - start_time - + peak_memory = get_memory_usage() memory_used = peak_memory - start_memory - + itemsets_found = count_itemsets(result) - + times.append(execution_time) memories.append(memory_used) - - print(f" Results: {itemsets_found:,} itemsets in {format_time(execution_time)}") + + print( + f" Results: {itemsets_found:,} itemsets in {format_time(execution_time)}" + ) print(f" Memory: {format_memory(memory_used)} used") - + # Analyze scaling print(f"\n=== Scaling Analysis ===") for i in range(1, len(sizes)): - scale_factor = sizes[i] / sizes[i-1] - time_factor = times[i] / times[i-1] if times[i-1] > 0 else 1.0 - memory_factor = memories[i] / memories[i-1] if memories[i-1] > 0 else 1.0 - + scale_factor = sizes[i] / sizes[i - 1] + time_factor 
= times[i] / times[i - 1] if times[i - 1] > 0 else 1.0 + memory_factor = memories[i] / memories[i - 1] if memories[i - 1] > 0 else 1.0 + # Efficiency rating if time_factor <= scale_factor * 1.2: efficiency = "Excellent" @@ -273,73 +329,86 @@ def test_transaction_scaling(): efficiency = "Fair" else: efficiency = "Poor" - - print(f" {sizes[i-1]:,} → {sizes[i]:,}: " - f"{scale_factor:.1f}x size → {time_factor:.1f}x time, {memory_factor:.1f}x memory ({efficiency})") + + print( + f" {sizes[i-1]:,} → {sizes[i]:,}: " + f"{scale_factor:.1f}x size → {time_factor:.1f}x time, {memory_factor:.1f}x memory ({efficiency})" + ) + def test_item_scaling(): """Test how performance scales with number of items.""" print(f"\n=== Item Scaling Analysis ===") - + base_transactions = 100_000 avg_size = 15 min_support = 0.02 - + item_counts = [50, 100, 200, 400] - + for items in item_counts: print(f"\nTesting {items} items...") - - transactions = generate_transactions(base_transactions, items, avg_size, seed=42) - + + transactions = generate_transactions( + base_transactions, items, avg_size, seed=42 + ) + start_time = time.time() result = priors.fp_growth(transactions, min_support) execution_time = time.time() - start_time - + itemsets_found = count_itemsets(result) - - print(f" Results: {itemsets_found:,} itemsets in {format_time(execution_time)}") - + + print( + f" Results: {itemsets_found:,} itemsets in {format_time(execution_time)}" + ) + # Should handle increasing item counts reasonably - assert execution_time < 30.0, f"Item count {items} took too long: {execution_time:.2f}s" + assert ( + execution_time < 30.0 + ), f"Item count {items} took too long: {execution_time:.2f}s" + # ============================================================================ # Stress Tests # ============================================================================ + @pytest.mark.slow def test_maximum_dataset(): """Test with maximum dataset size (500K transactions).""" print(f"\n=== Maximum Dataset Test (500K transactions) ===") - + num_trans = 500_000 num_items = 200 avg_size = 25 min_support = 0.008 # Very low support for stress test - + print(f"Generating {num_trans:,} transactions with {num_items} items...") gen_start = time.time() transactions = generate_transactions(num_trans, num_items, avg_size, seed=42) gen_time = time.time() - gen_start - + dataset_memory = transactions.nbytes / 1024 / 1024 - print(f"Dataset: {format_memory(dataset_memory)}, Generation: {format_time(gen_time)}") - + print( + f"Dataset: {format_memory(dataset_memory)}, Generation: {format_time(gen_time)}" + ) + # Memory monitoring gc.collect() start_memory = get_memory_usage() - + print("Running FP-Growth...") start_time = time.time() result = priors.fp_growth(transactions, min_support) execution_time = time.time() - start_time - + peak_memory = get_memory_usage() memory_used = peak_memory - start_memory - + itemsets_found = count_itemsets(result) throughput = num_trans / execution_time if execution_time > 0 else 0 - + print(f"\n=== Results ===") print(f"Itemsets found: {itemsets_found:,}") print(f"Execution time: {format_time(execution_time)}") @@ -347,38 +416,45 @@ def test_maximum_dataset(): if psutil: print(f"Memory used: {format_memory(memory_used)}") print(f"Memory efficiency: {dataset_memory/memory_used:.1f}x") - + # Should complete in reasonable time (5 minutes max) - assert execution_time < 300.0, f"Maximum dataset took too long: {execution_time:.2f}s" + assert ( + execution_time < 300.0 + ), f"Maximum dataset took too long: 
{execution_time:.2f}s" assert itemsets_found >= 0, "Should find itemsets or return 0" + @pytest.mark.slow def test_very_low_support(): """Test with very low support thresholds.""" print(f"\n=== Very Low Support Test ===") - + transactions = generate_transactions(100_000, 100, 20, seed=42) - + support_levels = [0.001, 0.0005, 0.0001] # Very low supports - + for support in support_levels: print(f"\nTesting support {support} ({support*100:.02f}%)...") - + start_time = time.time() result = priors.fp_growth(transactions, support) execution_time = time.time() - start_time - + itemsets_found = count_itemsets(result) - + print(f" Found {itemsets_found:,} itemsets in {format_time(execution_time)}") - + # Should handle very low support without crashing - assert execution_time < 60.0, f"Support {support} took too long: {execution_time:.2f}s" + assert ( + execution_time < 60.0 + ), f"Support {support} took too long: {execution_time:.2f}s" + # ============================================================================ # Summary Output # ============================================================================ + def print_speed_summary(): """Print speed benchmark summary table.""" if not speed_results: @@ -387,20 +463,25 @@ def print_speed_summary(): print("\n" + "=" * 120) print("PRIORS FP-GROWTH SPEED BENCHMARKS") print("=" * 120) - + # Print header - print(f"{'Dataset Size':<15} {'Transactions':<15} {'Items':<6} {'Itemsets Found':<15} " - f"{'Time':<10} {'Throughput':<15} {'Notes':<30}") + print( + f"{'Dataset Size':<15} {'Transactions':<15} {'Items':<6} {'Itemsets Found':<15} " + f"{'Time':<10} {'Throughput':<15} {'Notes':<30}" + ) print("-" * 120) - + # Print results for result in speed_results: - print(f"{result['Dataset Size']:<15} {result['Transactions']:<15} " - f"{result['Items']:<6} {result['Itemsets Found']:<15} " - f"{result['Time']:<10} {result['Throughput']:<15} {result['Notes']:<30}") - + print( + f"{result['Dataset Size']:<15} {result['Transactions']:<15} " + f"{result['Items']:<6} {result['Itemsets Found']:<15} " + f"{result['Time']:<10} {result['Throughput']:<15} {result['Notes']:<30}" + ) + print("=" * 120) + def print_memory_summary(): """Print memory benchmark summary table.""" if not memory_results: @@ -409,36 +490,43 @@ def print_memory_summary(): print("\n" + "=" * 100) print("PRIORS FP-GROWTH MEMORY EFFICIENCY") print("=" * 100) - + # Print header - print(f"{'Dataset Size':<15} {'Dataset Memory':<15} {'Peak Memory':<15} " - f"{'Efficiency':<12} {'Notes':<30}") + print( + f"{'Dataset Size':<15} {'Dataset Memory':<15} {'Peak Memory':<15} " + f"{'Efficiency':<12} {'Notes':<30}" + ) print("-" * 100) - + # Print results for result in memory_results: - print(f"{result['Dataset Size']:<15} {result['Dataset Memory']:<15} " - f"{result['Peak Memory']:<15} {result['Memory Efficiency']:<12} " - f"{result['Notes']:<30}") - + print( + f"{result['Dataset Size']:<15} {result['Dataset Memory']:<15} " + f"{result['Peak Memory']:<15} {result['Memory Efficiency']:<12} " + f"{result['Notes']:<30}" + ) + print("=" * 100) + @pytest.fixture(scope="session", autouse=True) def print_summary_on_exit(request): """Print summary tables after all tests complete.""" + def finalize(): print_speed_summary() print_memory_summary() print("\n" + "=" * 120) print("BENCHMARK COMPLETE") print("=" * 120 + "\n") - + request.addfinalizer(finalize) + # ============================================================================ # Run as standalone script # 
============================================================================ if __name__ == "__main__": # Run all benchmarks - pytest.main([__file__, "-v", "-s", "--tb=short"]) \ No newline at end of file + pytest.main([__file__, "-v", "-s", "--tb=short"]) diff --git a/priors/tests/test_comparison.py b/priors/tests/test_comparison.py index 0c6cd35..7039de4 100644 --- a/priors/tests/test_comparison.py +++ b/priors/tests/test_comparison.py @@ -3,18 +3,15 @@ Tests correctness and edge cases. """ +from typing import Dict, List, Optional, Set, Tuple + import numpy as np import pandas as pd import pytest -import priors -from typing import Dict, List, Tuple, Optional, Set - # Import shared utilities -from conftest import ( - count_itemsets, - generate_transactions, -) +from conftest import count_itemsets, generate_transactions +import priors # ============================================================================ # Correctness Tests @@ -22,36 +19,46 @@ # Test correctness by comparing with established libraries. + def test_fpgrowth_vs_mlxtend_basic(): """Compare priors FP-Growth with mlxtend on basic dataset.""" import pandas.testing as tm - mlxtend = pytest.importorskip("mlxtend") - from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth + mlxtend = pytest.importorskip("mlxtend") # Import the conversion utility import sys from pathlib import Path + + from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth + sys.path.insert(0, str(Path(__file__).parent.parent)) from utils import fp_growth_to_dataframe # Create simple test data - transactions = np.array([ - [1, 1, 0, 1, 0], - [1, 0, 1, 1, 0], - [0, 1, 1, 1, 0], - [1, 1, 1, 0, 0], - [1, 1, 0, 1, 0], - ], dtype=np.int32) + transactions = np.array( + [ + [1, 1, 0, 1, 0], + [1, 0, 1, 1, 0], + [0, 1, 1, 1, 0], + [1, 1, 1, 0, 0], + [1, 1, 0, 1, 0], + ], + dtype=np.int32, + ) min_support = 0.4 # Run priors - now returns (itemsets_list, supports_list) tuple itemsets_list, supports_list = priors.fp_growth(transactions, min_support) - priors_result = fp_growth_to_dataframe(itemsets_list, supports_list, len(transactions)) + priors_result = fp_growth_to_dataframe( + itemsets_list, supports_list, len(transactions) + ) # Run mlxtend - df = pd.DataFrame(transactions.astype(bool), - columns=[f'item_{i}' for i in range(transactions.shape[1])]) + df = pd.DataFrame( + transactions.astype(bool), + columns=[f"item_{i}" for i in range(transactions.shape[1])], + ) mlxtend_result = mlxtend_fpgrowth(df, min_support=min_support, use_colnames=False) # Debug output @@ -63,17 +70,24 @@ def test_fpgrowth_vs_mlxtend_basic(): # Compare DataFrames priors_count = len(priors_result) mlxtend_count = len(mlxtend_result) - assert priors_count == mlxtend_count, \ - f"Itemset count mismatch: priors={priors_count}, mlxtend={mlxtend_count}" + assert ( + priors_count == mlxtend_count + ), f"Itemset count mismatch: priors={priors_count}, mlxtend={mlxtend_count}" # Compare itemsets and supports (order-independent) - priors_set = set((frozenset(row['itemsets']), row['support']) - for _, row in priors_result.iterrows()) - mlxtend_set = set((frozenset(row['itemsets']), row['support']) - for _, row in mlxtend_result.iterrows()) + priors_set = set( + (frozenset(row["itemsets"]), row["support"]) + for _, row in priors_result.iterrows() + ) + mlxtend_set = set( + (frozenset(row["itemsets"]), row["support"]) + for _, row in mlxtend_result.iterrows() + ) + + assert ( + priors_set == mlxtend_set + ), f"Itemsets mismatch:\nPriors only: {priors_set - mlxtend_set}\nMlxtend 
only: {mlxtend_set - priors_set}" - assert priors_set == mlxtend_set, \ - f"Itemsets mismatch:\nPriors only: {priors_set - mlxtend_set}\nMlxtend only: {mlxtend_set - priors_set}" def test_fpgrowth_vs_efficient_apriori_basic(): """Compare priors FP-Growth with efficient_apriori.""" @@ -85,6 +99,7 @@ def test_fpgrowth_vs_efficient_apriori_basic(): # Import the conversion utility import sys from pathlib import Path + sys.path.insert(0, str(Path(__file__).parent.parent)) from utils import fp_growth_to_dataframe @@ -95,11 +110,15 @@ def test_fpgrowth_vs_efficient_apriori_basic(): # Run priors itemsets_list, supports_list = priors.fp_growth(transactions, min_support) - priors_result = fp_growth_to_dataframe(itemsets_list, supports_list, num_transactions) + priors_result = fp_growth_to_dataframe( + itemsets_list, supports_list, num_transactions + ) # Run efficient_apriori transactions_list = [tuple(np.where(row)[0]) for row in transactions] - ea_itemsets, ea_rules = efficient_apriori.apriori(transactions_list, min_support=min_support) + ea_itemsets, ea_rules = efficient_apriori.apriori( + transactions_list, min_support=min_support + ) # Convert efficient_apriori results to dictionary: {itemset: support} # ea_itemsets format: {1: {(item,): count}, 2: {(item1, item2): count}, ...} @@ -111,8 +130,10 @@ def test_fpgrowth_vs_efficient_apriori_basic(): ea_dict[frozenset(itemset)] = support # Convert priors results to dictionary: {itemset: support} - priors_dict = {frozenset(row['itemsets']): row['support'] - for _, row in priors_result.iterrows()} + priors_dict = { + frozenset(row["itemsets"]): row["support"] + for _, row in priors_result.iterrows() + } if ea_itemsets: for size in sorted(ea_itemsets.keys()): @@ -123,25 +144,32 @@ def test_fpgrowth_vs_efficient_apriori_basic(): # Compare counts priors_count = len(priors_result) - ea_count = sum(len(itemsets) for itemsets in ea_itemsets.values()) if ea_itemsets else 0 - assert priors_count == ea_count, \ - f"Itemset count mismatch: priors={priors_count}, efficient_apriori={ea_count}" + ea_count = ( + sum(len(itemsets) for itemsets in ea_itemsets.values()) if ea_itemsets else 0 + ) + assert ( + priors_count == ea_count + ), f"Itemset count mismatch: priors={priors_count}, efficient_apriori={ea_count}" # Compare dictionaries: itemsets and their support values - assert priors_dict == ea_dict, \ - f"Itemsets mismatch:\nPriors only: {set(priors_dict.keys()) - set(ea_dict.keys())}\n" \ - f"Efficient_apriori only: {set(ea_dict.keys()) - set(priors_dict.keys())}\n" \ + assert priors_dict == ea_dict, ( + f"Itemsets mismatch:\nPriors only: {set(priors_dict.keys()) - set(ea_dict.keys())}\n" + f"Efficient_apriori only: {set(ea_dict.keys()) - set(priors_dict.keys())}\n" f"Different supports: {[(k, priors_dict[k], ea_dict[k]) for k in priors_dict.keys() & ea_dict.keys() if priors_dict[k] != ea_dict[k]]}" + ) + def test_fpgrowth_vs_mlxtend_medium(): """Compare with mlxtend on medium-sized dataset.""" import pandas.testing as tm - mlxtend = pytest.importorskip("mlxtend") - from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth + mlxtend = pytest.importorskip("mlxtend") # Import the conversion utility import sys from pathlib import Path + + from mlxtend.frequent_patterns import fpgrowth as mlxtend_fpgrowth + sys.path.insert(0, str(Path(__file__).parent.parent)) from utils import fp_growth_to_dataframe @@ -151,27 +179,37 @@ def test_fpgrowth_vs_mlxtend_medium(): # Run priors itemsets_list, supports_list = priors.fp_growth(transactions, min_support) - 
priors_result = fp_growth_to_dataframe(itemsets_list, supports_list, len(transactions)) + priors_result = fp_growth_to_dataframe( + itemsets_list, supports_list, len(transactions) + ) # Run mlxtend - df = pd.DataFrame(transactions.astype(bool), - columns=[f'item_{i}' for i in range(transactions.shape[1])]) + df = pd.DataFrame( + transactions.astype(bool), + columns=[f"item_{i}" for i in range(transactions.shape[1])], + ) mlxtend_result = mlxtend_fpgrowth(df, min_support=min_support, use_colnames=False) # Compare results priors_count = len(priors_result) mlxtend_count = len(mlxtend_result) - assert priors_count == mlxtend_count, \ - f"Itemset count mismatch: priors={priors_count}, mlxtend={mlxtend_count}" + assert ( + priors_count == mlxtend_count + ), f"Itemset count mismatch: priors={priors_count}, mlxtend={mlxtend_count}" # Compare itemsets and supports (order-independent) - priors_set = set((frozenset(row['itemsets']), row['support']) - for _, row in priors_result.iterrows()) - mlxtend_set = set((frozenset(row['itemsets']), row['support']) - for _, row in mlxtend_result.iterrows()) + priors_set = set( + (frozenset(row["itemsets"]), row["support"]) + for _, row in priors_result.iterrows() + ) + mlxtend_set = set( + (frozenset(row["itemsets"]), row["support"]) + for _, row in mlxtend_result.iterrows() + ) - assert priors_set == mlxtend_set, \ - f"Itemsets mismatch:\nPriors only: {priors_set - mlxtend_set}\nMlxtend only: {mlxtend_set - priors_set}" + assert ( + priors_set == mlxtend_set + ), f"Itemsets mismatch:\nPriors only: {priors_set - mlxtend_set}\nMlxtend only: {mlxtend_set - priors_set}" # ============================================================================ @@ -180,15 +218,19 @@ def test_fpgrowth_vs_mlxtend_medium(): # Test correctness at different scales using same pattern. 
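The mlxtend and efficient_apriori comparisons above all reduce to the same pattern: flatten each library's output into (frozenset(itemset), support) pairs and compare them order-independently. A minimal standalone sketch of that pattern, assuming fp_growth_to_dataframe is importable the same way the tests import it (a sys.path insert pointing at priors/); the 3×3 input matrix and 0.5 threshold are illustrative only, not values used in the suite.

import sys
from pathlib import Path

import numpy as np

import priors

sys.path.insert(0, str(Path(__file__).parent.parent))  # same trick the tests use
from utils import fp_growth_to_dataframe

transactions = np.array([[1, 1, 0], [1, 1, 0], [0, 1, 1]], dtype=np.int32)
itemsets_list, supports_list = priors.fp_growth(transactions, 0.5)
df = fp_growth_to_dataframe(itemsets_list, supports_list, len(transactions))

# One (frozenset, support) pair per row; build the same set from the other
# library's DataFrame and assert equality for an order-independent check.
priors_pairs = {(frozenset(r["itemsets"]), r["support"]) for _, r in df.iterrows()}
print(priors_pairs)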
+ def test_fpgrowth_consistent_across_scales(): """Test that same pattern gives same itemsets regardless of scale.""" # Use a simple, deterministic pattern - base_pattern = np.array([ - [1, 1, 0, 0], # Items 0,1 - [1, 1, 0, 0], # Items 0,1 - [0, 0, 1, 1], # Items 2,3 - [0, 0, 1, 1], # Items 2,3 - ], dtype=np.int32) + base_pattern = np.array( + [ + [1, 1, 0, 0], # Items 0,1 + [1, 1, 0, 0], # Items 0,1 + [0, 0, 1, 1], # Items 2,3 + [0, 0, 1, 1], # Items 2,3 + ], + dtype=np.int32, + ) min_support = 0.5 # 50% - items 0,1 together (50%) and 2,3 together (50%) @@ -205,8 +247,10 @@ def test_fpgrowth_consistent_across_scales(): baseline_count = itemset_count assert itemset_count > 0, f"Should find itemsets with this pattern" else: - assert itemset_count == baseline_count, \ - f"Scale {scale}: found {itemset_count} itemsets, expected {baseline_count}" + assert ( + itemset_count == baseline_count + ), f"Scale {scale}: found {itemset_count} itemsets, expected {baseline_count}" + def test_fpgrowth_different_supports(): """Test that lower support finds monotonically more itemsets.""" @@ -220,11 +264,13 @@ def test_fpgrowth_different_supports(): itemset_count = count_itemsets((itemsets_list, supports_list)) # Lower support should find more or equal itemsets (monotonic property) - assert itemset_count >= prev_count, \ - f"Support {support}: {itemset_count} < {prev_count} from higher support" + assert ( + itemset_count >= prev_count + ), f"Support {support}: {itemset_count} < {prev_count} from higher support" prev_count = itemset_count + def test_fpgrowth_empty_transactions(): """Test edge case with empty transaction list.""" transactions = np.array([], dtype=np.int32).reshape(0, 10) @@ -233,7 +279,9 @@ def test_fpgrowth_empty_transactions(): itemsets_list, supports_list = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets((itemsets_list, supports_list)) - assert itemset_count == 0, f"Empty transactions should return 0 itemsets, got {itemset_count}" + assert ( + itemset_count == 0 + ), f"Empty transactions should return 0 itemsets, got {itemset_count}" # ============================================================================ @@ -242,6 +290,7 @@ def test_fpgrowth_empty_transactions(): # Test with known datasets and expected results. 
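The expected counts in the known-result tests below follow from a simple identity: if k items co-occur in every transaction that meets the threshold, every non-empty subset of those k items is frequent, giving 2**k - 1 itemsets. A quick check of that arithmetic in plain Python (no assumptions about priors itself):

from itertools import combinations

for k in (2, 3):  # k=2 -> the 3-itemset expectation below, k=3 -> the 7-itemset cases
    items = list(range(k))
    subsets = [c for r in range(1, k + 1) for c in combinations(items, r)]
    assert len(subsets) == 2 ** k - 1
    print(f"{k} co-occurring items -> {len(subsets)} frequent itemsets: {subsets}")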
+ def test_simple_known_result(): """Test with a simple dataset where we know exact expected itemsets.""" # 3 identical transactions, each containing items 0 and 1 @@ -249,11 +298,14 @@ def test_simple_known_result(): # - 1-itemsets: {0}, {1} # - 2-itemsets: {0,1} # Total: 3 frequent itemsets - transactions = np.array([ - [1, 1, 0, 0], - [1, 1, 0, 0], - [1, 1, 0, 0], - ], dtype=np.int32) + transactions = np.array( + [ + [1, 1, 0, 0], + [1, 1, 0, 0], + [1, 1, 0, 0], + ], + dtype=np.int32, + ) min_support = 1.0 # 100% itemsets_list, supports_list = priors.fp_growth(transactions, min_support) @@ -262,13 +314,17 @@ def test_simple_known_result(): # Expected: {0}, {1}, {0,1} = 3 itemsets total assert itemset_count == 3, f"Expected 3 itemsets, got {itemset_count}" + def test_known_result_with_scaling(): """Test that scaling identical transactions produces consistent results.""" # Pattern where all items have same frequency - base = np.array([ - [1, 1, 1], - [1, 1, 1], - ], dtype=np.int32) + base = np.array( + [ + [1, 1, 1], + [1, 1, 1], + ], + dtype=np.int32, + ) min_support = 1.0 # 100% @@ -282,24 +338,29 @@ def test_known_result_with_scaling(): count_scaled = count_itemsets((itemsets_list_scaled, supports_list_scaled)) # Should find same patterns - all items still at 100% - assert count_base == count_scaled, \ - f"Scaling changed result: base={count_base}, scaled={count_scaled}" + assert ( + count_base == count_scaled + ), f"Scaling changed result: base={count_base}, scaled={count_scaled}" + def test_support_threshold_filtering(): """Test that support threshold correctly filters itemsets.""" # 10 transactions where item 0 appears 5 times (50%) - transactions = np.array([ - [1, 0, 0], # 0 - [1, 0, 0], # 1 - [1, 0, 0], # 2 - [1, 0, 0], # 3 - [1, 0, 0], # 4 - [0, 1, 1], # 5 - [0, 1, 1], # 6 - [0, 1, 1], # 7 - [0, 1, 1], # 8 - [0, 1, 1], # 9 - ], dtype=np.int32) + transactions = np.array( + [ + [1, 0, 0], # 0 + [1, 0, 0], # 1 + [1, 0, 0], # 2 + [1, 0, 0], # 3 + [1, 0, 0], # 4 + [0, 1, 1], # 5 + [0, 1, 1], # 6 + [0, 1, 1], # 7 + [0, 1, 1], # 8 + [0, 1, 1], # 9 + ], + dtype=np.int32, + ) # At 60% support, only items 1,2 should be frequent (50% for item 0) itemsets_list_60, supports_list_60 = priors.fp_growth(transactions, 0.6) @@ -310,8 +371,10 @@ def test_support_threshold_filtering(): count_50 = count_itemsets((itemsets_list_50, supports_list_50)) # Lower support should find more itemsets - assert count_50 > count_60, \ - f"50% support should find more itemsets than 60%: {count_50} vs {count_60}" + assert ( + count_50 > count_60 + ), f"50% support should find more itemsets than 60%: {count_50} vs {count_60}" + def test_scalable_known_results(): """Test with scalable pattern where we know exact results for any size. @@ -324,20 +387,24 @@ def test_scalable_known_results(): This pattern is scale-invariant: 10, 100, 1000, 10000 transactions all have the same frequency distribution. 
""" + def create_pattern(num_blocks): """Create num_blocks * 10 transactions with known pattern.""" - base_block = np.array([ - [1, 1, 1], # 0: A, B, C - [1, 1, 1], # 1: A, B, C - [1, 1, 1], # 2: A, B, C - [1, 1, 1], # 3: A, B, C - [1, 1, 1], # 4: A, B, C - [0, 1, 1], # 5: B, C (no A) - [0, 1, 1], # 6: B, C (no A) - [0, 0, 1], # 7: C (no A, B) - [0, 0, 1], # 8: C (no A, B) - [0, 0, 1], # 9: C (no A, B) - ], dtype=np.int32) + base_block = np.array( + [ + [1, 1, 1], # 0: A, B, C + [1, 1, 1], # 1: A, B, C + [1, 1, 1], # 2: A, B, C + [1, 1, 1], # 3: A, B, C + [1, 1, 1], # 4: A, B, C + [0, 1, 1], # 5: B, C (no A) + [0, 1, 1], # 6: B, C (no A) + [0, 0, 1], # 7: C (no A, B) + [0, 0, 1], # 8: C (no A, B) + [0, 0, 1], # 9: C (no A, B) + ], + dtype=np.int32, + ) return np.tile(base_block, (num_blocks, 1)) # Test at different scales @@ -349,19 +416,22 @@ def create_pattern(num_blocks): # Expected: {C} = 1 itemset itemsets_list_100, supports_list_100 = priors.fp_growth(transactions, 1.0) count_100 = count_itemsets((itemsets_list_100, supports_list_100)) - assert count_100 == 1, \ - f"Scale {num_trans}: 100% support should find exactly 1 itemset, got {count_100}" + assert ( + count_100 == 1 + ), f"Scale {num_trans}: 100% support should find exactly 1 itemset, got {count_100}" # At 70% support: items B (70%) and C (100%) # Expected: {B}, {C}, {B,C} = 3 itemsets itemsets_list_70, supports_list_70 = priors.fp_growth(transactions, 0.7) count_70 = count_itemsets((itemsets_list_70, supports_list_70)) - assert count_70 == 3, \ - f"Scale {num_trans}: 70% support should find exactly 3 itemsets, got {count_70}" + assert ( + count_70 == 3 + ), f"Scale {num_trans}: 70% support should find exactly 3 itemsets, got {count_70}" # At 50% support: items A (50%), B (70%), C (100%) # Expected: {A}, {B}, {C}, {A,B}, {A,C}, {B,C}, {A,B,C} = 7 itemsets (2^3-1) itemsets_list_50, supports_list_50 = priors.fp_growth(transactions, 0.5) count_50 = count_itemsets((itemsets_list_50, supports_list_50)) - assert count_50 == 7, \ - f"Scale {num_trans}: 50% support should find exactly 7 itemsets, got {count_50}" + assert ( + count_50 == 7 + ), f"Scale {num_trans}: 50% support should find exactly 7 itemsets, got {count_50}" diff --git a/priors/tests/test_fp_growth.py b/priors/tests/test_fp_growth.py index 59fe9b3..52f0300 100644 --- a/priors/tests/test_fp_growth.py +++ b/priors/tests/test_fp_growth.py @@ -3,15 +3,16 @@ Tests core functionality and correctness. """ +from typing import List, Optional, Tuple + import numpy as np import pandas as pd import pytest -import priors -from typing import List, Tuple, Optional - # Import shared utilities -from conftest import count_itemsets, generate_transactions, extract_itemsets_from_result +from conftest import (count_itemsets, extract_itemsets_from_result, + generate_transactions) +import priors # ============================================================================ # Basic FP-Growth Tests @@ -19,75 +20,97 @@ # Basic functionality tests for FP-Growth. 
+ def test_empty_transactions(): """Test with empty transaction matrix.""" transactions = np.array([], dtype=np.int32).reshape(0, 5) min_support = 0.1 - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - - assert itemset_count == 0, f"Empty transactions should return 0 itemsets, got {itemset_count}" + + assert ( + itemset_count == 0 + ), f"Empty transactions should return 0 itemsets, got {itemset_count}" + def test_single_transaction(): """Test with single transaction.""" transactions = np.array([[1, 0, 1, 0, 1]], dtype=np.int32) min_support = 1.0 # 100% support - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # With single transaction at 100% support, we should get all subsets of [0,2,4] # That's 2^3 - 1 = 7 itemsets (excluding empty set) expected = 7 - assert itemset_count == expected, f"Single transaction should return {expected} itemsets, got {itemset_count}" + assert ( + itemset_count == expected + ), f"Single transaction should return {expected} itemsets, got {itemset_count}" + def test_no_frequent_items(): """Test with support threshold too high.""" - transactions = np.array([ - [1, 0, 0, 0, 0], - [0, 1, 0, 0, 0], - [0, 0, 1, 0, 0], - [0, 0, 0, 1, 0], - [0, 0, 0, 0, 1], - ], dtype=np.int32) + transactions = np.array( + [ + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + ], + dtype=np.int32, + ) min_support = 0.5 # 50% support, but each item appears only 20% - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - - assert itemset_count == 0, f"High support threshold should return 0 itemsets, got {itemset_count}" + + assert ( + itemset_count == 0 + ), f"High support threshold should return 0 itemsets, got {itemset_count}" + def test_all_items_frequent(): """Test where all combinations should be frequent.""" - transactions = np.array([ - [1, 1, 1], - [1, 1, 1], - [1, 1, 1], - ], dtype=np.int32) + transactions = np.array( + [ + [1, 1, 1], + [1, 1, 1], + [1, 1, 1], + ], + dtype=np.int32, + ) min_support = 1.0 # 100% support - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # All subsets of {0,1,2}: 2^3 - 1 = 7 expected = 7 - assert itemset_count == expected, f"All frequent items should return {expected} itemsets, got {itemset_count}" + assert ( + itemset_count == expected + ), f"All frequent items should return {expected} itemsets, got {itemset_count}" + def test_basic_example(): """Test with well-known basic example.""" - transactions = np.array([ - [1, 1, 0, 1, 0], # Items 0,1,3 - [1, 0, 1, 1, 0], # Items 0,2,3 - [0, 1, 1, 1, 0], # Items 1,2,3 - [1, 1, 1, 0, 0], # Items 0,1,2 - [1, 1, 0, 1, 0], # Items 0,1,3 - ], dtype=np.int32) + transactions = np.array( + [ + [1, 1, 0, 1, 0], # Items 0,1,3 + [1, 0, 1, 1, 0], # Items 0,2,3 + [0, 1, 1, 1, 0], # Items 1,2,3 + [1, 1, 1, 0, 0], # Items 0,1,2 + [1, 1, 0, 1, 0], # Items 0,1,3 + ], + dtype=np.int32, + ) min_support = 0.4 # 40% support = 2 transactions - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # Manual verification: # Item frequencies: 0:4, 1:4, 2:3, 3:4, 4:0 # Frequent items (≥2): 0,1,2,3 @@ -95,22 +118,26 @@ def test_basic_example(): assert itemset_count > 0, "Should find frequent itemsets" assert itemset_count <= 15, f"Too many itemsets found: {itemset_count}" # 2^4-1 max + def test_different_support_levels(): """Test with different support levels.""" 
transactions = generate_transactions(100, 10, 5, seed=42) - + # Test with decreasing support levels support_levels = [0.5, 0.3, 0.1, 0.05] prev_count = 0 - + for support in support_levels: result = priors.fp_growth(transactions, support) count = count_itemsets(result) - + # Lower support should find more or equal itemsets - assert count >= prev_count, f"Support {support} found {count} itemsets, less than {prev_count} at higher support" + assert ( + count >= prev_count + ), f"Support {support} found {count} itemsets, less than {prev_count} at higher support" prev_count = count + def test_large_transactions(): """Test with larger transaction set.""" transactions = generate_transactions(1000, 50, 10, seed=123) @@ -121,40 +148,42 @@ def test_large_transactions(): assert itemset_count > 0, "Should find itemsets in large dataset" + def test_very_sparse_data(): """Test with very sparse transaction data.""" num_trans, num_items = 100, 100 transactions = np.zeros((num_trans, num_items), dtype=np.int32) - + # Only set a few items in a few transactions for i in range(0, num_trans, 10): transactions[i, i % num_items] = 1 if i + 1 < num_items: transactions[i, (i + 1) % num_items] = 1 - + min_support = 0.05 # 5% - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # Sparse data should still work assert itemset_count >= 0, "Sparse data should not crash" + def test_dense_data(): """Test with very dense transaction data.""" num_trans, num_items = 50, 20 transactions = np.ones((num_trans, num_items), dtype=np.int32) - + # Remove some items randomly to make it interesting np.random.seed(42) mask = np.random.random((num_trans, num_items)) > 0.2 # 80% density transactions = transactions * mask.astype(np.int32) - + min_support = 0.6 # 60% - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + assert itemset_count > 0, "Dense data should find frequent itemsets" @@ -164,84 +193,105 @@ def test_dense_data(): # Test edge cases and boundary conditions. 
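A note on how the fractional threshold becomes an absolute count: the streaming state in state.rs computes min_count as (min_support * num_transactions) cast to usize and keeps items with count >= min_count, which is why min_support=0.0 admits every observed item and min_support=1.0 keeps only items present in every transaction. The same arithmetic in Python for the 3-transaction matrices used below; whether the non-streaming path rounds identically is an assumption here, not something the diff states.

num_transactions = 3
for min_support in (0.0, 0.5, 1.0):
    min_count = int(min_support * num_transactions)  # truncating cast, mirroring state.rs
    print(f"min_support={min_support:.1f} -> keep items appearing in >= {min_count} transaction(s)")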
+ def test_min_support_zero(): """Test with minimum support of 0.""" - transactions = np.array([ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1], - ], dtype=np.int32) + transactions = np.array( + [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + dtype=np.int32, + ) min_support = 0.0 - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # With 0 support, all possible itemsets should be found assert itemset_count > 0, "Zero support should find itemsets" + def test_min_support_one(): """Test with minimum support of 1.0 (100%).""" - transactions = np.array([ - [1, 1, 0], - [1, 0, 1], - [0, 1, 1], - ], dtype=np.int32) + transactions = np.array( + [ + [1, 1, 0], + [1, 0, 1], + [0, 1, 1], + ], + dtype=np.int32, + ) min_support = 1.0 - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # Only items that appear in ALL transactions should be found # In this case, no item appears in all 3 transactions - assert itemset_count == 0, f"100% support should find 0 itemsets, got {itemset_count}" + assert ( + itemset_count == 0 + ), f"100% support should find 0 itemsets, got {itemset_count}" + def test_single_item_transactions(): """Test with transactions containing only single items.""" - transactions = np.array([ - [1, 0, 0, 0], - [0, 1, 0, 0], - [0, 0, 1, 0], - [0, 0, 0, 1], - [1, 0, 0, 0], # Repeat item 0 - ], dtype=np.int32) + transactions = np.array( + [ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + [1, 0, 0, 0], # Repeat item 0 + ], + dtype=np.int32, + ) min_support = 0.2 # 20% - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # Only single items should be frequent (no combinations) # Item 0 appears twice (40%), others once (20%) # So items 0,1,2,3 should all be frequent - assert itemset_count >= 4, f"Should find at least 4 single items, got {itemset_count}" + assert ( + itemset_count >= 4 + ), f"Should find at least 4 single items, got {itemset_count}" + def test_duplicate_transactions(): """Test with duplicate transactions.""" base_transaction = [1, 1, 0, 1, 0] transactions = np.array([base_transaction] * 5, dtype=np.int32) min_support = 0.8 # 80% - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # All items in the transaction should be frequent # Items 0,1,3 appear in all 5 transactions (100%) expected_combinations = 2**3 - 1 # All subsets of {0,1,3} - assert itemset_count == expected_combinations, f"Expected {expected_combinations} itemsets, got {itemset_count}" + assert ( + itemset_count == expected_combinations + ), f"Expected {expected_combinations} itemsets, got {itemset_count}" + def test_binary_validation(): """Test that input is properly handled as binary.""" # Test with values > 1 (should be treated as 1) - transactions = np.array([ - [2, 3, 0, 5], - [1, 0, 4, 2], - [0, 7, 1, 0], - ], dtype=np.int32) + transactions = np.array( + [ + [2, 3, 0, 5], + [1, 0, 4, 2], + [0, 7, 1, 0], + ], + dtype=np.int32, + ) min_support = 0.5 - + result = priors.fp_growth(transactions, min_support) itemset_count = count_itemsets(result) - + # Should work without crashing assert itemset_count >= 0, "Non-binary input should be handled gracefully" - - diff --git a/priors/tests/test_output_comparison.py b/priors/tests/test_output_comparison.py index ef6abc5..f7f835b 100644 --- a/priors/tests/test_output_comparison.py +++ b/priors/tests/test_output_comparison.py @@ -3,13 +3,15 @@ Prints all results to console for 
manual verification. """ -import numpy as np -import pandas as pd import time -import priors from typing import List + +import numpy as np +import pandas as pd from numpy.typing import NDArray +import priors + def print_separator(title: str = ""): """Print a visual separator.""" @@ -44,18 +46,21 @@ def generate_test_data(size: str = "small"): """Generate test datasets.""" if size == "small": # Small dataset: 10 transactions, 5 items - return np.array([ - [1, 1, 0, 1, 0], - [1, 0, 1, 1, 0], - [0, 1, 1, 1, 0], - [1, 1, 1, 0, 0], - [1, 1, 0, 1, 0], - [1, 0, 1, 1, 1], - [1, 1, 1, 0, 0], - [1, 1, 0, 1, 0], - [0, 1, 1, 1, 0], - [1, 1, 1, 1, 0], - ], dtype=np.int32) + return np.array( + [ + [1, 1, 0, 1, 0], + [1, 0, 1, 1, 0], + [0, 1, 1, 1, 0], + [1, 1, 1, 0, 0], + [1, 1, 0, 1, 0], + [1, 0, 1, 1, 1], + [1, 1, 1, 0, 0], + [1, 1, 0, 1, 0], + [0, 1, 1, 1, 0], + [1, 1, 1, 1, 0], + ], + dtype=np.int32, + ) elif size == "medium": # Medium dataset: random but reproducible np.random.seed(42) @@ -153,9 +158,11 @@ def test_all_functions_with_output(): start = time.time() df = pd.DataFrame( transactions.astype(bool), - columns=[f'item_{i}' for i in range(transactions.shape[1])] + columns=[f"item_{i}" for i in range(transactions.shape[1])], + ) + mlxtend_result = mlxtend_fpgrowth( + df, min_support=min_support, use_colnames=False ) - mlxtend_result = mlxtend_fpgrowth(df, min_support=min_support, use_colnames=False) time_mlxtend = time.time() - start print(f"Execution time: {time_mlxtend:.4f}s") @@ -165,13 +172,15 @@ def test_all_functions_with_output(): # Group by itemset size if len(mlxtend_result) > 0: - mlxtend_result['size'] = mlxtend_result['itemsets'].apply(lambda x: len(x)) - for size in sorted(mlxtend_result['size'].unique()): - size_df = mlxtend_result[mlxtend_result['size'] == size] + mlxtend_result["size"] = mlxtend_result["itemsets"].apply( + lambda x: len(x) + ) + for size in sorted(mlxtend_result["size"].unique()): + size_df = mlxtend_result[mlxtend_result["size"] == size] print(f" Level {size} ({size}-itemsets): {len(size_df)} itemsets") for idx, row in size_df.iterrows(): - itemset = sorted(list(row['itemsets'])) - support = row['support'] + itemset = sorted(list(row["itemsets"])) + support = row["support"] print(f" {set(itemset)} (support: {support:.4f})") except ImportError: @@ -205,7 +214,7 @@ def count_total_itemsets(result): print(f" mlxtend: {time_mlxtend:.4f}s") print("\nConsistency Check:") - all_match = (count_normal == count_streaming == count_lazy) + all_match = count_normal == count_streaming == count_lazy if time_mlxtend is not None: all_match = all_match and (count_normal == len(mlxtend_result)) @@ -247,7 +256,7 @@ def extract_itemsets(result): if time_mlxtend is not None: itemsets_mlxtend = set() for _, row in mlxtend_result.iterrows(): - itemsets_mlxtend.add(frozenset(row['itemsets'])) + itemsets_mlxtend.add(frozenset(row["itemsets"])) if itemsets_normal == itemsets_mlxtend: print(" ✓ fp_growth == mlxtend") diff --git a/priors/tests/test_streaming.py b/priors/tests/test_streaming.py index c97d411..941c24e 100644 --- a/priors/tests/test_streaming.py +++ b/priors/tests/test_streaming.py @@ -6,14 +6,16 @@ - mlxtend FP-Growth """ +from typing import Dict, List, Optional, Tuple + import numpy as np import pandas as pd import pytest -import priors -from typing import Dict, List, Tuple, Optional - # Import shared utilities -from conftest import count_itemsets, generate_transactions, generate_all_ones_transactions +from conftest import (count_itemsets, generate_all_ones_transactions, 
+    generate_transactions)
+
+import priors
 
 
 def run_streaming_fp_growth(transactions, min_support, chunk_size=None):
@@ -40,15 +42,19 @@
 # Test that streaming FP-Growth produces correct results.
 
+
 def test_streaming_vs_regular_basic():
     """Verify streaming matches regular FP-Growth on basic dataset."""
-    transactions = np.array([
-        [1, 1, 0, 1, 0],
-        [1, 0, 1, 1, 0],
-        [0, 1, 1, 1, 0],
-        [1, 1, 1, 0, 0],
-        [1, 1, 0, 1, 0],
-    ], dtype=np.int32)
+    transactions = np.array(
+        [
+            [1, 1, 0, 1, 0],
+            [1, 0, 1, 1, 0],
+            [0, 1, 1, 1, 0],
+            [1, 1, 1, 0, 0],
+            [1, 1, 0, 1, 0],
+        ],
+        dtype=np.int32,
+    )
 
     min_support = 0.4
@@ -60,8 +66,10 @@ def test_streaming_vs_regular_basic():
     streaming_result = run_streaming_fp_growth(transactions, min_support)
     streaming_count = count_itemsets(streaming_result)
 
-    assert streaming_count == regular_count, \
-        f"Count mismatch: streaming={streaming_count}, regular={regular_count}"
+    assert (
+        streaming_count == regular_count
+    ), f"Count mismatch: streaming={streaming_count}, regular={regular_count}"
+
 
 def test_streaming_vs_mlxtend():
     """Verify streaming matches mlxtend FP-Growth."""
@@ -72,8 +80,10 @@
     min_support = 0.1
 
     # Run mlxtend
-    df = pd.DataFrame(transactions.astype(bool),
-                      columns=[f'i{i}' for i in range(transactions.shape[1])])
+    df = pd.DataFrame(
+        transactions.astype(bool),
+        columns=[f"i{i}" for i in range(transactions.shape[1])],
+    )
     mlxtend_result = mlxtend_fpgrowth(df, min_support=min_support, use_colnames=False)
     mlxtend_count = len(mlxtend_result)
 
@@ -81,8 +91,10 @@
     streaming_result = run_streaming_fp_growth(transactions, min_support)
     streaming_count = count_itemsets(streaming_result)
 
-    assert streaming_count == mlxtend_count, \
-        f"Count mismatch: streaming={streaming_count}, mlxtend={mlxtend_count}"
+    assert (
+        streaming_count == mlxtend_count
+    ), f"Count mismatch: streaming={streaming_count}, mlxtend={mlxtend_count}"
+
 
 def test_trivial_all_ones():
     """Test trivial case: all 1s dataset."""
@@ -98,17 +110,22 @@
     regular_result = priors.fp_growth(transactions, min_support)
     regular_count = count_itemsets(regular_result)
 
-    assert streaming_count == regular_count, \
-        f"Count mismatch: streaming={streaming_count}, regular={regular_count}"
+    assert (
+        streaming_count == regular_count
+    ), f"Count mismatch: streaming={streaming_count}, regular={regular_count}"
+
 
 def test_scaled_dataset():
     """Test scaled dataset: multiply small known dataset by 1000x."""
     # Create base dataset
-    base_transactions = np.array([
-        [1, 1, 0, 1],
-        [1, 0, 1, 1],
-        [0, 1, 1, 1],
-    ], dtype=np.int32)
+    base_transactions = np.array(
+        [
+            [1, 1, 0, 1],
+            [1, 0, 1, 1],
+            [0, 1, 1, 1],
+        ],
+        dtype=np.int32,
+    )
 
     # Scale it by repeating 1000 times
     transactions = np.tile(base_transactions, (1000, 1))
@@ -119,11 +136,15 @@
     base_count = count_itemsets(base_result)
 
     # Run streaming on scaled
-    streaming_result = run_streaming_fp_growth(transactions, min_support, chunk_size=500)
+    streaming_result = run_streaming_fp_growth(
+        transactions, min_support, chunk_size=500
+    )
     streaming_count = count_itemsets(streaming_result)
 
-    assert streaming_count == base_count, \
-        f"Count mismatch: streaming={streaming_count}, base={base_count}"
+    assert (
+        streaming_count == base_count
+    ), f"Count mismatch: streaming={streaming_count}, base={base_count}"
+
 
 def test_different_chunk_sizes():
     """Test that different chunk sizes produce same results."""
@@ -140,8 +161,9 @@
     result3 = run_streaming_fp_growth(transactions, min_support, chunk_size=200)
     count3 = count_itemsets(result3)
 
-    assert count1 == count2 == count3, \
-        f"Chunk size mismatch: 50={count1}, 100={count2}, 200={count3}"
+    assert (
+        count1 == count2 == count3
+    ), f"Chunk size mismatch: 50={count1}, 100={count2}, 200={count3}"
 
 
 # ============================================================================
@@ -150,10 +172,11 @@ def test_different_chunk_sizes():
 
 # Test streaming FP-Growth on large datasets.
 
+
 @pytest.mark.slow
 def test_10m_transactions():
     """Test with 10M+ transactions using generator."""
-    if not hasattr(priors, 'create_lazy_fp_growth'):
+    if not hasattr(priors, "create_lazy_fp_growth"):
         pytest.skip("Lazy FP-Growth functions not available")
 
     num_transactions = 10_000_000
@@ -169,7 +192,9 @@
     # Counting phase
     for i in range(0, num_transactions, chunk_size):
         actual_chunk_size = min(chunk_size, num_transactions - i)
-        chunk = generate_transactions(actual_chunk_size, num_items, avg_size, seed=i)
+        chunk = generate_transactions(
+            actual_chunk_size, num_items, avg_size, seed=i
+        )
         priors.lazy_count_pass(pid, chunk)
 
     # Finalize counts
@@ -178,7 +203,9 @@
     # Building phase
     for i in range(0, num_transactions, chunk_size):
         actual_chunk_size = min(chunk_size, num_transactions - i)
-        chunk = generate_transactions(actual_chunk_size, num_items, avg_size, seed=i)
+        chunk = generate_transactions(
+            actual_chunk_size, num_items, avg_size, seed=i
+        )
         priors.lazy_build_pass(pid, chunk)
 
     priors.lazy_finalize_building(pid)
@@ -197,4 +224,4 @@
         assert regular_count > 0, "Regular should find itemsets on sample"
 
     finally:
-        priors.lazy_cleanup(pid)
\ No newline at end of file
+        priors.lazy_cleanup(pid)
diff --git a/priors/utils.py b/priors/utils.py
index d79b62b..9881aaf 100644
--- a/priors/utils.py
+++ b/priors/utils.py
@@ -25,13 +25,19 @@ def count_itemsets(result):
     if isinstance(result, tuple) and len(result) == 2:
         itemsets_list, _ = result
         if isinstance(itemsets_list, list):
-            return sum(level.shape[0] for level in itemsets_list
-                       if level is not None and hasattr(level, 'shape') and level.shape[0] > 0)
+            return sum(
+                level.shape[0]
+                for level in itemsets_list
+                if level is not None and hasattr(level, "shape") and level.shape[0] > 0
+            )
 
     # Handle old list format
     if isinstance(result, list):
-        return sum(level.shape[0] for level in result
-                   if level is not None and hasattr(level, 'shape') and level.shape[0] > 0)
-    if hasattr(result, 'shape'):
+        return sum(
+            level.shape[0]
+            for level in result
+            if level is not None and hasattr(level, "shape") and level.shape[0] > 0
+        )
+    if hasattr(result, "shape"):
         return result.shape[0]
     return 0
@@ -88,7 +94,7 @@ def extract_itemsets_from_mlxtend(mlxtend_result):
     itemsets = set()
     if mlxtend_result is not None and len(mlxtend_result) > 0:
         for _, row in mlxtend_result.iterrows():
-            itemset = tuple(sorted(row['itemsets']))
+            itemset = tuple(sorted(row["itemsets"]))
             itemsets.add(itemset)
     return itemsets
@@ -125,7 +131,7 @@ def extract_itemsets_from_priors(priors_result):
     if priors_result is not None:
         if isinstance(priors_result, list):
             for level_idx, level in enumerate(priors_result):
-                if level is not None and hasattr(level, 'shape') and level.shape[0] > 0:
+                if level is not None and hasattr(level, "shape") and level.shape[0] > 0:
                     for i in range(level.shape[0]):
                         itemsets.add((level_idx, i))
     return itemsets
@@ -147,9 +153,9 @@ def extract_itemsets_from_result(result):
     if isinstance(result, list):
         for level_idx, level in enumerate(result):
-            if level is not None and hasattr(level, 'shape') and level.shape[0] > 0:
+            if level is not None and hasattr(level, "shape") and level.shape[0] > 0:
                 for i in range(level.shape[0]):
-                    if hasattr(level, '__getitem__'):
+                    if hasattr(level, "__getitem__"):
                         itemsets.append(tuple(sorted(level[i])))
                     else:
                         itemsets.append(tuple(range(level_idx + 1)))
@@ -179,16 +185,13 @@ def fp_growth_to_dataframe(itemsets_list, supports_list, num_transactions):
             all_itemsets.append(itemset)
             all_supports.append(support)
 
-    df = pd.DataFrame({
-        'support': all_supports,
-        'itemsets': all_itemsets
-    })
+    df = pd.DataFrame({"support": all_supports, "itemsets": all_itemsets})
 
     # Sort by support descending, then by length, then by sorted tuple representation
     # This matches mlxtend's ordering
-    df['_len'] = df['itemsets'].apply(len)
-    df['_sorted'] = df['itemsets'].apply(lambda x: tuple(sorted(x)))
-    df = df.sort_values(['support', '_len', '_sorted'], ascending=[False, True, True])
-    df = df.drop(columns=['_len', '_sorted']).reset_index(drop=True)
+    df["_len"] = df["itemsets"].apply(len)
+    df["_sorted"] = df["itemsets"].apply(lambda x: tuple(sorted(x)))
+    df = df.sort_values(["support", "_len", "_sorted"], ascending=[False, True, True])
+    df = df.drop(columns=["_len", "_sorted"]).reset_index(drop=True)
 
     return df