diff --git a/README.md b/README.md index 6631415..50e297b 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,111 @@ +
+ ``` ___ _ ___ / __\ ___ __| | ___ /\/\ / \ / / / _ \ / _` | / _ \ / \ / /\ / / /___ | (_) || (_| || __/ / /\/\ \ / /_// \____/ \___/ \__,_| \___| \/ \//___,' -``` -# codemd -Transform code repositories into markdown-formatted strings ready for LLM prompting. Easily convert your entire codebase into a format that's optimal for code-related prompts with LLMs like GPT-4, Claude, etc. +Ver. 0.0.2 +```` + +# CodeMD + +🚀 Transform code repositories into markdown-formatted strings ready for LLM prompting [![Tests](https://github.com/dotpyu/codemd/actions/workflows/tests.yml/badge.svg)](https://github.com/dotpyu/codemd/actions/workflows/tests.yml) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +
+ +## 📝 Overview + +CodeMD helps you convert your entire codebase into a format that's optimal for code-related prompts with Large Language Models (LLMs) like GPT-4, Claude, and others. It automatically processes your code files and outputs them in a clean, markdown-formatted structure that's perfect for LLM interactions. + +## ✨ Features + +- 🔍 **Smart Directory Scanning**: Recursively scans directories for code files +- 🎯 **Flexible Configuration**: + - Configurable file extensions + - File and pattern exclusion support + - Custom .gitignore support +- 📊 **Intelligent Output**: + - Markdown-formatted code blocks + - Preserved directory structure + - Repository structure visualization + - Token count estimation (with tiktoken) +- 📋 **Convenience**: + - Simple command-line interface + - Direct copy-to-clipboard support + - Multiple output options + +### 🎉 Recent Updates -## Features -- Recursively scans directories for code files -- Configurable file extensions -- File and pattern exclusion support -- Markdown-formatted output -- Preserves directory structure in headers -- Simple command-line interface -- Token count estimation (tiktoken package required) -- Direct copy to clipboard +- ⭐ **NEW**: Repository structure visualization (disable with `--no-structure`) +- ⭐ **NEW**: Automatic .gitignore support + - Uses project's .gitignore by default + - Custom .gitignore files via `--gitignore` + - Disable with `--ignore-gitignore` -## Installation -Install from pip +## 🚀 Installation ```bash pip install codemd ``` -or install from source + +or install from source! + ```bash git clone https://github.com/dotpyu/codemd.git cd codemd pip install -e . ``` -## Usage -Basic usage: +## 📖 Usage + +### Command Line Interface + +**Basic Usage:** ```bash codemd /path/to/your/code ``` -With custom extensions and output file: +**Custom Extensions and Output:** ```bash codemd /path/to/your/code -e py,java,sql -o output.md ``` -Exclude specific patterns or files: +**Pattern Exclusion:** ```bash codemd /path/to/your/code \ --exclude-patterns "test_,debug_" \ --exclude-extensions "test.py,spec.js" ``` -As a Python package: -```python -from codemd import CodeScanner +**.gitignore Configuration:** +```bash +# Use custom gitignore files +codemd /path/to/your/code --gitignore .gitignore .custom-ignore -scanner = CodeScanner( - extensions={'py', 'java'}, - exclude_patterns={'test_'}, - exclude_extensions={'test.py'} -) -markdown_string = scanner.scan_directory('./my_project') +# Disable gitignore processing +codemd /path/to/your/code --ignore-gitignore ``` +## 🤝 Contributing + +Contributions are welcome! Feel free to open issues or submit pull requests. + +1. Fork the repository +2. Create your feature branch (`git checkout -b feature/amazing-feature`) +3. Commit your changes (`git commit -m 'Add amazing feature'`) +4. Push to the branch (`git push origin feature/amazing-feature`) +5. Open a Pull Request + +## 📄 License + +Distributed under the Apache 2.0 License. See `LICENSE` for more information. -## License -Distributed under the Apache 2 License. See `LICENSE` for more information. +--- +
+Made with ❤️ by Peilin +
diff --git a/pyproject.toml b/pyproject.toml index f3d0f45..f17cd7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "codemd" -version = "0.0.1" +version = "0.0.2" authors = [ { name = "Peilin Yu", email = "peilin_yu@brown.edu" }, ] diff --git a/requirements.txt b/requirements.txt index 0dd5bcc..0fc82aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ pathlib>=1.0.1 -typing-extensions>=4.0.0 \ No newline at end of file +typing-extensions>=4.0.0 +pathspec>=0.11.0 \ No newline at end of file diff --git a/setup.py b/setup.py index c7b3d93..5f76d85 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="codemd", - version="0.0.1", + version="0.0.2", author="Peilin Yu", author_email="peilin_yu@brown.edu", description="Transform code repositories into markdown-formatted strings ready for LLM prompting", diff --git a/src/codemd/__init__.py b/src/codemd/__init__.py index b4bce7b..763546a 100644 --- a/src/codemd/__init__.py +++ b/src/codemd/__init__.py @@ -1,5 +1,5 @@ from .cli import main from .scanner import CodeScanner -__version__ = "0.0.1" +__version__ = "0.0.2" __all__ = ["CodeScanner", "main"] \ No newline at end of file diff --git a/src/codemd/cli.py b/src/codemd/cli.py index d2356c3..feece11 100644 --- a/src/codemd/cli.py +++ b/src/codemd/cli.py @@ -38,6 +38,15 @@ # Non-recursive scan with custom output codemd /path/to/code --no-recursive -o custom.md + + # Disable structure output + codemd /path/to/code --no-structure + + # Use specific gitignore files + codemd /path/to/code --gitignore .gitignore .custom-ignore + + # Disable gitignore processing + codemd /path/to/code --ignore-gitignore """ @@ -50,55 +59,37 @@ def parse_arguments() -> argparse.Namespace: epilog=EPILOG ) - parser.add_argument( - 'directory', - type=str, - help='Directory to scan' - ) + parser.add_argument('directory', type=str, help='Directory to scan') + parser.add_argument('-e', '--extensions', type=str, default='py,java,js,cpp,c,h,hpp', + help='Comma-separated list of file extensions to include (without dots)') + parser.add_argument('--exclude-patterns', type=str, default='', + help='Comma-separated list of patterns to exclude (e.g., test_,debug_)') + parser.add_argument('--exclude-extensions', type=str, default='', + help='Comma-separated list of file patterns to exclude (e.g., test.py,spec.js)') + parser.add_argument('-o', '--output', type=str, default=None, + help='Output file path (if not specified, prints to stdout)') + parser.add_argument('--no-recursive', action='store_true', + help='Disable recursive directory scanning') + parser.add_argument('-v', '--verbose', action='store_true', + help='Enable verbose output') + parser.add_argument('--no-structure', action='store_true', + help='Disable repository structure output') parser.add_argument( - '-e', '--extensions', + '--gitignore', type=str, - default='py,java,js,cpp,c,h,hpp', - help='Comma-separated list of file extensions to include (without dots)' + nargs='+', + help='Specify one or more .gitignore files to use' ) parser.add_argument( - '--exclude-patterns', - type=str, - default='', - help='Comma-separated list of patterns to exclude (e.g., test_,debug_)' - ) - - parser.add_argument( - '--exclude-extensions', - type=str, - default='', - help='Comma-separated list of file patterns to exclude (e.g., test.py,spec.js)' - ) - - parser.add_argument( - '-o', '--output', - type=str, - default=None, - help='Output file path (if not specified, prints to stdout)' - ) - - parser.add_argument( - '--no-recursive', - action='store_true', - help='Disable recursive directory scanning' - ) - - parser.add_argument( - '-v', '--verbose', + '--ignore-gitignore', action='store_true', - help='Enable verbose output' + help='Disable .gitignore processing' ) return parser.parse_args() - def str_to_set(s: str) -> Set[str]: """Convert comma-separated string to set of strings.""" return {item.strip() for item in s.split(',') if item.strip()} @@ -212,12 +203,11 @@ def format_token_info(token_count: int, model_name: str) -> str: def main() -> int: print(BANNER) - print("Version 0.0.1") + print("Version 0.0.2") print("Transform your code into LLM-ready prompts\n") try: args = parse_arguments() - directory = Path(args.directory) output_file = Path(args.output) if args.output else None @@ -232,27 +222,17 @@ def main() -> int: exclude_patterns = str_to_set(args.exclude_patterns) exclude_extensions = str_to_set(args.exclude_extensions) - if args.verbose: - print("Configuration:") - print(f" Directory: {directory}") - print(f" Extensions: {', '.join(sorted(extensions))}") - if exclude_patterns: - print(f" Exclude patterns: {', '.join(sorted(exclude_patterns))}") - if exclude_extensions: - print(f" Exclude extensions: {', '.join(sorted(exclude_extensions))}") - if output_file: - print(f" Output: {output_file}") - else: - print(" Output: stdout") - print(f" Recursive: {not args.no_recursive}") - print("\nProcessing files...") - scanner = CodeScanner( extensions=extensions, exclude_patterns=exclude_patterns, - exclude_extensions=exclude_extensions + exclude_extensions=exclude_extensions, + gitignore_files=args.gitignore, + ignore_gitignore=args.ignore_gitignore ) + scanner.no_structure = args.no_structure + + try: content = scanner.scan_directory( directory, diff --git a/src/codemd/scanner.py b/src/codemd/scanner.py index 80446bd..ebfadc9 100644 --- a/src/codemd/scanner.py +++ b/src/codemd/scanner.py @@ -1,30 +1,63 @@ import argparse import sys from pathlib import Path -from typing import Set, Optional +import pathspec +from typing import Set, Optional, List class CodeScanner: def __init__(self, extensions: Optional[Set[str]] = None, exclude_patterns: Optional[Set[str]] = None, - exclude_extensions: Optional[Set[str]] = None): - """ - Initialize the CodeScanner with optional file extensions to filter. - - Args: - extensions: Set of file extensions to include (without dots), e.g. {'py', 'java'} - exclude_patterns: Set of filename patterns to exclude, e.g. {'test_', 'debug_'} - exclude_extensions: Set of file extensions to exclude (without dots), e.g. {'test.py', 'spec.js'} - """ + exclude_extensions: Optional[Set[str]] = None, + gitignore_files: Optional[List[str]] = None, + ignore_gitignore: bool = False): + """Initialize the CodeScanner with optional file extensions to filter.""" self.extensions = extensions or {'py', 'java', 'js', 'cpp', 'c', 'h', 'hpp'} self.exclude_patterns = exclude_patterns or set() self.exclude_extensions = exclude_extensions or set() + self.no_structure = False + self.gitspec = None + self.base_dir = None + + if not ignore_gitignore: + self.load_gitignore(gitignore_files) + + def load_gitignore(self, gitignore_files: Optional[List[str]] = None): + """Load gitignore patterns from specified files or default .gitignore""" + patterns = [] + + if gitignore_files: + for gitignore_path in gitignore_files: + try: + with open(gitignore_path, 'r') as f: + patterns.extend(f.readlines()) + except Exception as e: + print(f"Warning: Could not read gitignore file {gitignore_path}: {str(e)}", + file=sys.stderr) + + elif Path('.gitignore').exists(): + try: + with open('.gitignore', 'r') as f: + patterns.extend(f.readlines()) + except Exception as e: + print(f"Warning: Could not read default .gitignore: {str(e)}", + file=sys.stderr) + + if patterns: + self.gitspec = pathspec.PathSpec.from_lines('gitwildmatch', patterns) def should_include_file(self, file_path: Path) -> bool: - """ - Check if a file should be included based on extensions and exclusion patterns. - """ + """Check if a file should be included based on extensions and exclusion patterns.""" + if self.gitspec is not None and self.base_dir is not None: + try: + rel_path = str(file_path.relative_to(self.base_dir)) + if self.gitspec.match_file(rel_path): + return False + except ValueError: + if self.gitspec.match_file(str(file_path)): + return False + if file_path.suffix.lstrip('.') not in self.extensions: return False @@ -38,23 +71,61 @@ def should_include_file(self, file_path: Path) -> bool: return True - def scan_directory(self, directory: str, recursive: bool = True) -> str: + def generate_structure(self, path: Path, depth: int = 0) -> str: """ - Scan a directory for code files and merge them into a markdown-formatted string. - - Args: - directory: Path to the directory to scan - recursive: Whether to scan subdirectories recursively - - Returns: - A markdown-formatted string containing all code files + Generate a tree-like structure of the codebase, respecting gitignore patterns. """ + structure = [] + + try: + items = sorted(path.iterdir()) + for item in items: + if item.name.startswith('.'): + continue + + if self.gitspec is not None: + try: + rel_path = str(item.relative_to(self.base_dir)) + if self.gitspec.match_file(rel_path): + continue + except ValueError: + if self.gitspec.match_file(str(item)): + continue + + indent = " " * depth + safe_name = item.name.replace('*', '\\*').replace('_', '\\_') + + if item.is_dir(): + substructure = self.generate_structure(item, depth + 1) + if substructure: + structure.append(f"{indent}* **{safe_name}/**") + structure.extend(substructure.split('\n')) + elif self.should_include_file(item): + structure.append(f"{indent}* {safe_name}") + + except PermissionError: + pass + + return '\n'.join(structure) + + def scan_directory(self, directory: str, recursive: bool = True) -> str: + """Scan a directory for code files and merge them into a markdown-formatted string.""" directory_path = Path(directory) if not directory_path.exists(): raise ValueError(f"Directory {directory} does not exist") + self.base_dir = directory_path.resolve() + merged_content = [] + if not self.no_structure: + structure = ["# Repository Structure"] + dir_structure = self.generate_structure(directory_path) + if dir_structure: + structure.append(dir_structure) + structure.append("") + merged_content.extend(structure) + if recursive: files = list(directory_path.rglob("*")) else: @@ -85,6 +156,8 @@ def scan_directory(self, directory: str, recursive: bool = True) -> str: return "\n".join(merged_content) + + def parse_arguments(): parser = argparse.ArgumentParser( description='Scan directory for code files and create a markdown-formatted output' diff --git a/tests/test_scanner.py b/tests/test_scanner.py index 9c98650..9f22d0a 100644 --- a/tests/test_scanner.py +++ b/tests/test_scanner.py @@ -7,9 +7,7 @@ class TestCodeScanner: @pytest.fixture def temp_dir(self): - """Create a temporary directory with some test files.""" with tempfile.TemporaryDirectory() as tmpdir: - # Create some test files files = { 'main.py': 'print("Hello")\n', 'test_main.py': 'def test_hello(): pass\n', @@ -17,6 +15,8 @@ def temp_dir(self): 'debug_utils.py': 'def debug(): pass\n', 'spec.js': 'describe("test", () => {})\n', 'subdir/nested.py': 'def nested(): pass\n', + 'venv/lib/site-packages/pkg.py': 'package code\n', + '__pycache__/cache.pyc': 'cache\n', } for file_path, content in files.items(): @@ -54,6 +54,19 @@ def test_scan_directory_basic(self, temp_dir): scanner = CodeScanner(extensions={'py'}) content = scanner.scan_directory(temp_dir) + expected_structure = """# Repository Structure +* debug\_utils.py +* main.py +* **subdir/** + * nested.py +* test\_main.py""" + + lines = content.split('\n') + structure_start = lines.index('# Repository Structure') + structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start + actual_structure = '\n'.join(lines[structure_start:structure_end]) + + assert actual_structure == expected_structure assert '# main.py' in content assert 'print("Hello")' in content assert '# subdir/nested.py' in content @@ -66,16 +79,87 @@ def test_scan_directory_exclusions(self, temp_dir): ) content = scanner.scan_directory(temp_dir) + expected_structure = """# Repository Structure +* main.py +* **subdir/** + * nested.py""" + + lines = content.split('\n') + structure_start = lines.index('# Repository Structure') + structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start + actual_structure = '\n'.join(lines[structure_start:structure_end]) + + assert actual_structure == expected_structure assert '# main.py' in content assert 'test_main.py' not in content assert 'debug_utils.py' not in content - def test_scan_directory_non_recursive(self, temp_dir): + def test_scan_directory_no_structure(self, temp_dir): scanner = CodeScanner(extensions={'py'}) - content = scanner.scan_directory(temp_dir, recursive=False) + scanner.no_structure = True + content = scanner.scan_directory(temp_dir) - assert '# main.py' in content - assert 'nested.py' not in content + assert '# Repository Structure' not in content + assert content.split('\n')[0].startswith('# ') + assert '.py' in content.split('\n')[0] + + def test_scan_directory_gitignore(self, temp_dir): + gitignore_content = """ +venv/ +__pycache__/ +*.pyc +""" + gitignore_path = Path(temp_dir) / '.gitignore' + gitignore_path.write_text(gitignore_content) + + scanner = CodeScanner(extensions={'py'}) + content = scanner.scan_directory(temp_dir) + + expected_structure = """# Repository Structure +* debug\_utils.py +* main.py +* **subdir/** + * nested.py +* test\_main.py""" + + lines = content.split('\n') + structure_start = lines.index('# Repository Structure') + structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start + actual_structure = '\n'.join(lines[structure_start:structure_end]) + + assert actual_structure == expected_structure + assert 'venv' not in content + assert '__pycache__' not in content + assert '.pyc' not in content + + def test_scan_directory_custom_gitignore(self, temp_dir): + custom_ignore = """ +*.py +!main.py +!nested.py +""" + custom_ignore_path = Path(temp_dir) / '.custom-ignore' + custom_ignore_path.write_text(custom_ignore) + + scanner = CodeScanner( + extensions={'py'}, + gitignore_files=[str(custom_ignore_path)] + ) + content = scanner.scan_directory(temp_dir) + + expected_structure = """# Repository Structure +* main.py +* **subdir/** + * nested.py""" + + lines = content.split('\n') + structure_start = lines.index('# Repository Structure') + structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start + actual_structure = '\n'.join(lines[structure_start:structure_end]) + + assert actual_structure == expected_structure + assert 'test_main.py' not in content + assert 'debug_utils.py' not in content def test_scan_directory_invalid_path(self): scanner = CodeScanner() @@ -83,11 +167,13 @@ def test_scan_directory_invalid_path(self): scanner.scan_directory('/nonexistent/path') def test_scan_directory_encoding_error(self, temp_dir): - # Create a file with invalid encoding bad_file = Path(temp_dir) / 'bad.py' with open(bad_file, 'wb') as f: f.write(b'\x80invalid') scanner = CodeScanner() content = scanner.scan_directory(temp_dir) - assert '# main.py' in content \ No newline at end of file + + assert '# main.py' in content + assert 'bad.py' in content + assert b'\x80invalid' not in content.encode() \ No newline at end of file