diff --git a/README.md b/README.md
index 6631415..50e297b 100644
--- a/README.md
+++ b/README.md
@@ -1,70 +1,111 @@
+
+
```
___ _ ___
/ __\ ___ __| | ___ /\/\ / \
/ / / _ \ / _` | / _ \ / \ / /\ /
/ /___ | (_) || (_| || __/ / /\/\ \ / /_//
\____/ \___/ \__,_| \___| \/ \//___,'
-```
-# codemd
-Transform code repositories into markdown-formatted strings ready for LLM prompting. Easily convert your entire codebase into a format that's optimal for code-related prompts with LLMs like GPT-4, Claude, etc.
+Ver. 0.0.2
+```
+
+# CodeMD
+
+🚀 Transform code repositories into markdown-formatted strings ready for LLM prompting
[](https://github.com/dotpyu/codemd/actions/workflows/tests.yml)
[](https://opensource.org/licenses/Apache-2.0)
+
+
+## 📝 Overview
+
+CodeMD helps you convert your entire codebase into a format that's optimal for code-related prompts with Large Language Models (LLMs) like GPT-4, Claude, and others. It automatically processes your code files and outputs them in a clean, markdown-formatted structure that's perfect for LLM interactions.
+
+## ✨ Features
+
+- 🔍 **Smart Directory Scanning**: Recursively scans directories for code files
+- 🎯 **Flexible Configuration**:
+ - Configurable file extensions
+ - File and pattern exclusion support
+ - Custom .gitignore support
+- 📊 **Intelligent Output**:
+ - Markdown-formatted code blocks
+ - Preserved directory structure
+ - Repository structure visualization
+ - Token count estimation (requires the `tiktoken` package)
+- 📋 **Convenience**:
+ - Simple command-line interface
+ - Direct copy-to-clipboard support
+ - Multiple output options
+
+### 🎉 Recent Updates
-## Features
-- Recursively scans directories for code files
-- Configurable file extensions
-- File and pattern exclusion support
-- Markdown-formatted output
-- Preserves directory structure in headers
-- Simple command-line interface
-- Token count estimation (tiktoken package required)
-- Direct copy to clipboard
+- ⭐ **NEW**: Repository structure visualization (disable with `--no-structure`)
+- ⭐ **NEW**: Automatic .gitignore support
+ - Uses the `.gitignore` in the current working directory by default
+ - Custom .gitignore files via `--gitignore`
+ - Disable with `--ignore-gitignore`
-## Installation
-Install from pip
+## 🚀 Installation
```bash
pip install codemd
```
-or install from source
+
+Or install from source:
+
```bash
git clone https://github.com/dotpyu/codemd.git
cd codemd
pip install -e .
```
-## Usage
-Basic usage:
+## 📖 Usage
+
+### Command Line Interface
+
+**Basic Usage:**
```bash
codemd /path/to/your/code
```
-With custom extensions and output file:
+**Custom Extensions and Output:**
```bash
codemd /path/to/your/code -e py,java,sql -o output.md
```
-Exclude specific patterns or files:
+**Pattern Exclusion:**
```bash
codemd /path/to/your/code \
--exclude-patterns "test_,debug_" \
--exclude-extensions "test.py,spec.js"
```
-As a Python package:
-```python
-from codemd import CodeScanner
+**.gitignore Configuration:**
+```bash
+# Use custom gitignore files
+codemd /path/to/your/code --gitignore .gitignore .custom-ignore
-scanner = CodeScanner(
- extensions={'py', 'java'},
- exclude_patterns={'test_'},
- exclude_extensions={'test.py'}
-)
-markdown_string = scanner.scan_directory('./my_project')
+# Disable gitignore processing
+codemd /path/to/your/code --ignore-gitignore
```
+## 🤝 Contributing
+
+Contributions are welcome! Feel free to open issues or submit pull requests.
+
+1. Fork the repository
+2. Create your feature branch (`git checkout -b feature/amazing-feature`)
+3. Commit your changes (`git commit -m 'Add amazing feature'`)
+4. Push to the branch (`git push origin feature/amazing-feature`)
+5. Open a Pull Request
+
+## 📄 License
+
+Distributed under the Apache 2.0 License. See `LICENSE` for more information.
-## License
-Distributed under the Apache 2 License. See `LICENSE` for more information.
+---
+
+Made with ❤️ by Peilin
+
diff --git a/pyproject.toml b/pyproject.toml
index f3d0f45..f17cd7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "codemd"
-version = "0.0.1"
+version = "0.0.2"
authors = [
{ name = "Peilin Yu", email = "peilin_yu@brown.edu" },
]
diff --git a/requirements.txt b/requirements.txt
index 0dd5bcc..0fc82aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
pathlib>=1.0.1
-typing-extensions>=4.0.0
\ No newline at end of file
+typing-extensions>=4.0.0
+pathspec>=0.11.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c7b3d93..5f76d85 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
setup(
name="codemd",
- version="0.0.1",
+ version="0.0.2",
author="Peilin Yu",
author_email="peilin_yu@brown.edu",
description="Transform code repositories into markdown-formatted strings ready for LLM prompting",
diff --git a/src/codemd/__init__.py b/src/codemd/__init__.py
index b4bce7b..763546a 100644
--- a/src/codemd/__init__.py
+++ b/src/codemd/__init__.py
@@ -1,5 +1,5 @@
from .cli import main
from .scanner import CodeScanner
-__version__ = "0.0.1"
+__version__ = "0.0.2"
__all__ = ["CodeScanner", "main"]
\ No newline at end of file
diff --git a/src/codemd/cli.py b/src/codemd/cli.py
index d2356c3..feece11 100644
--- a/src/codemd/cli.py
+++ b/src/codemd/cli.py
@@ -38,6 +38,15 @@
# Non-recursive scan with custom output
codemd /path/to/code --no-recursive -o custom.md
+
+ # Disable structure output
+ codemd /path/to/code --no-structure
+
+ # Use specific gitignore files
+ codemd /path/to/code --gitignore .gitignore .custom-ignore
+
+ # Disable gitignore processing
+ codemd /path/to/code --ignore-gitignore
"""
@@ -50,55 +59,37 @@ def parse_arguments() -> argparse.Namespace:
epilog=EPILOG
)
- parser.add_argument(
- 'directory',
- type=str,
- help='Directory to scan'
- )
+ parser.add_argument('directory', type=str, help='Directory to scan')
+ parser.add_argument('-e', '--extensions', type=str, default='py,java,js,cpp,c,h,hpp',
+ help='Comma-separated list of file extensions to include (without dots)')
+ parser.add_argument('--exclude-patterns', type=str, default='',
+ help='Comma-separated list of patterns to exclude (e.g., test_,debug_)')
+ parser.add_argument('--exclude-extensions', type=str, default='',
+ help='Comma-separated list of file patterns to exclude (e.g., test.py,spec.js)')
+ parser.add_argument('-o', '--output', type=str, default=None,
+ help='Output file path (if not specified, prints to stdout)')
+ parser.add_argument('--no-recursive', action='store_true',
+ help='Disable recursive directory scanning')
+ parser.add_argument('-v', '--verbose', action='store_true',
+ help='Enable verbose output')
+ parser.add_argument('--no-structure', action='store_true',
+ help='Disable repository structure output')
parser.add_argument(
- '-e', '--extensions',
+ '--gitignore',
type=str,
- default='py,java,js,cpp,c,h,hpp',
- help='Comma-separated list of file extensions to include (without dots)'
+ nargs='+',
+ help='Specify one or more .gitignore files to use'
)
parser.add_argument(
- '--exclude-patterns',
- type=str,
- default='',
- help='Comma-separated list of patterns to exclude (e.g., test_,debug_)'
- )
-
- parser.add_argument(
- '--exclude-extensions',
- type=str,
- default='',
- help='Comma-separated list of file patterns to exclude (e.g., test.py,spec.js)'
- )
-
- parser.add_argument(
- '-o', '--output',
- type=str,
- default=None,
- help='Output file path (if not specified, prints to stdout)'
- )
-
- parser.add_argument(
- '--no-recursive',
- action='store_true',
- help='Disable recursive directory scanning'
- )
-
- parser.add_argument(
- '-v', '--verbose',
+ '--ignore-gitignore',
action='store_true',
- help='Enable verbose output'
+ help='Disable .gitignore processing'
)
return parser.parse_args()
-
def str_to_set(s: str) -> Set[str]:
"""Convert comma-separated string to set of strings."""
return {item.strip() for item in s.split(',') if item.strip()}
@@ -212,12 +203,11 @@ def format_token_info(token_count: int, model_name: str) -> str:
def main() -> int:
print(BANNER)
- print("Version 0.0.1")
+ print("Version 0.0.2")
print("Transform your code into LLM-ready prompts\n")
try:
args = parse_arguments()
-
directory = Path(args.directory)
output_file = Path(args.output) if args.output else None
@@ -232,27 +222,17 @@ def main() -> int:
exclude_patterns = str_to_set(args.exclude_patterns)
exclude_extensions = str_to_set(args.exclude_extensions)
- if args.verbose:
- print("Configuration:")
- print(f" Directory: {directory}")
- print(f" Extensions: {', '.join(sorted(extensions))}")
- if exclude_patterns:
- print(f" Exclude patterns: {', '.join(sorted(exclude_patterns))}")
- if exclude_extensions:
- print(f" Exclude extensions: {', '.join(sorted(exclude_extensions))}")
- if output_file:
- print(f" Output: {output_file}")
- else:
- print(" Output: stdout")
- print(f" Recursive: {not args.no_recursive}")
- print("\nProcessing files...")
-
scanner = CodeScanner(
extensions=extensions,
exclude_patterns=exclude_patterns,
- exclude_extensions=exclude_extensions
+ exclude_extensions=exclude_extensions,
+ gitignore_files=args.gitignore,
+ ignore_gitignore=args.ignore_gitignore
)
+ scanner.no_structure = args.no_structure
+
+
try:
content = scanner.scan_directory(
directory,
diff --git a/src/codemd/scanner.py b/src/codemd/scanner.py
index 80446bd..ebfadc9 100644
--- a/src/codemd/scanner.py
+++ b/src/codemd/scanner.py
@@ -1,30 +1,63 @@
import argparse
import sys
from pathlib import Path
-from typing import Set, Optional
+import pathspec
+from typing import Set, Optional, List
class CodeScanner:
def __init__(self,
extensions: Optional[Set[str]] = None,
exclude_patterns: Optional[Set[str]] = None,
- exclude_extensions: Optional[Set[str]] = None):
- """
- Initialize the CodeScanner with optional file extensions to filter.
-
- Args:
- extensions: Set of file extensions to include (without dots), e.g. {'py', 'java'}
- exclude_patterns: Set of filename patterns to exclude, e.g. {'test_', 'debug_'}
- exclude_extensions: Set of file extensions to exclude (without dots), e.g. {'test.py', 'spec.js'}
- """
+ exclude_extensions: Optional[Set[str]] = None,
+ gitignore_files: Optional[List[str]] = None,
+ ignore_gitignore: bool = False):
+ """Initialize the CodeScanner with optional file extensions to filter."""
self.extensions = extensions or {'py', 'java', 'js', 'cpp', 'c', 'h', 'hpp'}
self.exclude_patterns = exclude_patterns or set()
self.exclude_extensions = exclude_extensions or set()
+ self.no_structure = False
+ self.gitspec = None
+ self.base_dir = None
+
+ if not ignore_gitignore:
+ self.load_gitignore(gitignore_files)
+
+    def load_gitignore(self, gitignore_files: Optional[List[str]] = None) -> None:
+        """Load gitignore patterns from specified files or default .gitignore.
+
+        Lines from every readable file in ``gitignore_files`` are combined; when
+        none are given, ``.gitignore`` in the current working directory is used.
+        Unreadable files only produce a warning on stderr, and ``self.gitspec``
+        is replaced only if at least one line was read.
+        """
+        sources = []
+        if gitignore_files:
+            sources = [(p, f"gitignore file {p}") for p in gitignore_files]
+        elif Path('.gitignore').exists():
+            sources = [('.gitignore', "default .gitignore")]
+        patterns = []
+        for source, label in sources:
+            try:
+                # Decode explicitly so the result does not depend on the locale.
+                with open(source, 'r', encoding='utf-8') as f:
+                    patterns.extend(f.readlines())
+            except (OSError, ValueError) as e:  # ValueError covers decode errors
+                print(f"Warning: Could not read {label}: {e}", file=sys.stderr)
+        if patterns:
+            self.gitspec = pathspec.PathSpec.from_lines('gitwildmatch', patterns)
def should_include_file(self, file_path: Path) -> bool:
- """
- Check if a file should be included based on extensions and exclusion patterns.
- """
+ """Check if a file should be included based on extensions and exclusion patterns."""
+ if self.gitspec is not None and self.base_dir is not None:
+ try:
+ rel_path = str(file_path.relative_to(self.base_dir))
+ if self.gitspec.match_file(rel_path):
+ return False
+ except ValueError:
+ if self.gitspec.match_file(str(file_path)):
+ return False
+
if file_path.suffix.lstrip('.') not in self.extensions:
return False
@@ -38,23 +71,61 @@ def should_include_file(self, file_path: Path) -> bool:
return True
- def scan_directory(self, directory: str, recursive: bool = True) -> str:
+    def generate_structure(self, path: Path, depth: int = 0) -> str:
         """
-        Scan a directory for code files and merge them into a markdown-formatted string.
-
-        Args:
-            directory: Path to the directory to scan
-            recursive: Whether to scan subdirectories recursively
-
-        Returns:
-            A markdown-formatted string containing all code files
+        Generate a tree-like structure of the codebase, respecting gitignore patterns.
+
+        Returns a Markdown bullet list of the files under ``path`` accepted by
+        ``should_include_file``, indented by ``depth``; dot-prefixed entries and
+        directories without any listed file are left out ('' if nothing is listed).
         """
+        structure = []
+        rel_dir = path  # used as-is when it cannot be expressed relative to base_dir
+        try:
+            if self.gitspec is not None and self.base_dir is not None:
+                # base_dir is resolved by scan_directory while ``path`` may be relative
+                # or reached through a symlink, so resolve before relativising.
+                try:
+                    rel_dir = path.resolve().relative_to(self.base_dir)
+                except ValueError:
+                    pass  # path lies outside base_dir: match on the path as given
+            for item in sorted(path.iterdir()):
+                if item.name.startswith('.'):
+                    continue
+                if self.gitspec is not None and self.gitspec.match_file(str(rel_dir / item.name)):
+                    continue
+
+                indent = " " * depth
+                safe_name = item.name.replace('*', '\\*').replace('_', '\\_')
+                if item.is_dir():
+                    substructure = self.generate_structure(item, depth + 1)
+                    if substructure:
+                        structure.append(f"{indent}* **{safe_name}/**")
+                        structure.extend(substructure.split('\n'))
+                elif self.should_include_file(item):
+                    structure.append(f"{indent}* {safe_name}")
+        except PermissionError:
+            pass  # unreadable entry: return whatever was collected so far
+        return '\n'.join(structure)
+
+ def scan_directory(self, directory: str, recursive: bool = True) -> str:
+ """Scan a directory for code files and merge them into a markdown-formatted string."""
directory_path = Path(directory)
if not directory_path.exists():
raise ValueError(f"Directory {directory} does not exist")
+ self.base_dir = directory_path.resolve()
+
merged_content = []
+ if not self.no_structure:
+ structure = ["# Repository Structure"]
+ dir_structure = self.generate_structure(directory_path)
+ if dir_structure:
+ structure.append(dir_structure)
+ structure.append("")
+ merged_content.extend(structure)
+
if recursive:
files = list(directory_path.rglob("*"))
else:
@@ -85,6 +156,8 @@ def scan_directory(self, directory: str, recursive: bool = True) -> str:
return "\n".join(merged_content)
+
+
def parse_arguments():
parser = argparse.ArgumentParser(
description='Scan directory for code files and create a markdown-formatted output'
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
index 9c98650..9f22d0a 100644
--- a/tests/test_scanner.py
+++ b/tests/test_scanner.py
@@ -7,9 +7,7 @@
class TestCodeScanner:
@pytest.fixture
def temp_dir(self):
- """Create a temporary directory with some test files."""
with tempfile.TemporaryDirectory() as tmpdir:
- # Create some test files
files = {
'main.py': 'print("Hello")\n',
'test_main.py': 'def test_hello(): pass\n',
@@ -17,6 +15,8 @@ def temp_dir(self):
'debug_utils.py': 'def debug(): pass\n',
'spec.js': 'describe("test", () => {})\n',
'subdir/nested.py': 'def nested(): pass\n',
+ 'venv/lib/site-packages/pkg.py': 'package code\n',
+ '__pycache__/cache.pyc': 'cache\n',
}
for file_path, content in files.items():
@@ -54,6 +54,19 @@ def test_scan_directory_basic(self, temp_dir):
scanner = CodeScanner(extensions={'py'})
content = scanner.scan_directory(temp_dir)
+ expected_structure = """# Repository Structure
+* debug\_utils.py
+* main.py
+* **subdir/**
+ * nested.py
+* test\_main.py"""
+
+ lines = content.split('\n')
+ structure_start = lines.index('# Repository Structure')
+ structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start
+ actual_structure = '\n'.join(lines[structure_start:structure_end])
+
+ assert actual_structure == expected_structure
assert '# main.py' in content
assert 'print("Hello")' in content
assert '# subdir/nested.py' in content
@@ -66,16 +79,87 @@ def test_scan_directory_exclusions(self, temp_dir):
)
content = scanner.scan_directory(temp_dir)
+ expected_structure = """# Repository Structure
+* main.py
+* **subdir/**
+ * nested.py"""
+
+ lines = content.split('\n')
+ structure_start = lines.index('# Repository Structure')
+ structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start
+ actual_structure = '\n'.join(lines[structure_start:structure_end])
+
+ assert actual_structure == expected_structure
assert '# main.py' in content
assert 'test_main.py' not in content
assert 'debug_utils.py' not in content
- def test_scan_directory_non_recursive(self, temp_dir):
+ def test_scan_directory_no_structure(self, temp_dir):
scanner = CodeScanner(extensions={'py'})
- content = scanner.scan_directory(temp_dir, recursive=False)
+ scanner.no_structure = True
+ content = scanner.scan_directory(temp_dir)
- assert '# main.py' in content
- assert 'nested.py' not in content
+ assert '# Repository Structure' not in content
+ assert content.split('\n')[0].startswith('# ')
+ assert '.py' in content.split('\n')[0]
+
+    def test_scan_directory_gitignore(self, temp_dir, monkeypatch):
+        gitignore_content = """
+venv/
+__pycache__/
+*.pyc
+"""
+        gitignore_path = Path(temp_dir) / '.gitignore'
+        gitignore_path.write_text(gitignore_content)
+        # CodeScanner loads the default .gitignore from the CWD, so run from temp_dir.
+        monkeypatch.chdir(temp_dir)
+        scanner = CodeScanner(extensions={'py'})
+        content = scanner.scan_directory(temp_dir)
+        # Raw string: "\_" is not a valid escape sequence in a regular literal.
+        expected_structure = r"""# Repository Structure
+* debug\_utils.py
+* main.py
+* **subdir/**
+ * nested.py
+* test\_main.py"""
+
+        lines = content.split('\n')
+        structure_start = lines.index('# Repository Structure')
+        structure_end = next(i for i, line in enumerate(lines[structure_start:]) if not line) + structure_start
+        actual_structure = '\n'.join(lines[structure_start:structure_end])
+        assert actual_structure == expected_structure
+        assert 'venv' not in content
+        assert '__pycache__' not in content
+        assert '.pyc' not in content
+
+    def test_scan_directory_custom_gitignore(self, temp_dir):
+        # Explicit ignore file: drop every .py, then re-include two file names.
+        ignore_file = Path(temp_dir) / '.custom-ignore'
+        ignore_file.write_text("""
+*.py
+!main.py
+!nested.py
+""")
+
+        scanner = CodeScanner(extensions={'py'}, gitignore_files=[str(ignore_file)])
+        content = scanner.scan_directory(temp_dir)
+
+        # The structure section runs from its heading up to the first empty line.
+        lines = content.split('\n')
+        first = lines.index('# Repository Structure')
+        last = first + next(i for i, line in enumerate(lines[first:]) if not line)
+        actual_structure = '\n'.join(lines[first:last])
+
+        expected_structure = '\n'.join([
+            '# Repository Structure',
+            '* main.py',
+            '* **subdir/**',
+            ' * nested.py',
+        ])
+
+        assert actual_structure == expected_structure
+        for excluded in ('test_main.py', 'debug_utils.py'):
+            assert excluded not in content
def test_scan_directory_invalid_path(self):
scanner = CodeScanner()
@@ -83,11 +167,13 @@ def test_scan_directory_invalid_path(self):
scanner.scan_directory('/nonexistent/path')
def test_scan_directory_encoding_error(self, temp_dir):
- # Create a file with invalid encoding
bad_file = Path(temp_dir) / 'bad.py'
with open(bad_file, 'wb') as f:
f.write(b'\x80invalid')
scanner = CodeScanner()
content = scanner.scan_directory(temp_dir)
- assert '# main.py' in content
\ No newline at end of file
+
+ assert '# main.py' in content
+ assert 'bad.py' in content
+ assert b'\x80invalid' not in content.encode()
\ No newline at end of file