diff --git a/README.md b/README.md index 0c90706..aadc38b 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,8 @@ Course PDFs can be ingested with Docling and indexed into a local Chroma store. For explicit maze-search assignments that require `maze_solvers.py`, the generator now emits a working Python maze project with BFS, DFS, A* implementations, a benchmark script, a sample maze file, generated tests, and downloaded linked maze artifacts when the assignment brief exposes maze text files. +For ML project briefs, the generator now produces a project-oriented Python scaffold with a tailored report template, a presentation outline, NSL-KDD plus `kagglehub` starter code, EDA helpers, baseline model-training scripts, and validation tests instead of only the generic `main.py` placeholder. + ## MCP Server The project also exposes an MCP stdio server so other agents and MCP-compatible clients can invoke the workflow directly. diff --git a/scaffolding/templates.py b/scaffolding/templates.py index 200f0ee..68cb278 100644 --- a/scaffolding/templates.py +++ b/scaffolding/templates.py @@ -930,6 +930,442 @@ def _extend_python_requirements(existing_requirements: str, extra_requirements: return "\n".join(lines) + "\n" +def _is_python_ml_project_assignment(requested_files: List[str], assignment_description: str) -> bool: + requested_lower = {path.lower() for path in requested_files} + description_lower = assignment_description.lower() + ml_signals = [ + "machine learning", + "ml project", + "group project", + "dataset", + "eda", + "exploratory data analysis", + "model development", + "predictive", + "prescriptive", + "classification", + "malware", + "intrusion", + "nsl-kdd", + "kagglehub", + ] + return ( + any(signal in description_lower for signal in ml_signals) + or "presentation.md" in requested_lower + or "slides.md" in requested_lower + ) + + +def _is_nsl_kdd_assignment(assignment_description: str) -> bool: + description_lower = assignment_description.lower() + return 
any( + phrase in description_lower + for phrase in [ + "nsl-kdd", + "nsl kdd", + "network intrusion detection", + "malware and network intrusion detection", + ] + ) + + +def _build_generic_report_template(assignment_name: str) -> str: + return ( + f"# {assignment_name} Report\n\n" + "## Executive Summary\n\n" + "Summarize the purpose of the work, the approach you followed, and the most important findings.\n\n" + "## Problem Statement\n\n" + "Explain the problem being addressed and why it matters.\n\n" + "## Methods\n\n" + "Document the workflow, tools, and analysis steps used to complete the assignment.\n\n" + "## Findings\n\n" + "Summarize the main results, tables, graphs, or outputs that support your conclusions.\n\n" + "## Recommendations\n\n" + "Describe the actions or follow-up steps suggested by the results.\n\n" + "## Appendix\n\n" + "Include referenced code snippets, tables, figures, or supporting material.\n" + ) + + +def _build_ml_project_report_template(assignment_name: str, dataset_name: str) -> str: + return ( + f"# {assignment_name} Report\n\n" + "## Real-World Problem\n\n" + "Explain the cybersecurity or business problem your team is solving and why big-data ML techniques are appropriate.\n\n" + f"## Dataset Selection: {dataset_name}\n\n" + "Explain why this dataset was selected, what it contains, and how it supports the project objectives.\n\n" + "## EDA Workflow\n\n" + "Describe the exploratory data analysis process, dataset size, feature types, missingness, duplicates, outliers, and the most important patterns discovered.\n\n" + "## Data Preparation\n\n" + "Summarize the preprocessing, feature engineering, encoding, scaling, and dataset split decisions used before model training.\n\n" + "## ML Methodology and Algorithms\n\n" + "Explain which ML algorithms were used, why they were selected, and how the code transforms the data step by step.\n\n" + "## Results and Metrics\n\n" + "Compare model performance using the selected metrics and 
visualizations. Include insights from the EDA and advanced analytics.\n\n" + "## Interpretation and Recommendations\n\n" + "Interpret what the results mean for stakeholders, identify the important variables, and recommend next actions.\n\n" + "## Weekly Code Walkthrough Notes\n\n" + "Capture concise talking points for the weekly in-class walkthroughs and note which code snippets to demonstrate.\n\n" + "## Appendix\n\n" + "Include referenced code snippets, tables, plots, and any generated model artifact summaries.\n" + ) + + +def _build_ml_project_runner_file(dataset_name: str) -> str: + return ( + f'"""Pipeline entrypoint for the {dataset_name} ML project scaffold."""\n\n' + "from pathlib import Path\n\n" + "from src.data_loader import load_nsl_kdd_frames\n" + "from src.eda import build_eda_summary, render_eda_artifacts\n" + "from src.train_models import train_and_evaluate_models, write_model_artifacts\n\n" + "def main() -> None:\n" + ' """Download data, run EDA, train baseline models, and persist artifacts."""\n' + " output_dir = Path('outputs')\n" + " output_dir.mkdir(exist_ok=True)\n" + " train_df, test_df, metadata = load_nsl_kdd_frames()\n" + " eda_summary = build_eda_summary(train_df, test_df, metadata)\n" + " render_eda_artifacts(train_df, eda_summary, output_dir=output_dir)\n" + " model_results = train_and_evaluate_models(train_df, test_df)\n" + " write_model_artifacts(model_results, output_dir=output_dir)\n" + " print('EDA and model artifacts written to outputs/.')\n\n" + 'if __name__ == "__main__":\n' + " main()\n" + ) + + +def _build_nsl_kdd_data_loader_file() -> str: + return '''"""Dataset-loading utilities for the NSL-KDD ML project scaffold.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Sequence + +import kagglehub +import pandas as pd + +DEFAULT_DATASET_HANDLE = os.getenv( + "KAGGLEHUB_DATASET", + "", +) + +NSL_KDD_COLUMNS = [ + "duration", + "protocol_type", + "service", + "flag", + "src_bytes", 
+ "dst_bytes", + "land", + "wrong_fragment", + "urgent", + "hot", + "num_failed_logins", + "logged_in", + "num_compromised", + "root_shell", + "su_attempted", + "num_root", + "num_file_creations", + "num_shells", + "num_access_files", + "num_outbound_cmds", + "is_host_login", + "is_guest_login", + "count", + "srv_count", + "serror_rate", + "srv_serror_rate", + "rerror_rate", + "srv_rerror_rate", + "same_srv_rate", + "diff_srv_rate", + "srv_diff_host_rate", + "dst_host_count", + "dst_host_srv_count", + "dst_host_same_srv_rate", + "dst_host_diff_srv_rate", + "dst_host_same_src_port_rate", + "dst_host_srv_diff_host_rate", + "dst_host_serror_rate", + "dst_host_srv_serror_rate", + "dst_host_rerror_rate", + "dst_host_srv_rerror_rate", + "label", + "difficulty", +] + + +def _find_matching_file(root: Path, candidates: Sequence[str]) -> Path: + lowered = {candidate.lower() for candidate in candidates} + for path in root.rglob("*"): + if path.is_file() and path.name.lower() in lowered: + return path + raise FileNotFoundError(f"Could not find any of {sorted(lowered)} under {root}") + + +def download_dataset(dataset_handle: str | None = None) -> Path: + handle = (dataset_handle or DEFAULT_DATASET_HANDLE).strip() + if not handle or handle.startswith("<"): + raise ValueError( + "Set KAGGLEHUB_DATASET to the KaggleHub handle for your NSL-KDD dataset before running this scaffold." 
+ ) + return Path(kagglehub.dataset_download(handle)) + + +def load_nsl_kdd_frames(dataset_handle: str | None = None) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, str]]: + dataset_root = download_dataset(dataset_handle) + train_path = _find_matching_file(dataset_root, ["KDDTrain+.txt", "KDDTrain+.csv"]) + test_path = _find_matching_file(dataset_root, ["KDDTest+.txt", "KDDTest+.csv"]) + + train_df = pd.read_csv(train_path, names=NSL_KDD_COLUMNS) + test_df = pd.read_csv(test_path, names=NSL_KDD_COLUMNS) + metadata = { + "dataset_root": str(dataset_root), + "train_path": str(train_path), + "test_path": str(test_path), + } + return train_df, test_df, metadata +''' + + +def _build_ml_eda_file(dataset_name: str) -> str: + return f'''"""EDA helpers for the {dataset_name} ML project scaffold.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns + + +def build_eda_summary(train_df: pd.DataFrame, test_df: pd.DataFrame, metadata: dict[str, str]) -> dict: + combined_df = pd.concat([train_df.assign(split="train"), test_df.assign(split="test")], ignore_index=True) + numeric_columns = combined_df.select_dtypes(include=["number"]).columns.tolist() + return {{ + "dataset_root": metadata.get("dataset_root", ""), + "train_rows": int(len(train_df)), + "test_rows": int(len(test_df)), + "combined_rows": int(len(combined_df)), + "feature_count": int(combined_df.shape[1] - 2), + "missing_values": int(combined_df.isna().sum().sum()), + "duplicate_rows": int(combined_df.duplicated().sum()), + "label_distribution": combined_df["label"].value_counts().to_dict(), + "numeric_columns": numeric_columns, + }} + + +def render_eda_artifacts(train_df: pd.DataFrame, summary: dict, output_dir: str | Path = "outputs") -> None: + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + (output_path / "eda_summary.json").write_text(json.dumps(summary, indent=2) + "\\n", 
encoding="utf-8") + + plt.figure(figsize=(10, 5)) + train_df["label"].value_counts().head(10).plot(kind="bar") + plt.title("Top NSL-KDD Class Labels") + plt.tight_layout() + plt.savefig(output_path / "label_distribution.png") + plt.close() + + numeric_columns = train_df.select_dtypes(include=["number"]).columns.tolist()[:6] + if numeric_columns: + plt.figure(figsize=(10, 6)) + sns.boxplot(data=train_df[numeric_columns]) + plt.xticks(rotation=30, ha="right") + plt.title("Sample Numeric Feature Distribution") + plt.tight_layout() + plt.savefig(output_path / "numeric_feature_boxplot.png") + plt.close() +''' + + +def _build_ml_training_file() -> str: + return '''"""Baseline model-training helpers for the NSL-KDD ML project scaffold.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler + +TARGET_COLUMN = "label" +CATEGORICAL_COLUMNS = ["protocol_type", "service", "flag"] + + +def _binary_target(series: pd.Series) -> pd.Series: + return (series.astype(str).str.lower() != "normal").astype(int) + + +def _split_features(frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]: + features = frame.drop(columns=[TARGET_COLUMN, "difficulty"], errors="ignore") + target = _binary_target(frame[TARGET_COLUMN]) + return features, target + + +def build_preprocessor(features: pd.DataFrame) -> ColumnTransformer: + categorical_columns = [column for column in CATEGORICAL_COLUMNS if column in features.columns] + numeric_columns = [column for column in features.columns if column not in categorical_columns] + + return ColumnTransformer( + transformers=[ + 
( + "categorical", + Pipeline( + steps=[ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("encoder", OneHotEncoder(handle_unknown="ignore")), + ] + ), + categorical_columns, + ), + ( + "numeric", + Pipeline( + steps=[ + ("imputer", SimpleImputer(strategy="median")), + ("scaler", StandardScaler()), + ] + ), + numeric_columns, + ), + ], + remainder="drop", + ) + + +def build_models(preprocessor: ColumnTransformer) -> dict[str, Pipeline]: + return { + "logistic_regression": Pipeline( + steps=[ + ("preprocessor", preprocessor), + ("model", LogisticRegression(max_iter=1000)), + ] + ), + "random_forest": Pipeline( + steps=[ + ("preprocessor", preprocessor), + ("model", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)), + ] + ), + } + + +def _evaluate_model(name: str, model: Pipeline, x_train: pd.DataFrame, y_train: pd.Series, x_test: pd.DataFrame, y_test: pd.Series) -> dict: + model.fit(x_train, y_train) + predictions = model.predict(x_test) + precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average="binary", zero_division=0) + return { + "model": name, + "accuracy": round(float(accuracy_score(y_test, predictions)), 4), + "precision": round(float(precision), 4), + "recall": round(float(recall), 4), + "f1": round(float(f1), 4), + "confusion_matrix": confusion_matrix(y_test, predictions).tolist(), + } + + +def train_and_evaluate_models(train_df: pd.DataFrame, test_df: pd.DataFrame) -> list[dict]: + x_train, y_train = _split_features(train_df) + x_test, y_test = _split_features(test_df) + preprocessor = build_preprocessor(x_train) + models = build_models(preprocessor) + return [ + _evaluate_model(name, model, x_train, y_train, x_test, y_test) + for name, model in models.items() + ] + + +def write_model_artifacts(results: list[dict], output_dir: str | Path = "outputs") -> None: + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + (output_path / 
"model_metrics.json").write_text(json.dumps(results, indent=2) + "\\n", encoding="utf-8") + + lines = [ + "# Model Metrics", + "", + "| Model | Accuracy | Precision | Recall | F1 |", + "| --- | ---: | ---: | ---: | ---: |", + ] + for result in results: + lines.append( + f"| {result['model']} | {result['accuracy']:.4f} | {result['precision']:.4f} | {result['recall']:.4f} | {result['f1']:.4f} |" + ) + + (output_path / "MODEL_METRICS.md").write_text("\\n".join(lines) + "\\n", encoding="utf-8") +''' + + +def _build_ml_project_tests_file() -> str: + return '''import pandas as pd + +from src.train_models import _binary_target, build_preprocessor + + +def test_binary_target_marks_attacks_as_one(): + result = _binary_target(pd.Series(["normal", "neptune", "smurf"])) + assert result.tolist() == [0, 1, 1] + + +def test_build_preprocessor_handles_known_nsl_kdd_columns(): + frame = pd.DataFrame( + { + "protocol_type": ["tcp", "udp"], + "service": ["http", "domain_u"], + "flag": ["SF", "S0"], + "src_bytes": [181, 239], + "dst_bytes": [5450, 486], + } + ) + preprocessor = build_preprocessor(frame) + assert preprocessor is not None +''' + + +def _build_ml_presentation_outline(assignment_name: str, dataset_name: str) -> str: + return ( + f"# {assignment_name} Presentation Outline\n\n" + "## Slide 1: Business Problem\n" + "- Explain the malware or intrusion-detection problem and why it matters.\n\n" + f"## Slide 2: Dataset Selection ({dataset_name})\n" + "- Describe the dataset, why it was chosen, and any limitations.\n\n" + "## Slide 3: EDA Highlights\n" + "- Show dataset size, data quality findings, and the most important graphs.\n\n" + "## Slide 4: Feature Engineering and Preprocessing\n" + "- Explain categorical encoding, scaling, and target construction.\n\n" + "## Slide 5: Models and Metrics\n" + "- Compare the baseline models and justify the evaluation metrics.\n\n" + "## Slide 6: Results and Recommendations\n" + "- Interpret the results and recommend next steps for the 
stakeholder.\n\n" + "## Slide 7: Weekly Code Walkthrough Snippets\n" + "- List the code snippets each group member should be ready to explain in class.\n" + ) + + +def _append_ml_readme_notes(existing_readme: str, dataset_name: str) -> str: + return existing_readme.rstrip() + ( + "\n\n## ML Project Scaffold\n" + f"- This scaffold is tailored for the {dataset_name} workflow with EDA, baseline model training, report writing, and presentation prep.\n" + "- Set `KAGGLEHUB_DATASET` to the KaggleHub handle for your approved dataset before running the project.\n" + "- Run `python main.py` to execute the end-to-end starter workflow and populate `outputs/`.\n" + "- Review `PRESENTATION.md` for the slide-deck outline and `Report.md` for the written summary structure.\n" + "- Use `pytest tests/test_ml_project.py` to validate the generated ML helper modules.\n" + ) + + def _is_python_maze_assignment(requested_files: List[str], assignment_description: str) -> bool: requested_lower = {path.lower() for path in requested_files} description_lower = assignment_description.lower() @@ -1368,6 +1804,12 @@ def build_assignment_specific_files( requested_files, assignment_description, ) + is_ml_project_assignment = ( + language_lower in {"python", "py"} + and not is_maze_assignment + and _is_python_ml_project_assignment(requested_files, assignment_description) + ) + dataset_name = "NSL-KDD" if _is_nsl_kdd_assignment(assignment_description) else "selected dataset" if is_maze_assignment: maze_functions = [ @@ -1395,8 +1837,21 @@ def build_assignment_specific_files( "X E\n" ) - if is_maze_assignment or "report.md" in requested_lower or "report" in assignment_description.lower(): + if is_maze_assignment: files["Report.md"] = _build_maze_report_template(assignment_name) + elif is_ml_project_assignment: + files["Report.md"] = _build_ml_project_report_template(assignment_name, dataset_name) + elif "report.md" in requested_lower or "report" in assignment_description.lower(): + files["Report.md"] 
= _build_generic_report_template(assignment_name) + + if is_ml_project_assignment: + files["main.py"] = _build_ml_project_runner_file(dataset_name) + files["src/__init__.py"] = '"""Generated ML project helpers."""\n' + files["src/data_loader.py"] = _build_nsl_kdd_data_loader_file() + files["src/eda.py"] = _build_ml_eda_file(dataset_name) + files["src/train_models.py"] = _build_ml_training_file() + files["tests/test_ml_project.py"] = _build_ml_project_tests_file() + files["PRESENTATION.md"] = _build_ml_presentation_outline(assignment_name, dataset_name) if language_lower in {"python", "py"} and assignment_mentions_jupyter_notebook(assignment_description): notebook_imports = inferred_python_imports @@ -1521,6 +1976,20 @@ def generate_starter_files( if maze_artifacts: files["artifacts/README.md"] = _build_artifact_readme(maze_artifacts) + if language.lower() in {"python", "py"} and "src/train_models.py" in files: + dataset_name = "NSL-KDD" if _is_nsl_kdd_assignment(assignment_description) else "selected dataset" + files["README.md"] = _append_ml_readme_notes(files["README.md"], dataset_name) + files["requirements.txt"] = _extend_python_requirements( + files.get("requirements.txt", PYTHON_TEMPLATES["requirements.txt"]), + [ + "kagglehub>=0.3.0", + "pandas>=2.2.0", + "matplotlib>=3.8.0", + "seaborn>=0.13.0", + "scikit-learn>=1.4.0", + ], + ) + if language.lower() in {"python", "py"}: inferred_imports = infer_python_assignment_imports(assignment_description) inferred_requirements = infer_python_assignment_requirements(assignment_description) diff --git a/tests/test_agent.py b/tests/test_agent.py index ca3e9fc..e65c820 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -284,6 +284,54 @@ def test_python_project_adds_imports_and_requirements_from_assignment(self): assert "pandas>=2.2.0" in files["requirements.txt"] assert "matplotlib>=3.8.0" in files["requirements.txt"] + def test_non_maze_report_template_uses_generic_sections(self): + """Generic report 
def test_ml_project_scaffold_files(self):
    """ML project briefs should generate a project-oriented NSL-KDD scaffold."""
    brief = (
        "ML Group Project. Use the NSL-KDD dataset imported from kagglehub. "
        "Complete an EDA, build machine learning models, write a report, and prepare a presentation."
    )
    files = generate_starter_files(
        assignment_name="Malware and Network Intrusion Detection and Analysis",
        assignment_description=brief,
        due_date="2026-03-20",
        language="python",
    )

    # Every scaffold file must be present.
    expected_paths = [
        "Report.md",
        "PRESENTATION.md",
        "src/__init__.py",
        "src/data_loader.py",
        "src/eda.py",
        "src/train_models.py",
        "tests/test_ml_project.py",
    ]
    for expected_path in expected_paths:
        assert expected_path in files

    # Each file must contain the fragments that mark it as ML-specific.
    expected_fragments = {
        "Report.md": ["NSL-KDD", "## EDA Workflow"],
        "src/data_loader.py": ["kagglehub", "NSL_KDD_COLUMNS"],
        "src/train_models.py": ["RandomForestClassifier", "LogisticRegression"],
        "README.md": ["python main.py", "PRESENTATION.md"],
        "requirements.txt": ["kagglehub>=0.3.0", "scikit-learn>=1.4.0"],
    }
    for file_path, fragments in expected_fragments.items():
        for fragment in fragments:
            assert fragment in files[file_path]

    # Generated Python sources must at least be syntactically valid.
    python_sources = [
        "main.py",
        "src/data_loader.py",
        "src/eda.py",
        "src/train_models.py",
        "tests/test_ml_project.py",
    ]
    for source_path in python_sources:
        compile(files[source_path], source_path, "exec")
test_extract_required_function_names(self): """Extract required function names from assignment examples.""" text = (