Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions boost_usage_tracker/admin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from django.contrib import admin
from django.contrib.admin import ModelAdmin

from boost_usage_tracker import services as boost_usage_services

from .models import BoostExternalRepository, BoostMissingHeaderTmp, BoostUsage


Expand Down Expand Up @@ -41,7 +43,46 @@ class BoostUsageAdmin(ModelAdmin):

@admin.register(BoostMissingHeaderTmp)
class BoostMissingHeaderTmpAdmin(ModelAdmin):
    """Admin for ``BoostMissingHeaderTmp`` rows.

    Shows each row next to the repository and file path of its placeholder
    usage, and offers a bulk action that resolves the selected rows through
    ``boost_usage_services.resolve_missing_header_tmp_auto``.
    """

    # Each attribute is assigned exactly once; a second assignment would
    # silently shadow the first.
    list_display = (
        "id",
        "header_name",
        "usage_repo",
        "usage_file_path",
        "usage",
        "created_at",
    )
    list_filter = ("created_at",)
    search_fields = (
        "header_name",
        "usage__repo__repo_name",
        "usage__file_path__filename",
    )
    raw_id_fields = ("usage",)
    # Join the relations read by the computed columns below so the change
    # list does not issue per-row queries for them.
    # NOTE(review): owner_account is presumably needed by ``full_name`` — confirm.
    list_select_related = ("usage__repo__owner_account", "usage__file_path")
    actions = ("resolve_selected_if_in_catalog",)

    @admin.display(description="External repo", ordering="usage__repo__repo_name")
    def usage_repo(self, obj):
        """Return the ``full_name`` of the usage's repo, or an em dash if unset."""
        r = obj.usage.repo
        return r.full_name if r else "—"

    @admin.display(description="File path", ordering="usage__file_path__filename")
    def usage_file_path(self, obj):
        """Return the ``filename`` of the usage's file path, or an em dash if unset."""
        fp = obj.usage.file_path
        return fp.filename if fp else "—"

    @admin.action(
        description="Resolve selected (if header exists in BoostFile catalog)"
    )
    def resolve_selected_if_in_catalog(self, request, queryset):
        """Try to resolve every selected row and report a per-outcome summary.

        Each row is passed to ``resolve_missing_header_tmp_auto``; the outcome
        strings it returns are tallied and shown to the user in one message.
        """
        counts: dict[str, int] = {}
        for tmp in queryset.select_related("usage__repo", "usage__file_path"):
            outcome = boost_usage_services.resolve_missing_header_tmp_auto(tmp)
            counts[outcome] = counts.get(outcome, 0) + 1
        parts = [
            f"resolved: {counts.get('resolved', 0)}",
            f"skipped (no catalog match): {counts.get('skipped_no_match', 0)}",
            f"skipped (ambiguous): {counts.get('skipped_ambiguous', 0)}",
            f"errors: {counts.get('error', 0)}",
        ]
        self.message_user(request, "; ".join(parts))
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from django.utils.dateparse import parse_datetime

from boost_usage_tracker.models import BoostExternalRepository
from boost_usage_tracker.services import resolve_all_missing_header_tmp_batch
from github_activity_tracker.models import GitHubRepository
from boost_usage_tracker.boost_searcher import (
BOOST_INCLUDE_SEARCH_BATCH_SIZE,
Expand Down Expand Up @@ -390,6 +391,12 @@ def _parse_ymd_or_none(value, opt_name):

try:
if not task_filter or task_filter == "monitor_content":
prepass_stats = resolve_all_missing_header_tmp_batch(dry_run=dry_run)
logger.info(
"missing_header_tmp prepass (dry_run=%s): %s",
dry_run,
prepass_stats,
)
task_monitor_content(since, until, min_stars, dry_run)

if not task_filter or task_filter == "monitor_stars":
Expand Down
48 changes: 12 additions & 36 deletions boost_usage_tracker/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@
from datetime import datetime
from typing import TYPE_CHECKING

from boost_library_tracker.models import BoostFile
from boost_usage_tracker.boost_searcher import (
detect_boost_version_in_repo,
extract_boost_includes,
)
from boost_usage_tracker.repo_searcher import RepoSearchResult
from boost_usage_tracker.services import (
boost_catalog_filename,
bulk_create_or_update_boost_usage,
find_boost_file_for_header_name_detailed,
find_boost_files_exact_by_catalog_names,
get_active_usages_for_repo,
get_or_create_boost_external_repo,
get_or_create_missing_header_usage,
Expand All @@ -36,59 +38,33 @@
logger = logging.getLogger(__name__)


def _resolve_boost_header(header_path: str):
    """Return the first :class:`BoostFile` whose filename ends with a tail of *header_path*.

    Tails are tried from longest (the full path) to shortest (the last
    segment only); the first hit wins. Returns ``None`` when no tail matches.
    """
    segments = header_path.split("/")
    while segments:
        tail = "/".join(segments)
        candidates = BoostFile.objects.filter(  # pylint: disable=no-member
            github_file__filename__endswith=tail
        ).select_related("github_file")
        match = candidates.first()
        if match:
            return match
        # Drop the leading segment and retry with a shorter tail.
        segments = segments[1:]
    return None


def _resolve_boost_headers_bulk(header_paths: set[str]) -> dict[str, object]:
"""Resolve a set of Boost include paths to BoostFile instances in one pass.

Returns a dict ``{header_path: BoostFile | None}``. Deduplicates the
incoming paths and performs one bulk exact-match query first; unresolved
paths are then handled by suffix fallback.
incoming paths and performs one bulk exact-match query on
``include/<header_path>`` first; unresolved paths are then handled by
suffix fallback.
"""
if not header_paths:
return {}

# Fast path: one bulk query for exact filename matches.
exact_rows = (
BoostFile.objects.filter(
github_file__filename__in=header_paths
) # pylint: disable=no-member
.select_related("github_file")
.order_by("github_file_id")
)
by_filename: dict[str, object] = {}
for row in exact_rows:
filename = row.github_file.filename
if filename not in by_filename:
by_filename[filename] = row
catalog_names = {boost_catalog_filename(p) for p in header_paths}
exact_map = find_boost_files_exact_by_catalog_names(catalog_names)

resolved: dict[str, object] = {}
unresolved: list[str] = []
for path in header_paths:
boost_file = by_filename.get(path)
cn = boost_catalog_filename(path)
boost_file = exact_map.get(cn)
if boost_file is not None:
resolved[path] = boost_file
else:
unresolved.append(path)

# Fallback for non-exact cases (still deduplicated by unique header path).
for path in unresolved:
resolved[path] = _resolve_boost_header(path)
bf, _ = find_boost_file_for_header_name_detailed(path)
resolved[path] = bf

return resolved

Expand Down
168 changes: 164 additions & 4 deletions boost_usage_tracker/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@

import logging
from datetime import date, datetime
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, Literal, Optional

from boost_library_tracker.models import BoostFile

from .models import BoostExternalRepository, BoostMissingHeaderTmp, BoostUsage

if TYPE_CHECKING:
from boost_library_tracker.models import BoostFile
from github_activity_tracker.models import GitHubFile, GitHubRepository

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -124,7 +125,7 @@ def update_boost_external_repo(

def create_or_update_boost_usage(
repo: BoostExternalRepository,
boost_header: "BoostFile",
boost_header: BoostFile,
file_path: "GitHubFile",
last_commit_date: Optional[datetime] = None,
) -> tuple[BoostUsage, bool]:
Expand Down Expand Up @@ -171,6 +172,165 @@ def get_active_usages_for_repo(
)


def boost_catalog_filename(header_path: str) -> str:
    """Return *header_path* rooted at ``include/``.

    Paths that already start with ``include/`` are returned unchanged; any
    other path gets the prefix prepended, e.g. ``boost/asio.hpp`` becomes
    ``include/boost/asio.hpp``.
    """
    prefix = "include/"
    already_rooted = header_path.startswith(prefix)
    return header_path if already_rooted else prefix + header_path


def _disambiguate_boost_file_candidates(
candidates: list[BoostFile],
) -> Optional[BoostFile]:
"""Pick one :class:`~boost_library_tracker.models.BoostFile` when several match.

Rules:
- Exactly one non-deleted ``GitHubFile`` → return that ``BoostFile``.
- More than one non-deleted → ambiguous, return ``None``.
- None non-deleted: exactly one candidate total (even if deleted) → return it;
otherwise ambiguous or empty → ``None``.
"""
if not candidates:
return None
active = [c for c in candidates if not c.github_file.is_deleted]
all_n = len(candidates)
if len(active) == 1:
return active[0]
if len(active) > 1:
return None
if all_n == 1:
return candidates[0]
return None


def find_boost_files_exact_by_catalog_names(
    catalog_names: set[str],
) -> dict[str, Optional[BoostFile]]:
    """Look up many catalog filenames at once.

    Returns one entry per name in *catalog_names*: the ``BoostFile`` chosen by
    ``_disambiguate_boost_file_candidates`` among rows whose
    ``github_file.filename`` equals that name, or ``None``. An empty input
    yields an empty dict without querying.
    """
    if not catalog_names:
        return {}

    matching = BoostFile.objects.filter(
        github_file__filename__in=catalog_names
    ).select_related("github_file")

    # Group the fetched rows by the exact filename they were stored under.
    grouped: dict[str, list[BoostFile]] = {}
    for boost_file in matching:
        grouped.setdefault(boost_file.github_file.filename, []).append(boost_file)

    result: dict[str, Optional[BoostFile]] = {}
    for name in catalog_names:
        result[name] = _disambiguate_boost_file_candidates(grouped.get(name, []))
    return result


def find_boost_file_for_header_name_detailed(
    header_path: str,
) -> tuple[Optional[BoostFile], Literal["found", "not_found", "ambiguous"]]:
    """Look up *header_path* in the catalog and report how the lookup went.

    Returns ``(boost_file, "found")`` when disambiguation selects one row,
    ``(None, "ambiguous")`` when rows matched but none could be selected, and
    ``(None, "not_found")`` when no row has the normalized filename.
    """
    catalog_name = boost_catalog_filename(header_path)
    matches = list(
        BoostFile.objects.filter(github_file__filename=catalog_name).select_related(
            "github_file"
        )
    )
    chosen = _disambiguate_boost_file_candidates(matches)
    if chosen is not None:
        return chosen, "found"

    # Deliberately exact-match only. Substring or ``endswith`` matching on the
    # catalog name would conflate distinct files: a longer path such as
    # ``libs/asio/include/boost/asio.hpp`` is not the same header as
    # ``include/boost/asio.hpp``.
    return None, ("ambiguous" if matches else "not_found")


def find_boost_file_for_header_name(header_path: str) -> Optional[BoostFile]:
    """Return the ``BoostFile`` for *header_path*, or ``None`` if none is selected.

    Convenience wrapper around ``find_boost_file_for_header_name_detailed``
    that discards the status string.
    """
    return find_boost_file_for_header_name_detailed(header_path)[0]


def delete_boost_missing_header_tmp(tmp: BoostMissingHeaderTmp) -> None:
    """Remove *tmp* by calling its ``delete()``.

    Service-layer entry point for deleting a temporary missing-header row;
    whatever ``delete()`` returns is discarded.
    """
    tmp.delete()


def maybe_delete_placeholder_boost_usage_after_tmp_removed(usage_pk: int) -> bool:
    """Delete the usage *usage_pk* if it is an orphaned null-header placeholder.

    The row is deleted only when it still exists, its ``boost_header_id`` is
    ``None``, and it has no related ``missing_header_tmp`` rows left.

    Returns ``True`` when a row was deleted, ``False`` otherwise.
    """
    placeholder = BoostUsage.objects.filter(pk=usage_pk).first()
    # Already gone, or it points at a real header: nothing to clean up.
    if placeholder is None or placeholder.boost_header_id is not None:
        return False
    # Other tmp rows still depend on this placeholder.
    if placeholder.missing_header_tmp.exists():
        return False
    placeholder.delete()
    return True


def resolve_missing_header_tmp_auto(tmp: BoostMissingHeaderTmp) -> str:
    """Resolve one tmp row when the header exists unambiguously in the catalog.

    On a match, creates/updates the real ``BoostUsage`` from the placeholder
    usage's repo, file path and last commit date, deletes *tmp*, and drops the
    placeholder usage when it has no remaining tmp rows.

    Returns one of: ``resolved``, ``skipped_no_match``, ``skipped_ambiguous``,
    ``error`` (logged on exception during the write steps). The catalog lookup
    itself runs outside the ``try`` block, so exceptions from it propagate.
    """
    boost_file, status = find_boost_file_for_header_name_detailed(tmp.header_name)
    if status == "ambiguous":
        return "skipped_ambiguous"
    if boost_file is None:
        return "skipped_no_match"

    # Capture both keys before any delete: Django clears an instance's primary
    # key once it has been deleted, so reading ``tmp.pk`` in the error handler
    # would log ``None`` if a later step (placeholder cleanup) fails.
    tmp_pk = tmp.pk
    usage_pk = tmp.usage_id
    try:
        usage = tmp.usage
        create_or_update_boost_usage(
            usage.repo,
            boost_file,
            usage.file_path,
            last_commit_date=usage.last_commit_date,
        )
        delete_boost_missing_header_tmp(tmp)
        maybe_delete_placeholder_boost_usage_after_tmp_removed(usage_pk)
    except Exception:  # pylint: disable=broad-exception-caught
        logger.exception("resolve_missing_header_tmp_auto failed for tmp_id=%s", tmp_pk)
        return "error"
    return "resolved"


def resolve_all_missing_header_tmp_batch(*, dry_run: bool = False) -> dict[str, int]:
    """Walk every ``BoostMissingHeaderTmp`` row and tally the outcome per row.

    Rows are streamed with ``iterator(chunk_size=500)``.

    With *dry_run* set, only the catalog lookup runs and the tally uses the
    keys ``would_resolve`` / ``skipped_ambiguous`` / ``skipped_no_match``.
    Otherwise each row goes through ``resolve_missing_header_tmp_auto`` and
    its return value is used as the tally key.
    """
    from collections import Counter

    # Dry-run translation from lookup status to tally key; any status not
    # listed here is counted as "skipped_no_match".
    dry_run_keys = {"found": "would_resolve", "ambiguous": "skipped_ambiguous"}

    tally: Counter[str] = Counter()
    rows = BoostMissingHeaderTmp.objects.all().select_related(
        "usage__repo", "usage__file_path"
    )
    for tmp in rows.iterator(chunk_size=500):
        if not dry_run:
            tally[resolve_missing_header_tmp_auto(tmp)] += 1
            continue
        _, status = find_boost_file_for_header_name_detailed(tmp.header_name)
        tally[dry_run_keys.get(status, "skipped_no_match")] += 1
    return dict(tally)


def get_or_create_missing_header_usage(
repo: BoostExternalRepository,
file_path: "GitHubFile",
Expand Down Expand Up @@ -208,7 +368,7 @@ def get_or_create_missing_header_usage(

def bulk_create_or_update_boost_usage(
repo: BoostExternalRepository,
items: list[tuple["BoostFile", "GitHubFile", Optional[datetime]]],
items: list[tuple[BoostFile, "GitHubFile", Optional[datetime]]],
) -> tuple[int, int]:
"""Create or update many BoostUsage rows in bulk.

Expand Down
Loading
Loading