From a40df8b049ee017fee89c91a4e5bf558e0b87f40 Mon Sep 17 00:00:00 2001
From: Valentijn Scholten
Date: Sun, 3 May 2026 20:55:55 +0200
Subject: [PATCH] perf(dupe-delete): use bulk_delete_findings + correlated
 subquery in async_dupe_delete

Replace per-row Finding.delete() loop with bulk_delete_findings (raw SQL
cascade) and move excess-duplicate selection fully into the DB via a
correlated subquery that counts newer siblings per original.
select_related + only() eliminate the N+1 product lookup.
---
 dojo/tasks.py | 132 +++++++++++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 56 deletions(-)

diff --git a/dojo/tasks.py b/dojo/tasks.py
index e857ba7967a..c49c69ab57c 100644
--- a/dojo/tasks.py
+++ b/dojo/tasks.py
@@ -7,12 +7,13 @@
 from django.conf import settings
 from django.core.exceptions import SuspiciousOperation
 from django.core.management import call_command
-from django.db.models import Count, Prefetch
+from django.db.models import Count, IntegerField, OuterRef, Q, Subquery
+from django.db.models.functions import Coalesce
 
 from dojo.auditlog import run_flush_auditlog
 from dojo.celery import app
 from dojo.celery_dispatch import dojo_dispatch_task
-from dojo.finding.helper import fix_loop_duplicates
+from dojo.finding.helper import bulk_delete_findings, fix_loop_duplicates
 from dojo.management.commands.jira_status_reconciliation import jira_status_reconciliation
 from dojo.models import Finding, System_Settings
 from dojo.utils import calculate_grade
@@ -54,60 +55,79 @@ def _async_dupe_delete_impl():
         logger.info("skipping deletion of excess duplicates: max_dupes not configured")
         return
 
-    if enabled:
-        logger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
-        deduplicationLogger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
-
-        # limit to settings.DUPE_DELETE_MAX_PER_RUN to prevent overlapping jobs
-        results = Finding.objects \
-            .filter(duplicate=True) \
-            .order_by() \
-            .values("duplicate_finding") \
-            .annotate(num_dupes=Count("id")) \
-            .filter(num_dupes__gt=dupe_max)[:total_duplicate_delete_count_max_per_run]
-
-        originals_with_too_many_duplicates_ids = [result["duplicate_finding"] for result in results]
-
-        originals_with_too_many_duplicates = Finding.objects.filter(id__in=originals_with_too_many_duplicates_ids).order_by("id")
-
-        # prefetch to make it faster
-        # Oldest-first: delete from the front of the list until dupe_count <= 0, keeping the last max_dupes.
-        # order_by("date") alone leaves ties undefined when many duplicates share the same date (e.g. tool date);
-        # add id so we always drop lower-id (older) rows first and retain higher-id (newer) imports.
-        originals_with_too_many_duplicates = originals_with_too_many_duplicates.prefetch_related(Prefetch("original_finding",
-                                                                                    queryset=Finding.objects.filter(duplicate=True).order_by("date", "id")))
-
-        total_deleted_count = 0
-        affected_products = set()
-        for original in originals_with_too_many_duplicates:
-            duplicate_list = original.original_finding.all()
-            dupe_count = len(duplicate_list) - dupe_max
-
-            for finding in duplicate_list:
-                deduplicationLogger.debug(f"deleting finding {finding.id}:{finding.title} ({finding.hash_code}))")
-                # Collect the product for batch grading later
-                affected_products.add(finding.test.engagement.product)
-                # Skip individual product grading during deletion
-                finding.delete(product_grading_option=False)
-                total_deleted_count += 1
-                dupe_count -= 1
-                if dupe_count <= 0:
-                    break
-                if total_deleted_count >= total_duplicate_delete_count_max_per_run:
-                    break
-
-            if total_deleted_count >= total_duplicate_delete_count_max_per_run:
-                break
-
-        logger.info("total number of excess duplicates deleted: %s", total_deleted_count)
-
-        # Batch product grading for all affected products
-        if affected_products:
-            system_settings = System_Settings.objects.get()
-            if system_settings.enable_product_grade:
-                logger.info("performing batch product grading for %s products", len(affected_products))
-                for product in affected_products:
-                    dojo_dispatch_task(calculate_grade, product.id)
+    if not enabled:
+        return
+
+    logger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
+    deduplicationLogger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
+
+    # Originals that currently have more duplicates than dupe_max allows.
+    originals_with_excess = (
+        Finding.objects
+        .filter(duplicate=True)
+        .order_by()
+        .values("duplicate_finding")
+        .annotate(num_dupes=Count("id"))
+        .filter(num_dupes__gt=dupe_max)
+        .values("duplicate_finding")
+    )
+
+    # For each candidate duplicate, count siblings of the same original that are strictly newer
+    # (later date, or same date but higher id), matching the keep-newest ordering.
+    # A finding is excess when >= dupe_max newer siblings exist: it falls outside the kept window.
+    # Coalesce(..., 0): with no newer siblings the grouped subquery returns no rows, i.e. NULL.
+    newer_siblings_subq = Subquery(
+        Finding.objects
+        .filter(
+            duplicate=True,
+            duplicate_finding_id=OuterRef("duplicate_finding_id"),
+        )
+        .filter(
+            Q(date__gt=OuterRef("date")) | Q(date=OuterRef("date"), id__gt=OuterRef("id")),
+        )
+        .order_by()
+        .values("duplicate_finding_id")
+        .annotate(cnt=Count("id"))
+        .values("cnt"),
+        output_field=IntegerField(),
+    )
+
+    # Single query: the excess (oldest) duplicates; the dupe_max newest per original survive.
+    # order_by("id") keeps the capped slice deterministic; select_related avoids N+1 product lookups below.
+    # only() trims the Finding columns fetched; test_id must stay listed so select_related can join.
+    excess_dupes = (
+        Finding.objects
+        .filter(duplicate=True, duplicate_finding__in=originals_with_excess)
+        .annotate(newer_cnt=Coalesce(newer_siblings_subq, 0))
+        .filter(newer_cnt__gte=dupe_max)
+        .select_related("test__engagement__product")
+        .only("id", "test_id")
+        .order_by("id")
+        [:total_duplicate_delete_count_max_per_run]
+    )
+
+    ids_to_delete = []
+    affected_products = set()
+    for finding in excess_dupes:
+        ids_to_delete.append(finding.id)
+        affected_products.add(finding.test.engagement.product)
+
+    logger.info("total number of excess duplicates to delete: %s", len(ids_to_delete))
+
+    if ids_to_delete:
+        # order_desc=True deletes higher ids before lower ids, consistent with how
+        # finding_delete handles duplicate clusters (duplicate_cluster.order_by("-id").delete()).
+        bulk_delete_findings(Finding.objects.filter(id__in=ids_to_delete), order_desc=True)
+
+        logger.info("total number of excess duplicates deleted: %s", len(ids_to_delete))
+
+    # Batch product grading for all affected products
+    if affected_products:
+        system_settings = System_Settings.objects.get()
+        if system_settings.enable_product_grade:
+            logger.info("performing batch product grading for %s products", len(affected_products))
+            for product in affected_products:
+                dojo_dispatch_task(calculate_grade, product.id)
 
 
 @app.task(ignore_result=False, base=Task)
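
Reviewer note (not part of the patch): the subquery's ">= dupe_max newer
siblings" rule is equivalent to keeping the dupe_max newest rows per
original. A minimal pure-Python sketch of that equivalence; excess_ids,
the integer dates, and the sample data are illustrative only:

    def excess_ids(dupes, dupe_max):
        """dupes: list of (date, id) tuples for one original's duplicates.
        Returns the ids the correlated-subquery rule would mark as excess."""
        excess = []
        for date, pk in dupes:
            # Count siblings that are strictly newer: later date, or same
            # date with a higher id (the patch's tie-breaker).
            newer = sum(
                1 for d, p in dupes
                if d > date or (d == date and p > pk)
            )
            if newer >= dupe_max:
                excess.append(pk)
        return excess

    # Five duplicates, keep the 2 newest (ids 13 and 14): 10, 11, 12 are excess.
    dupes = [(1, 10), (1, 11), (2, 12), (2, 13), (3, 14)]
    assert sorted(excess_ids(dupes, dupe_max=2)) == [10, 11, 12]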
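
The correlated COUNT is a reusable ORM pattern, so here is a hedged sketch
of its general shape factored into a helper. The helper name and the
generic field names (group_field, date, id) are assumptions for
illustration, not DefectDojo API:

    from django.db.models import Count, IntegerField, OuterRef, Q, Subquery
    from django.db.models.functions import Coalesce

    def with_newer_sibling_count(qs, group_field):
        """Annotate each row with the number of same-group rows that are
        strictly newer (later date, or same date and a higher id)."""
        newer = Subquery(
            qs.model.objects
            .filter(**{group_field: OuterRef(group_field)})
            .filter(Q(date__gt=OuterRef("date"))
                    | Q(date=OuterRef("date"), id__gt=OuterRef("id")))
            .order_by()            # clear default ordering so it cannot leak into GROUP BY
            .values(group_field)   # GROUP BY the correlated key -> one row per outer row
            .annotate(cnt=Count("id"))
            .values("cnt"),        # project only the count
            output_field=IntegerField(),
        )
        # A scalar subquery that matches no rows yields SQL NULL, hence Coalesce.
        return qs.annotate(newer_cnt=Coalesce(newer, 0))

    # Usage against the patch's models, equivalent to the inline version:
    # with_newer_sibling_count(Finding.objects.filter(duplicate=True),
    #                          "duplicate_finding_id").filter(newer_cnt__gte=dupe_max)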
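
If a regression test is wanted for the N+1 claim, Django can count the
queries directly. A sketch assuming a configured DefectDojo test database
with some duplicate findings present; the 50-row slice is arbitrary:

    from django.db import connection
    from django.test.utils import CaptureQueriesContext

    from dojo.models import Finding

    with CaptureQueriesContext(connection) as ctx:
        findings = (
            Finding.objects
            .filter(duplicate=True)
            .select_related("test__engagement__product")
            .only("id", "test_id")[:50]
        )
        # Every test -> engagement -> product hop is served from the join cache.
        products = {f.test.engagement.product for f in findings}

    assert len(ctx.captured_queries) == 1  # one joined SELECT, no N+1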