From a40df8b049ee017fee89c91a4e5bf558e0b87f40 Mon Sep 17 00:00:00 2001
From: Valentijn Scholten
Date: Sun, 3 May 2026 20:55:55 +0200
Subject: [PATCH] perf(dupe-delete): use bulk_delete_findings + correlated
 subquery in async_dupe_delete

Replace per-row Finding.delete() loop with bulk_delete_findings (raw SQL
cascade) and move excess-duplicate selection fully into the DB via a
correlated subquery that counts newer siblings per original.
select_related + only() eliminate the N+1 product lookup.
---
 dojo/tasks.py | 132 +++++++++++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 56 deletions(-)

diff --git a/dojo/tasks.py b/dojo/tasks.py
index e857ba7967a..c49c69ab57c 100644
--- a/dojo/tasks.py
+++ b/dojo/tasks.py
@@ -7,12 +7,13 @@
 from django.conf import settings
 from django.core.exceptions import SuspiciousOperation
 from django.core.management import call_command
-from django.db.models import Count, Prefetch
+from django.db.models import Count, IntegerField, OuterRef, Q, Subquery
+from django.db.models.functions import Coalesce
 
 from dojo.auditlog import run_flush_auditlog
 from dojo.celery import app
 from dojo.celery_dispatch import dojo_dispatch_task
-from dojo.finding.helper import fix_loop_duplicates
+from dojo.finding.helper import bulk_delete_findings, fix_loop_duplicates
 from dojo.management.commands.jira_status_reconciliation import jira_status_reconciliation
 from dojo.models import Finding, System_Settings
 from dojo.utils import calculate_grade
@@ -54,60 +55,79 @@ def _async_dupe_delete_impl():
         logger.info("skipping deletion of excess duplicates: max_dupes not configured")
         return
 
-    if enabled:
-        logger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
-        deduplicationLogger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
-
-        # limit to settings.DUPE_DELETE_MAX_PER_RUN to prevent overlapping jobs
-        results = Finding.objects \
-            .filter(duplicate=True) \
-            .order_by() \
-            .values("duplicate_finding") \
-            .annotate(num_dupes=Count("id")) \
-            .filter(num_dupes__gt=dupe_max)[:total_duplicate_delete_count_max_per_run]
-
-        originals_with_too_many_duplicates_ids = [result["duplicate_finding"] for result in results]
-
-        originals_with_too_many_duplicates = Finding.objects.filter(id__in=originals_with_too_many_duplicates_ids).order_by("id")
-
-        # prefetch to make it faster
-        # Oldest-first: delete from the front of the list until dupe_count <= 0, keeping the last max_dupes.
-        # order_by("date") alone leaves ties undefined when many duplicates share the same date (e.g. tool date);
-        # add id so we always drop lower-id (older) rows first and retain higher-id (newer) imports.
-        originals_with_too_many_duplicates = originals_with_too_many_duplicates.prefetch_related(Prefetch("original_finding",
-                                                                                    queryset=Finding.objects.filter(duplicate=True).order_by("date", "id")))
-
-        total_deleted_count = 0
-        affected_products = set()
-        for original in originals_with_too_many_duplicates:
-            duplicate_list = original.original_finding.all()
-            dupe_count = len(duplicate_list) - dupe_max
-
-            for finding in duplicate_list:
-                deduplicationLogger.debug(f"deleting finding {finding.id}:{finding.title} ({finding.hash_code}))")
-                # Collect the product for batch grading later
-                affected_products.add(finding.test.engagement.product)
-                # Skip individual product grading during deletion
-                finding.delete(product_grading_option=False)
-                total_deleted_count += 1
-                dupe_count -= 1
-                if dupe_count <= 0:
-                    break
-                if total_deleted_count >= total_duplicate_delete_count_max_per_run:
-                    break
-
-            if total_deleted_count >= total_duplicate_delete_count_max_per_run:
-                break
-
-        logger.info("total number of excess duplicates deleted: %s", total_deleted_count)
-
-        # Batch product grading for all affected products
-        if affected_products:
-            system_settings = System_Settings.objects.get()
-            if system_settings.enable_product_grade:
-                logger.info("performing batch product grading for %s products", len(affected_products))
-                for product in affected_products:
-                    dojo_dispatch_task(calculate_grade, product.id)
+    if not enabled:
+        return
+
+    logger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
+    deduplicationLogger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
+
+    # Originals that currently have more duplicates than dupe_max allows.
+    originals_with_excess = (
+        Finding.objects
+        .filter(duplicate=True)
+        .order_by()
+        .values("duplicate_finding")
+        .annotate(num_dupes=Count("id"))
+        .filter(num_dupes__gt=dupe_max)
+        .values("duplicate_finding")
+    )
+
+    # For each candidate duplicate, count siblings of the same original that are strictly newer
+    # (later date, or same date but higher id), matching the keep-newest ordering.
+    # A finding is excess when >= dupe_max newer siblings exist: it falls outside the kept window.
+    # Coalesce(..., 0): with no newer siblings the grouped subquery returns no rows, i.e. NULL.
+    newer_siblings_subq = Subquery(
+        Finding.objects
+        .filter(
+            duplicate=True,
+            duplicate_finding_id=OuterRef("duplicate_finding_id"),
+        )
+        .filter(
+            Q(date__gt=OuterRef("date")) | Q(date=OuterRef("date"), id__gt=OuterRef("id")),
+        )
+        .order_by()
+        .values("duplicate_finding_id")
+        .annotate(cnt=Count("id"))
+        .values("cnt"),
+        output_field=IntegerField(),
+    )
+
+    # Single query: the excess (oldest) duplicates; the dupe_max newest per original survive.
+    # order_by("id") keeps the capped slice deterministic; select_related avoids N+1 product lookups below.
+    # only() trims the Finding columns fetched; test_id must stay listed so select_related can join.
+    excess_dupes = (
+        Finding.objects
+        .filter(duplicate=True, duplicate_finding__in=originals_with_excess)
+        .annotate(newer_cnt=Coalesce(newer_siblings_subq, 0))
+        .filter(newer_cnt__gte=dupe_max)
+        .select_related("test__engagement__product")
+        .only("id", "test_id")
+        .order_by("id")
+        [:total_duplicate_delete_count_max_per_run]
+    )
+
+    ids_to_delete = []
+    affected_products = set()
+    for finding in excess_dupes:
+        ids_to_delete.append(finding.id)
+        affected_products.add(finding.test.engagement.product)
+
+    logger.info("total number of excess duplicates to delete: %s", len(ids_to_delete))
+
+    if ids_to_delete:
+        # order_desc=True deletes higher ids before lower ids, consistent with how
+        # finding_delete handles duplicate clusters (duplicate_cluster.order_by("-id").delete()).
+        bulk_delete_findings(Finding.objects.filter(id__in=ids_to_delete), order_desc=True)
+
+        logger.info("total number of excess duplicates deleted: %s", len(ids_to_delete))
+
+    # Batch product grading for all affected products
+    if affected_products:
+        system_settings = System_Settings.objects.get()
+        if system_settings.enable_product_grade:
+            logger.info("performing batch product grading for %s products", len(affected_products))
+            for product in affected_products:
+                dojo_dispatch_task(calculate_grade, product.id)
 
 
 @app.task(ignore_result=False, base=Task)
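
Reviewer note (not part of the patch): the subquery's ">= dupe_max newer
siblings" rule is equivalent to keeping the dupe_max newest rows per
original. A minimal pure-Python sketch of that equivalence; excess_ids,
the integer dates, and the sample data are illustrative only:

    def excess_ids(dupes, dupe_max):
        """dupes: list of (date, id) tuples for one original's duplicates.
        Returns the ids the correlated-subquery rule would mark as excess."""
        excess = []
        for date, pk in dupes:
            # Count siblings that are strictly newer: later date, or same
            # date with a higher id (the patch's tie-breaker).
            newer = sum(
                1 for d, p in dupes
                if d > date or (d == date and p > pk)
            )
            if newer >= dupe_max:
                excess.append(pk)
        return excess

    # Five duplicates, keep the 2 newest (ids 13 and 14): 10, 11, 12 are excess.
    dupes = [(1, 10), (1, 11), (2, 12), (2, 13), (3, 14)]
    assert sorted(excess_ids(dupes, dupe_max=2)) == [10, 11, 12]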
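
The correlated COUNT is a reusable ORM pattern, so here is a hedged sketch
of its general shape factored into a helper. The helper name and the
generic field names (group_field, date, id) are assumptions for
illustration, not DefectDojo API:

    from django.db.models import Count, IntegerField, OuterRef, Q, Subquery
    from django.db.models.functions import Coalesce

    def with_newer_sibling_count(qs, group_field):
        """Annotate each row with the number of same-group rows that are
        strictly newer (later date, or same date and a higher id)."""
        newer = Subquery(
            qs.model.objects
            .filter(**{group_field: OuterRef(group_field)})
            .filter(Q(date__gt=OuterRef("date"))
                    | Q(date=OuterRef("date"), id__gt=OuterRef("id")))
            .order_by()            # clear default ordering so it cannot leak into GROUP BY
            .values(group_field)   # GROUP BY the correlated key -> one row per outer row
            .annotate(cnt=Count("id"))
            .values("cnt"),        # project only the count
            output_field=IntegerField(),
        )
        # A scalar subquery that matches no rows yields SQL NULL, hence Coalesce.
        return qs.annotate(newer_cnt=Coalesce(newer, 0))

    # Usage against the patch's models, equivalent to the inline version:
    # with_newer_sibling_count(Finding.objects.filter(duplicate=True),
    #                          "duplicate_finding_id").filter(newer_cnt__gte=dupe_max)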
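
If a regression test is wanted for the N+1 claim, Django can count the
queries directly. A sketch assuming a configured DefectDojo test database
with some duplicate findings present; the 50-row slice is arbitrary:

    from django.db import connection
    from django.test.utils import CaptureQueriesContext

    from dojo.models import Finding

    with CaptureQueriesContext(connection) as ctx:
        findings = (
            Finding.objects
            .filter(duplicate=True)
            .select_related("test__engagement__product")
            .only("id", "test_id")[:50]
        )
        # Every test -> engagement -> product hop is served from the join cache.
        products = {f.test.engagement.product for f in findings}

    assert len(ctx.captured_queries) == 1  # one joined SELECT, no N+1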