132 changes: 76 additions & 56 deletions dojo/tasks.py
@@ -7,12 +7,13 @@
from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.core.management import call_command
-from django.db.models import Count, Prefetch
+from django.db.models import Count, IntegerField, OuterRef, Q, Subquery
+from django.db.models.functions import Coalesce

from dojo.auditlog import run_flush_auditlog
from dojo.celery import app
from dojo.celery_dispatch import dojo_dispatch_task
-from dojo.finding.helper import fix_loop_duplicates
+from dojo.finding.helper import bulk_delete_findings, fix_loop_duplicates
from dojo.management.commands.jira_status_reconciliation import jira_status_reconciliation
from dojo.models import Finding, System_Settings
from dojo.utils import calculate_grade
@@ -54,60 +55,79 @@ def _async_dupe_delete_impl():
logger.info("skipping deletion of excess duplicates: max_dupes not configured")
return

-    if enabled:
-        logger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
-        deduplicationLogger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)

-        # limit to settings.DUPE_DELETE_MAX_PER_RUN to prevent overlapping jobs
-        results = Finding.objects \
-            .filter(duplicate=True) \
-            .order_by() \
-            .values("duplicate_finding") \
-            .annotate(num_dupes=Count("id")) \
-            .filter(num_dupes__gt=dupe_max)[:total_duplicate_delete_count_max_per_run]

-        originals_with_too_many_duplicates_ids = [result["duplicate_finding"] for result in results]

-        originals_with_too_many_duplicates = Finding.objects.filter(id__in=originals_with_too_many_duplicates_ids).order_by("id")

-        # prefetch to make it faster
-        # Oldest-first: delete from the front of the list until dupe_count <= 0, keeping the last max_dupes.
-        # order_by("date") alone leaves ties undefined when many duplicates share the same date (e.g. tool date);
-        # add id so we always drop lower-id (older) rows first and retain higher-id (newer) imports.
-        originals_with_too_many_duplicates = originals_with_too_many_duplicates.prefetch_related(Prefetch("original_finding",
-            queryset=Finding.objects.filter(duplicate=True).order_by("date", "id")))

-        total_deleted_count = 0
-        affected_products = set()
-        for original in originals_with_too_many_duplicates:
-            duplicate_list = original.original_finding.all()
-            dupe_count = len(duplicate_list) - dupe_max

-            for finding in duplicate_list:
-                deduplicationLogger.debug(f"deleting finding {finding.id}:{finding.title} ({finding.hash_code}))")
-                # Collect the product for batch grading later
-                affected_products.add(finding.test.engagement.product)
-                # Skip individual product grading during deletion
-                finding.delete(product_grading_option=False)
-                total_deleted_count += 1
-                dupe_count -= 1
-                if dupe_count <= 0:
-                    break
-                if total_deleted_count >= total_duplicate_delete_count_max_per_run:
-                    break

-            if total_deleted_count >= total_duplicate_delete_count_max_per_run:
-                break

-        logger.info("total number of excess duplicates deleted: %s", total_deleted_count)

-        # Batch product grading for all affected products
-        if affected_products:
-            system_settings = System_Settings.objects.get()
-            if system_settings.enable_product_grade:
-                logger.info("performing batch product grading for %s products", len(affected_products))
-                for product in affected_products:
-                    dojo_dispatch_task(calculate_grade, product.id)
+    if not enabled:
+        return

+    logger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)
+    deduplicationLogger.info("delete excess duplicates (max_dupes per finding: %s, max deletes per run: %s)", dupe_max, total_duplicate_delete_count_max_per_run)

+    # Originals that currently have more duplicates than dupe_max allows.
+    originals_with_excess = (
+        Finding.objects
+        .filter(duplicate=True)
+        .order_by()
+        .values("duplicate_finding")
+        .annotate(num_dupes=Count("id"))
+        .filter(num_dupes__gt=dupe_max)
+        .values("duplicate_finding")
+    )
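    # Worked example (illustrative, not part of the change): with dupe_max = 5,
    # an original that has 8 duplicates matches here (num_dupes = 8 > 5), while
    # one with exactly 5 duplicates does not, so its group is left untouched.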

+    # For each candidate duplicate, count siblings of the same original that are strictly newer
+    # (later date, or same date but higher id — matches the keep-newest ordering).
+    # A finding is excess when >= dupe_max newer siblings exist: it falls outside the kept window.
+    # Coalesce(..., 0) handles the no-newer-siblings case (subquery returns NULL via empty GROUP BY).
+    newer_siblings_subq = Subquery(
+        Finding.objects
+        .filter(
+            duplicate=True,
+            duplicate_finding_id=OuterRef("duplicate_finding_id"),
+        )
+        .filter(
+            Q(date__gt=OuterRef("date")) | Q(date=OuterRef("date"), id__gt=OuterRef("id")),
+        )
+        .order_by()
+        .values("duplicate_finding_id")
+        .annotate(cnt=Count("id"))
+        .values("cnt"),
+        output_field=IntegerField(),
+    )
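    # Worked example (illustrative, not part of the change): with dupe_max = 2 and
    # duplicates id=10 (date=D1), id=11 (date=D1), id=12 (date=D2) under one original,
    # newer_cnt is 2 for id=10, 1 for id=11 and 0 for id=12, so only id=10 reaches
    # newer_cnt >= dupe_max and is treated as excess; the newest two survive.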

+    # Single query: selects the excess (oldest) duplicates; the newest dupe_max per original group are kept.
+    # select_related avoids N+1 queries when collecting affected products below.
+    # only() limits Finding columns fetched; test_id is required for the select_related join.
+    excess_dupes = (
+        Finding.objects
+        .filter(duplicate=True, duplicate_finding__in=originals_with_excess)
+        .annotate(newer_cnt=Coalesce(newer_siblings_subq, 0))
+        .filter(newer_cnt__gte=dupe_max)
+        .select_related("test__engagement__product")
+        .only("id", "test_id")
+        .order_by("id")
+        [:total_duplicate_delete_count_max_per_run]
+    )
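    # Rough shape of the SQL this evaluates to (an assumed illustration; the exact
    # statement depends on the database backend):
    #   SELECT id, test_id FROM dojo_finding
    #   WHERE duplicate AND duplicate_finding_id IN (<originals_with_excess>)
    #     AND COALESCE((SELECT COUNT(*) FROM <newer-siblings subquery>), 0) >= <dupe_max>
    #   ORDER BY id
    #   LIMIT <total_duplicate_delete_count_max_per_run>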

+    ids_to_delete = []
+    affected_products = set()
+    for finding in excess_dupes:
+        ids_to_delete.append(finding.id)
+        affected_products.add(finding.test.engagement.product)

+    logger.info("total number of excess duplicates to delete: %s", len(ids_to_delete))

+    if ids_to_delete:
+        # order_desc=True deletes higher ids before lower ids, consistent with how
+        # finding_delete handles duplicate clusters (duplicate_cluster.order_by("-id").delete()).
+        bulk_delete_findings(Finding.objects.filter(id__in=ids_to_delete), order_desc=True)
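    # Illustration (not part of the change): with ids_to_delete = [101, 205, 330],
    # order_desc=True removes 330 first and 101 last, matching the cluster handling
    # in finding_delete cited above.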

logger.info("total number of excess duplicates deleted: %s", len(ids_to_delete))

# Batch product grading for all affected products
if affected_products:
system_settings = System_Settings.objects.get()
if system_settings.enable_product_grade:
logger.info("performing batch product grading for %s products", len(affected_products))
for product in affected_products:
dojo_dispatch_task(calculate_grade, product.id)


@app.task(ignore_result=False, base=Task)
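
The correlated-count approach above generalizes to any keep-the-newest-N-rows-per-group pruning job. A minimal self-contained sketch under assumed names (hypothetical Item model; none of this is code from the PR):

from django.db import models
from django.db.models import Count, IntegerField, OuterRef, Q, Subquery
from django.db.models.functions import Coalesce


class Item(models.Model):
    # Hypothetical model, used only for this sketch.
    group_id = models.IntegerField()
    date = models.DateField()


def excess_items(keep_newest):
    # Count siblings in the same group that are strictly newer: later date, or
    # same date with a higher id (the same tie-break the PR uses).
    newer = Subquery(
        Item.objects
        .filter(group_id=OuterRef("group_id"))
        .filter(Q(date__gt=OuterRef("date")) | Q(date=OuterRef("date"), id__gt=OuterRef("id")))
        .order_by()
        .values("group_id")
        .annotate(cnt=Count("id"))
        .values("cnt"),
        output_field=IntegerField(),
    )
    # Rows with keep_newest or more strictly-newer siblings fall outside the
    # kept window and are the ones to prune, oldest ids first.
    return (
        Item.objects
        .annotate(newer_cnt=Coalesce(newer, 0))
        .filter(newer_cnt__gte=keep_newest)
        .order_by("id")
    )

Deleting the returned queryset in descending id order would then mirror the PR's bulk_delete_findings(..., order_desc=True) call.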