From 6bf6032326bb0d9174f9f7d54f07c87bfc50746f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 11 Nov 2025 16:28:00 +0000 Subject: [PATCH] Correctly handle cases where book location and/or filename are already correct --- backend/src/cms_backend/processors/book.py | 98 +++++++- backend/src/cms_backend/processors/title.py | 34 +-- backend/src/cms_backend/utils/filename.py | 17 +- backend/tests/processors/test_book.py | 238 +++++++++++++++++- .../test_book_location_integration.py | 234 +++++++++++++++++ 5 files changed, 592 insertions(+), 29 deletions(-) diff --git a/backend/src/cms_backend/processors/book.py b/backend/src/cms_backend/processors/book.py index 2cf1e86..a509474 100644 --- a/backend/src/cms_backend/processors/book.py +++ b/backend/src/cms_backend/processors/book.py @@ -1,9 +1,13 @@ +from uuid import UUID + from sqlalchemy.orm import Session as ORMSession from cms_backend import logger -from cms_backend.db.models import Book, Title +from cms_backend.db.book import create_book_location +from cms_backend.db.models import Book, Title, TitleWarehousePath from cms_backend.db.title import get_title_by_name_and_producer_or_none from cms_backend.utils.datetime import getnow +from cms_backend.utils.filename import compute_target_filename def check_book_qa(book: Book) -> bool: @@ -72,3 +76,95 @@ def get_matching_title(session: ORMSession, book: Book) -> Title | None: logger.exception(f"Failed to get matching title for {book.id}") book.status = "errored" return None + + +def _current_locations_match_targets( + book: Book, + target_locations: list[tuple[UUID, str]], +) -> bool: + """Check if book's current locations exactly match the target locations. + + Args: + book: The book to check + target_locations: List of (warehouse_path_id, filename) tuples representing + target locations + + Returns: + True if the set of current locations is strictly identical to target locations + """ + # Extract current locations as set of (warehouse_path_id, filename) tuples + current_set = { + (loc.warehouse_path_id, loc.filename) + for loc in book.locations + if loc.status == "current" + } + + # Convert target list to set + target_set = set(target_locations) + + # Must be strictly identical + return current_set == target_set + + +def create_book_target_locations( + session: ORMSession, + book: Book, + target_warehouse_paths: list[TitleWarehousePath], +) -> None: + """Create target locations for a book if not already at expected locations. + + Computes target locations based on the provided warehouse paths and filename, + then checks if the book's current locations already match. If they do, no new + target locations are created. Otherwise, target locations are created for each + warehouse path. + + Args: + session: SQLAlchemy session + book: Book to create target locations for + target_warehouse_paths: List of TitleWarehousePath objects defining where the + book should be + + Side effects: + - Adds event to book if targets already match current locations + - Creates BookLocation records if targets don't match current locations + """ + + if not book.name: + raise Exception("book name is missing or invalid") + + if not book.date: + raise Exception("book date is missing or invalid") + + # Compute target filename once for this book + target_filename = compute_target_filename( + session, + name=book.name, + flavour=book.flavour, + date=book.date, + book_id=book.id, + ) + + # Compute all target locations as (warehouse_path_id, filename) tuples + target_locations = [ + (title_warehouse_path.warehouse_path_id, target_filename) + for title_warehouse_path in target_warehouse_paths + ] + + # Check if current locations already match targets exactly + if _current_locations_match_targets(book, target_locations): + # Book is already at all expected locations - skip creating targets + book.events.append( + f"{getnow()}: book already at all target locations, skipping target " + "creation" + ) + return + + # Create target locations for each applicable warehouse path + for title_warehouse_path in target_warehouse_paths: + create_book_location( + session=session, + book=book, + warehouse_path_id=title_warehouse_path.warehouse_path_id, + filename=target_filename, + status="target", + ) diff --git a/backend/src/cms_backend/processors/title.py b/backend/src/cms_backend/processors/title.py index 59f3a39..66c1728 100644 --- a/backend/src/cms_backend/processors/title.py +++ b/backend/src/cms_backend/processors/title.py @@ -2,10 +2,9 @@ from sqlalchemy.orm import Session as OrmSession from cms_backend import logger -from cms_backend.db.book import create_book_location from cms_backend.db.models import Book, Title, TitleWarehousePath +from cms_backend.processors.book import create_book_target_locations from cms_backend.utils.datetime import getnow -from cms_backend.utils.filename import compute_target_filename def add_book_to_title(session: OrmSession, book: Book, title: Title): @@ -18,16 +17,14 @@ def add_book_to_title(session: OrmSession, book: Book, title: Title): if not book.date: raise Exception("book date is missing or invalid") - name = book.name - title.books.append(book) book.events.append(f"{getnow()}: book added to title {title.id}") title.events.append(f"{getnow()}: book {book.id} added to title") book.status = "processed" - if name and title.name != name: - title.events.append(f"{getnow()}: updating title name to {name}") - title.name = name + if title.name != book.name: + title.events.append(f"{getnow()}: updating title name to {book.name}") + title.name = book.name # Update title producer display fields from book if title.producer_display_name != book.producer_display_name: @@ -44,14 +41,6 @@ def add_book_to_title(session: OrmSession, book: Book, title: Title): ) title.producer_display_url = book.producer_display_url - # Compute target filename once for this book - target_filename = compute_target_filename( - session, - name=name, - flavour=book.flavour, - date=book.date, - ) - # Determine which warehouse paths to use based on title.in_prod path_type = "prod" if title.in_prod else "dev" @@ -62,15 +51,12 @@ def add_book_to_title(session: OrmSession, book: Book, title: Title): ) target_warehouse_paths = session.scalars(stmt).all() - # Create target locations for each applicable warehouse path - for title_warehouse_path in target_warehouse_paths: - create_book_location( - session=session, - book=book, - warehouse_path_id=title_warehouse_path.warehouse_path_id, - filename=target_filename, - status="target", - ) + # Create target locations if not already at expected locations + create_book_target_locations( + session=session, + book=book, + target_warehouse_paths=list(target_warehouse_paths), + ) except Exception as exc: book.events.append( diff --git a/backend/src/cms_backend/utils/filename.py b/backend/src/cms_backend/utils/filename.py index 869e984..21b558b 100644 --- a/backend/src/cms_backend/utils/filename.py +++ b/backend/src/cms_backend/utils/filename.py @@ -1,5 +1,7 @@ """Utilities for computing and managing book target filenames.""" +from uuid import UUID + from sqlalchemy import select from sqlalchemy.orm import Session as OrmSession @@ -55,7 +57,12 @@ def get_next_suffix(current_suffix: str) -> str: def compute_target_filename( - session: OrmSession, *, name: str, flavour: str | None, date: str + session: OrmSession, + *, + name: str, + flavour: str | None, + date: str, + book_id: UUID | None = None, ) -> str: """ Compute target filename: {name}[_{flavour}]_{period}[suffix] @@ -67,7 +74,8 @@ def compute_target_filename( - YYYY-MMaa, YYYY-MMab, ... (multiple letters) Finds the last suffix already in use and generates the next one. - Queries ALL book locations (any status) with filenames starting with base pattern. + Queries ALL book locations (any status) with filenames starting with base pattern, + excluding locations from the current book to avoid self-collision. Important edge cases: - Books with same name but different flavours (including no flavour) never collide @@ -79,6 +87,7 @@ def compute_target_filename( name: Book name flavour: Book flavour (optional) date: Book date (format: YYYY-MM-DD) + book_id: ID of the book being processed (to exclude its own locations) Returns: Target filename including .zim extension @@ -107,9 +116,13 @@ def compute_target_filename( # Query all locations where filename starts with this pattern # Check ALL locations regardless of status (current or target) + # Exclude the current book's own locations to avoid self-collision stmt = select(BookLocation.filename).where( BookLocation.filename.like(f"{base_pattern}%") ) + if book_id is not None: + stmt = stmt.where(BookLocation.book_id != book_id) + existing_filenames = list(session.scalars(stmt).all()) # If no existing files, use base pattern (no suffix) diff --git a/backend/tests/processors/test_book.py b/backend/tests/processors/test_book.py index 791d7fc..e7823bf 100644 --- a/backend/tests/processors/test_book.py +++ b/backend/tests/processors/test_book.py @@ -5,8 +5,12 @@ import pytest from sqlalchemy.orm import Session as OrmSession -from cms_backend.db.models import Book, Title -from cms_backend.processors.book import check_book_qa, get_matching_title +from cms_backend.db.models import Book, BookLocation, Title, WarehousePath +from cms_backend.processors.book import ( + _current_locations_match_targets, # pyright: ignore[reportPrivateUsage] + check_book_qa, + get_matching_title, +) MINIMUM_ZIM_METADATA = { "Name": "test_en_all", @@ -186,3 +190,233 @@ def test_get_matching_title_bad_error( if re.match(".*: error encountered while get matching title", event) ) assert len(title.events) == 0 + + +class TestCurrentLocationsMatchTargets: + """Test the _current_locations_match_targets helper function.""" + + def test_exact_match_single_location( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book with single current location matching single target should return + True.""" + book = create_book() + warehouse_path = create_warehouse_path() + + # Add current location + current_location = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + current_location.warehouse_path_id = warehouse_path.id + book.locations.append(current_location) + dbsession.add(current_location) + dbsession.flush() + + # Target matches current + target_locations = [(warehouse_path.id, "test_book_2024-01.zim")] + + assert _current_locations_match_targets(book, target_locations) is True + + def test_exact_match_multiple_locations( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book with multiple locations matching all targets should return True.""" + book = create_book() + path1 = create_warehouse_path(folder_name="path1") + path2 = create_warehouse_path(folder_name="path2") + + # Add current locations + loc1 = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + loc1.warehouse_path_id = path1.id + loc2 = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + loc2.warehouse_path_id = path2.id + book.locations.extend([loc1, loc2]) + dbsession.add(loc1) + dbsession.add(loc2) + dbsession.flush() + + # Targets match all currents + target_locations = [ + (path1.id, "test_book_2024-01.zim"), + (path2.id, "test_book_2024-01.zim"), + ] + + assert _current_locations_match_targets(book, target_locations) is True + + def test_no_match_different_filenames( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book with different filename than target should return False.""" + book = create_book() + warehouse_path = create_warehouse_path() + + # Add current location with different filename + current_location = BookLocation( + book_id=book.id, + status="current", + filename="old_filename_2024-01.zim", + ) + current_location.warehouse_path_id = warehouse_path.id + book.locations.append(current_location) + dbsession.add(current_location) + dbsession.flush() + + # Target has different filename + target_locations = [(warehouse_path.id, "test_book_2024-01.zim")] + + assert _current_locations_match_targets(book, target_locations) is False + + def test_no_match_different_warehouse_path( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book at different warehouse than target should return False.""" + book = create_book() + path1 = create_warehouse_path(folder_name="path1") + path2 = create_warehouse_path(folder_name="path2") + + # Add current location at path1 + current_location = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + current_location.warehouse_path_id = path1.id + book.locations.append(current_location) + dbsession.add(current_location) + dbsession.flush() + + # Target specifies path2 + target_locations = [(path2.id, "test_book_2024-01.zim")] + + assert _current_locations_match_targets(book, target_locations) is False + + def test_no_match_subset_current_locations( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book with only 1 current location should not match 2 target locations.""" + book = create_book() + path1 = create_warehouse_path(folder_name="path1") + path2 = create_warehouse_path(folder_name="path2") + + # Add current location at only path1 + current_location = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + current_location.warehouse_path_id = path1.id + book.locations.append(current_location) + dbsession.add(current_location) + dbsession.flush() + + # Targets specify both paths + target_locations = [ + (path1.id, "test_book_2024-01.zim"), + (path2.id, "test_book_2024-01.zim"), + ] + + assert _current_locations_match_targets(book, target_locations) is False + + def test_no_match_superset_current_locations( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book with more current locations than targets should return False.""" + book = create_book() + path1 = create_warehouse_path(folder_name="path1") + path2 = create_warehouse_path(folder_name="path2") + + # Add current locations at both paths + loc1 = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + loc1.warehouse_path_id = path1.id + loc2 = BookLocation( + book_id=book.id, + status="current", + filename="test_book_2024-01.zim", + ) + loc2.warehouse_path_id = path2.id + book.locations.extend([loc1, loc2]) + dbsession.add(loc1) + dbsession.add(loc2) + dbsession.flush() + + # Targets specify only one path + target_locations = [(path1.id, "test_book_2024-01.zim")] + + assert _current_locations_match_targets(book, target_locations) is False + + def test_no_match_empty_current_locations( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Book with no current locations should not match non-empty targets.""" + book = create_book() + warehouse_path = create_warehouse_path() + + # Book has no locations + dbsession.flush() + + # Targets specify locations + target_locations = [(warehouse_path.id, "test_book_2024-01.zim")] + + assert _current_locations_match_targets(book, target_locations) is False + + def test_ignores_target_status_locations( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_warehouse_path: Callable[..., WarehousePath], + ): + """Helper should ignore target status locations and only check current.""" + book = create_book() + warehouse_path = create_warehouse_path() + + # Add only target location (no current locations) + target_location = BookLocation( + book_id=book.id, + status="target", + filename="test_book_2024-01.zim", + ) + target_location.warehouse_path_id = warehouse_path.id + book.locations.append(target_location) + dbsession.add(target_location) + dbsession.flush() + + # Targets specify locations + target_locations = [(warehouse_path.id, "test_book_2024-01.zim")] + + # Should return False because there are no current locations + assert _current_locations_match_targets(book, target_locations) is False diff --git a/backend/tests/processors/test_book_location_integration.py b/backend/tests/processors/test_book_location_integration.py index 0f35a09..ca694df 100644 --- a/backend/tests/processors/test_book_location_integration.py +++ b/backend/tests/processors/test_book_location_integration.py @@ -385,3 +385,237 @@ def test_target_filename_collision_handling( assert len(target_locations) == 1 # Should get letter suffix to avoid collision assert target_locations[0].filename == "test_en_all_2024-01a.zim" + + +class TestTargetLocationOptimization: + """Test that target locations are skipped when they match current locations.""" + + def test_no_target_when_current_matches_single_path( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_book_location: Callable[..., BookLocation], + create_title: Callable[..., Title], + warehouse_setup: dict[str, Any], + ): + """When book current location exactly matches target, no target should + be created.""" + dev_path = warehouse_setup["dev_path"] + + # Create a book manually with a current location that will match the computed + # target + book = create_book( + name="wikipedia_fr_all", + date="2024-02-15", + flavour=None, + producer_unique_id=GOOD_PRODUCER["uniqueId"], + ) + + # Add current location with target-style filename + create_book_location( + book=book, + warehouse_path=dev_path, + filename="wikipedia_fr_all_2024-02.zim", + status="current", + ) + + # Create title with same warehouse path + title = create_title( + name="wikipedia_fr_all", + producer_unique_id=GOOD_PRODUCER["uniqueId"], + dev_warehouse_path_ids=[dev_path.id], + prod_warehouse_path_ids=[], + in_prod=False, + ) + + dbsession.flush() + + # Add book to title - should skip target creation + from cms_backend.processors.title import add_book_to_title + + add_book_to_title(dbsession, book, title) + dbsession.flush() + + current_locations = [loc for loc in book.locations if loc.status == "current"] + target_locations = [loc for loc in book.locations if loc.status == "target"] + + # Should have current location + assert len(current_locations) == 1 + assert current_locations[0].warehouse_path_id == dev_path.id + assert current_locations[0].filename == "wikipedia_fr_all_2024-02.zim" + + # Should NOT have target location (optimization applied) + assert len(target_locations) == 0 + + # Check event log + assert any( + "book already at all target locations" in event for event in book.events + ) + + def test_no_target_when_current_matches_multiple_paths( + self, + dbsession: OrmSession, + create_book: Callable[..., Book], + create_title: Callable[..., Title], + create_book_location: Callable[..., BookLocation], + warehouse_setup: dict[str, Any], + create_warehouse_path: Callable[..., WarehousePath], + ): + dev_path_1 = warehouse_setup["dev_path"] + dev_path_2 = create_warehouse_path( + warehouse=warehouse_setup["dev_warehouse"], + folder_name="dev-zim-backup", + ) + + # Create a book with current locations at both paths + book = create_book( + name="wiktionary_es_all", + date="2024-03-10", + flavour="maxi", + producer_unique_id=GOOD_PRODUCER["uniqueId"], + ) + + # Add current locations at both paths with target-style filename + create_book_location( + book=book, + warehouse_path=dev_path_1, + filename="wiktionary_es_all_maxi_2024-03.zim", + status="current", + ) + create_book_location( + book=book, + warehouse_path=dev_path_2, + filename="wiktionary_es_all_maxi_2024-03.zim", + status="current", + ) + + # Create title with multiple dev paths + title = create_title( + name="wiktionary_es_all", + producer_unique_id=GOOD_PRODUCER["uniqueId"], + dev_warehouse_path_ids=[dev_path_1.id, dev_path_2.id], + prod_warehouse_path_ids=[], + in_prod=False, + ) + + dbsession.flush() + + # Add book to title - should skip target creation + from cms_backend.processors.title import add_book_to_title + + add_book_to_title(dbsession, book, title) + dbsession.flush() + + current_locations = [loc for loc in book.locations if loc.status == "current"] + target_locations = [loc for loc in book.locations if loc.status == "target"] + + # Should have 2 current locations + assert len(current_locations) == 2 + current_path_ids = {loc.warehouse_path_id for loc in current_locations} + assert current_path_ids == {dev_path_1.id, dev_path_2.id} + + # Should NOT have target locations (optimization applied) + assert len(target_locations) == 0 + + # Check event log + assert any( + "book already at all target locations" in event for event in book.events + ) + + def test_target_created_when_partial_match( + self, + dbsession: OrmSession, + create_zimfarm_notification: Callable[..., ZimfarmNotification], + create_title: Callable[..., Title], + warehouse_setup: dict[str, Any], + create_warehouse_path: Callable[..., WarehousePath], + good_notification_content: dict[str, Any], + ): + """When book only matches some target paths, all targets should be created.""" + dev_path_1 = warehouse_setup["dev_path"] + dev_path_2 = create_warehouse_path( + warehouse=warehouse_setup["dev_warehouse"], + folder_name="dev-zim-backup", + ) + + # Adjust filename to match computed target + good_notification_content["filename"] = "test_en_all_2024-01.zim" + + # Create title with multiple dev paths + create_title( + name="test_en_all", + producer_unique_id=GOOD_PRODUCER["uniqueId"], + dev_warehouse_path_ids=[dev_path_1.id, dev_path_2.id], + prod_warehouse_path_ids=[], + in_prod=False, + ) + + notification = create_zimfarm_notification(content=good_notification_content) + process_notification(dbsession, notification) + + dbsession.flush() + assert notification.book is not None + + book = notification.book + current_locations = [loc for loc in book.locations if loc.status == "current"] + target_locations = [loc for loc in book.locations if loc.status == "target"] + + # Should have 1 current location + assert len(current_locations) == 1 + assert current_locations[0].warehouse_path_id == dev_path_1.id + + # Should have 2 target locations (NO optimization, partial match) + assert len(target_locations) == 2 + target_path_ids = {loc.warehouse_path_id for loc in target_locations} + assert target_path_ids == {dev_path_1.id, dev_path_2.id} + + # Check event log - should NOT have optimization message + assert not any( + "book already at all target locations" in event for event in book.events + ) + + def test_target_created_when_filename_differs( + self, + dbsession: OrmSession, + create_zimfarm_notification: Callable[..., ZimfarmNotification], + create_title: Callable[..., Title], + warehouse_setup: dict[str, Any], + good_notification_content: dict[str, Any], + ): + """When book filename differs from computed target, target should be created.""" + dev_path = warehouse_setup["dev_path"] + + # Keep original filename that differs from computed target + # Original: test_en_all_2024-01-15.zim + # Computed target: test_en_all_2024-01.zim + + create_title( + name="test_en_all", + producer_unique_id=GOOD_PRODUCER["uniqueId"], + dev_warehouse_path_ids=[dev_path.id], + prod_warehouse_path_ids=[], + in_prod=False, + ) + + notification = create_zimfarm_notification(content=good_notification_content) + process_notification(dbsession, notification) + + dbsession.flush() + assert notification.book is not None + + book = notification.book + current_locations = [loc for loc in book.locations if loc.status == "current"] + target_locations = [loc for loc in book.locations if loc.status == "target"] + + # Should have current location with original filename + assert len(current_locations) == 1 + assert current_locations[0].filename == "test_en_all_2024-01-15.zim" + + # Should have target location with computed filename (NO optimization) + assert len(target_locations) == 1 + assert target_locations[0].filename == "test_en_all_2024-01.zim" + + # Check event log - should NOT have optimization message + assert not any( + "book already at all target locations" in event for event in book.events + )