Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 32 additions & 23 deletions doctr/datasets/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import os
import shutil
import traceback
from collections.abc import Callable
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -46,29 +47,37 @@ def _read_sample(self, index: int) -> tuple[Any, Any]:
raise NotImplementedError

def __getitem__(self, index: int) -> tuple[Any, Any]:
    """Return the ``(img, target)`` pair at ``index`` with all transforms applied.

    Pipeline: read the raw sample, then apply (in order) the pre-transforms,
    the image-only transforms, and the joint sample transforms.

    If reading or transforming the sample raises, the error is printed with a
    traceback and the first sample of the dataset is returned instead, so a
    single corrupted file does not abort a whole training epoch.

    Args:
        index: position of the sample in ``self.data``.

    Returns:
        A tuple of the (possibly transformed) image and its target.
    """
    try:
        # Read image
        img, target = self._read_sample(index)
        # Pre-transforms (format conversion at run-time etc.)
        if self._pre_transforms is not None:
            img, target = self._pre_transforms(img, target)

        if self.img_transforms is not None:
            # typing issue cf. https://github.com/python/mypy/issues/5485
            img = self.img_transforms(img)

        if self.sample_transforms is not None:
            # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
            if (
                isinstance(target, dict)
                and all(isinstance(item, np.ndarray) for item in target.values())
                and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
            ):
                # Apply the same geometric transform to the image once per class,
                # keeping the per-class transformed boxes; the last transformed
                # image wins (all classes share the same source image).
                img_transformed = _copy_tensor(img)
                for class_name, bboxes in target.items():
                    img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
                img = img_transformed
            else:
                img, target = self.sample_transforms(img, target)
    except Exception:
        # Best-effort recovery: report the failing file and fall back to sample 0.
        img_name = self.data[index][0]
        print()
        print(f"!!!ERROR in Dataset on filename {img_name}")
        traceback.print_exc()
        print()
        if index == 0:
            # Sample 0 is the fallback itself — re-raise instead of recursing forever.
            raise
        return self.__getitem__(0)

    return img, target

Expand Down
17 changes: 12 additions & 5 deletions doctr/datasets/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,21 @@ def __init__(

self.data: list[tuple[str, tuple[np.ndarray, list[str]]]] = []
np_dtype = np.float32

missing_files = []
for img_name, label in labels.items():
# File existence check
if not os.path.exists(os.path.join(self.root, img_name)):
raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")

geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)

self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
missing_files.append(img_name)
# raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
else:
geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
print("List of missing files:")
print(f"MISSING FILES: {len(missing_files)}")
from pprint import pprint

pprint(missing_files)

def format_polygons(
self, polygons: list | dict, use_polygons: bool, np_dtype: type
Expand Down
13 changes: 10 additions & 3 deletions doctr/datasets/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,18 @@ def __init__(
with open(labels_path, encoding="utf-8") as f:
labels = json.load(f)

missing_files = []
for img_name, label in labels.items():
if not os.path.exists(os.path.join(self.root, img_name)):
raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")

self.data.append((img_name, label))
missing_files.append(img_name)
# raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
else:
self.data.append((img_name, label))
print("List of missing files:")
print(f"MISSING FILES: {len(missing_files)}")
from pprint import pprint

pprint(missing_files)

def merge_dataset(self, ds: AbstractDataset) -> None:
# Update data with new root for self
Expand Down
1 change: 1 addition & 0 deletions references/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
-e .
tqdm
slack-sdk
boto3>=1.9
wandb>=0.10.31
clearml>=1.11.1
matplotlib>=3.1.0
Loading