From 146a59d9ed8ef797c47f7bd00c52191c7853a1a0 Mon Sep 17 00:00:00 2001 From: agam-99 Date: Mon, 30 Mar 2026 12:07:23 +0530 Subject: [PATCH 1/2] [GOBBLIN-XXXX] Skip directory entries in RecursiveCopyableDataset to avoid IOException on empty source dirs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When source.path is an empty directory, FileListUtils.listFilesToCopyAtPath (includeEmptyDirectories=true) returns the directory itself as the sole FileStatus entry. The subsequent call to resolveReplicatedOwnerAndPermissionsRecursively passes file.getPath().getParent() as fromPath — which is *above* replacedPrefix — inverting the ancestry check and throwing an IOException. Skip FileStatus entries where isDirectory=true; empty source directories produce no copy work units by design. Log a warning so operators can diagnose misconfigured source paths. --- .../copy/RecursiveCopyableDataset.java | 13 +++++ .../copy/RecursiveCopyableDatasetTest.java | 56 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java index 0c38f414e0a..fefe5ec3a25 100644 --- a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java +++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java @@ -149,6 +149,19 @@ protected Collection getCopyableFilesImpl(CopyConfiguratio for (Path path : toCopy) { FileStatus file = filesInSource.get(path); + + // When the source directory is empty, FileListUtils.listFilesToCopyAtPath returns the + // directory itself as the item to copy (includeEmptyDirectories=true). Calling + // file.getPath().getParent() on it produces a path *above* replacedPrefix, which + // inverts the ancestry check in resolveReplicatedOwnerAndPermissionsRecursively and + // causes an IOException. Skip directory entries — empty source directories produce no + // copy work units. + if (file.isDirectory()) { + log.warn("Skipping directory entry '{}' in source — empty directories produce no copy work units. " + + "If this directory was expected to contain files, verify the source path is populated.", file.getPath()); + continue; + } + Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), replacedPrefix); Path thisTargetPath = new Path(replacingPrefix, filePathRelativeToSearchPath); diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java index ed87a7c219d..b3d0033d12a 100644 --- a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java +++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java @@ -253,6 +253,62 @@ public boolean apply(@Nullable CopyEntity copyEntity) { Assert.assertEquals(step.getParentDeletionLimit().get(), target); } + /** + * When source.path points to an empty directory, FileListUtils.listFilesToCopyAtPath (with + * includeEmptyDirectories=true) returns the directory itself as the sole FileStatus entry. + * getCopyableFilesImpl must not crash trying to resolve ancestor permissions for it — instead + * it should produce zero copy work units. + */ + @Test + public void testEmptySourceDirectoryProducesNoCopyEntities() throws Exception { + Path source = new Path("/source"); + Path target = new Path("/target"); + + // Simulate what FileListUtils returns for an empty directory: the directory itself (isDirectory=true) + FileStatus emptyDirEntry = new FileStatus(0, true, 0, 0, 0, source); + List sourceFiles = Lists.newArrayList(emptyDirEntry); + List targetFiles = Lists.newArrayList(); + + Properties properties = new Properties(); + properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, target.toString()); + RecursiveCopyableDataset dataset = new TestRecursiveCopyableDataset(source, target, sourceFiles, targetFiles, properties); + + Collection copyEntities = dataset.getCopyableFiles(FileSystem.getLocal(new Configuration()), + CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).build()); + + Assert.assertEquals(copyEntities.size(), 0, + "Empty source directory should produce zero copy entities, not an IOException"); + } + + /** + * When source contains real files mixed with an empty subdirectory entry, only the real files + * should produce copy work units; the directory entry must be silently skipped. + */ + @Test + public void testMixedSourceWithDirectoryEntrySkipsDirectories() throws Exception { + Path source = new Path("/source"); + Path target = new Path("/target"); + + FileStatus realFile = createFileStatus(source, "file1"); + FileStatus emptySubDir = new FileStatus(0, true, 0, 0, 0, new Path(source, "emptySubDir")); + List sourceFiles = Lists.newArrayList(realFile, emptySubDir); + List targetFiles = Lists.newArrayList(); + + Properties properties = new Properties(); + properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, target.toString()); + RecursiveCopyableDataset dataset = new TestRecursiveCopyableDataset(source, target, sourceFiles, targetFiles, properties); + + Collection copyEntities = dataset.getCopyableFiles(FileSystem.getLocal(new Configuration()), + CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).build()); + + Assert.assertEquals(copyEntities.size(), 1); + ClassifiedFiles classifiedFiles = classifyFiles(copyEntities); + Assert.assertTrue(classifiedFiles.getPathsToCopy().containsKey(new Path(source, "file1")), + "Real file should be copied"); + Assert.assertFalse(classifiedFiles.getPathsToCopy().containsKey(new Path(source, "emptySubDir")), + "Directory entry should not appear as a copy entity"); + } + @Test public void testCorrectComputationOfTargetPathsWhenUsingGlob() throws Exception { Path source = new Path("/source/directory"); From 285235905ff47c73a120a4b7aea897821095cc71 Mon Sep 17 00:00:00 2001 From: agam-99 Date: Mon, 6 Apr 2026 17:39:20 +0530 Subject: [PATCH 2/2] [ETL-19035] Fix ancestor path resolution for empty source directories in RecursiveCopyableDataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When includeEmptyDirectories=true and the source root is empty, FileListUtils returns the root directory itself as a FileStatus entry. Calling .getParent() on it produces a path above replacedPrefix, breaking the ancestry check in resolveReplicatedOwnerAndPermissionsRecursively. Fix: guard with isAncestor(replacedPrefix, parentPath) — fall back to the file's own path only when parentPath is above the dataset root. This preserves correct ancestor permission replication for nested empty subdirs (where .getParent() is still within replacedPrefix) and ensures the empty root directory is replicated at the destination rather than skipped. Co-Authored-By: Claude Sonnet 4.6 --- .../copy/RecursiveCopyableDataset.java | 24 ++++------ .../copy/RecursiveCopyableDatasetTest.java | 44 +++++-------------- 2 files changed, 20 insertions(+), 48 deletions(-) diff --git a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java index fefe5ec3a25..0fefb66ab62 100644 --- a/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java +++ b/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDataset.java @@ -149,25 +149,19 @@ protected Collection getCopyableFilesImpl(CopyConfiguratio for (Path path : toCopy) { FileStatus file = filesInSource.get(path); - - // When the source directory is empty, FileListUtils.listFilesToCopyAtPath returns the - // directory itself as the item to copy (includeEmptyDirectories=true). Calling - // file.getPath().getParent() on it produces a path *above* replacedPrefix, which - // inverts the ancestry check in resolveReplicatedOwnerAndPermissionsRecursively and - // causes an IOException. Skip directory entries — empty source directories produce no - // copy work units. - if (file.isDirectory()) { - log.warn("Skipping directory entry '{}' in source — empty directories produce no copy work units. " - + "If this directory was expected to contain files, verify the source path is populated.", file.getPath()); - continue; - } - Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), replacedPrefix); Path thisTargetPath = new Path(replacingPrefix, filePathRelativeToSearchPath); + // Use the file's parent as the starting point for ancestor permission resolution, unless the + // parent is above replacedPrefix (happens when the source root is empty and FileListUtils + // returns the root directory itself). In that case use the file's own path so the walk + // terminates immediately with an empty ancestors list. + Path parentPath = file.getPath().getParent(); + Path ancestorFromPath = PathUtils.isAncestor(replacedPrefix, parentPath) ? parentPath : file.getPath(); + if (this.useNewPreserveLogic) { ancestorOwnerAndPermissions.putAll(CopyableFile - .resolveReplicatedAncestorOwnerAndPermissionsRecursively(this.fs, file.getPath().getParent(), + .resolveReplicatedAncestorOwnerAndPermissionsRecursively(this.fs, ancestorFromPath, replacedPrefix, configuration)); } @@ -176,7 +170,7 @@ protected Collection getCopyableFilesImpl(CopyConfiguratio .fileSet(datasetURN()) .datasetOutputPath(thisTargetPath.toString()) .ancestorsOwnerAndPermission(CopyableFile - .resolveReplicatedOwnerAndPermissionsRecursively(this.fs, file.getPath().getParent(), + .resolveReplicatedOwnerAndPermissionsRecursively(this.fs, ancestorFromPath, replacedPrefix, configuration)) .build(); copyableFile.setFsDatasets(this.fs, targetFs); diff --git a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java index b3d0033d12a..8a5b991cfee 100644 --- a/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java +++ b/gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/RecursiveCopyableDatasetTest.java @@ -256,11 +256,12 @@ public boolean apply(@Nullable CopyEntity copyEntity) { /** * When source.path points to an empty directory, FileListUtils.listFilesToCopyAtPath (with * includeEmptyDirectories=true) returns the directory itself as the sole FileStatus entry. - * getCopyableFilesImpl must not crash trying to resolve ancestor permissions for it — instead - * it should produce zero copy work units. + * getCopyableFilesImpl must replicate the empty directory at the destination without crashing — + * calling file.getPath().getParent() on the directory entry goes above the dataset root and + * breaks the ancestry check, so ancestor permissions must be set to empty for directory entries. */ @Test - public void testEmptySourceDirectoryProducesNoCopyEntities() throws Exception { + public void testEmptySourceDirectoryProducesCopyEntityForDirectory() throws Exception { Path source = new Path("/source"); Path target = new Path("/target"); @@ -276,37 +277,14 @@ public void testEmptySourceDirectoryProducesNoCopyEntities() throws Exception { Collection copyEntities = dataset.getCopyableFiles(FileSystem.getLocal(new Configuration()), CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).build()); - Assert.assertEquals(copyEntities.size(), 0, - "Empty source directory should produce zero copy entities, not an IOException"); - } - - /** - * When source contains real files mixed with an empty subdirectory entry, only the real files - * should produce copy work units; the directory entry must be silently skipped. - */ - @Test - public void testMixedSourceWithDirectoryEntrySkipsDirectories() throws Exception { - Path source = new Path("/source"); - Path target = new Path("/target"); - - FileStatus realFile = createFileStatus(source, "file1"); - FileStatus emptySubDir = new FileStatus(0, true, 0, 0, 0, new Path(source, "emptySubDir")); - List sourceFiles = Lists.newArrayList(realFile, emptySubDir); - List targetFiles = Lists.newArrayList(); - - Properties properties = new Properties(); - properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, target.toString()); - RecursiveCopyableDataset dataset = new TestRecursiveCopyableDataset(source, target, sourceFiles, targetFiles, properties); - - Collection copyEntities = dataset.getCopyableFiles(FileSystem.getLocal(new Configuration()), - CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).build()); - - Assert.assertEquals(copyEntities.size(), 1); + // The empty directory itself should be replicated at the destination (not silently dropped) + Assert.assertEquals(copyEntities.size(), 1, + "Empty source directory should produce one copy entity to replicate the directory at the destination"); ClassifiedFiles classifiedFiles = classifyFiles(copyEntities); - Assert.assertTrue(classifiedFiles.getPathsToCopy().containsKey(new Path(source, "file1")), - "Real file should be copied"); - Assert.assertFalse(classifiedFiles.getPathsToCopy().containsKey(new Path(source, "emptySubDir")), - "Directory entry should not appear as a copy entity"); + Assert.assertTrue(classifiedFiles.getPathsToCopy().containsKey(source), + "The empty directory itself should be the copy origin"); + Assert.assertEquals(classifiedFiles.getPathsToCopy().get(source), target, + "The empty directory should be mapped to the target path"); } @Test