From 06f7ccd40a8ad83c5f13f35daef650c398bc0829 Mon Sep 17 00:00:00 2001 From: Steven Winship <39765413+stevenwinship@users.noreply.github.com> Date: Fri, 29 May 2026 11:28:52 -0400 Subject: [PATCH 1/2] Remove TestIngest Class --- .../harvard/iq/dataverse/api/TestIngest.java | 193 --------------- .../harvard/iq/dataverse/api/TabularIT.java | 228 ------------------ .../edu/harvard/iq/dataverse/api/UtilIT.java | 5 - 3 files changed, 426 deletions(-) delete mode 100644 src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java delete mode 100644 src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java diff --git a/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java b/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java deleted file mode 100644 index add43ea2091..00000000000 --- a/src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * To change this license header, choose License Headers in Project Properties. - * To change this template file, choose Tools | Templates - * and open the template in the editor. 
- */ - -package edu.harvard.iq.dataverse.api; - -import edu.harvard.iq.dataverse.DataFile; -import edu.harvard.iq.dataverse.DataTable; -import edu.harvard.iq.dataverse.Dataset; -import edu.harvard.iq.dataverse.DatasetServiceBean; -import edu.harvard.iq.dataverse.FileMetadata; -import edu.harvard.iq.dataverse.ingest.IngestServiceBean; -import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader; -import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; -import edu.harvard.iq.dataverse.util.FileUtil; -import edu.harvard.iq.dataverse.util.StringUtil; -import java.io.BufferedInputStream; -import java.util.logging.Logger; -import jakarta.ejb.EJB; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; -import jakarta.ws.rs.GET; -import jakarta.ws.rs.Path; -import jakarta.ws.rs.PathParam; -import jakarta.ws.rs.Produces; -import jakarta.ws.rs.core.Context; -import jakarta.ws.rs.core.HttpHeaders; -import jakarta.ws.rs.core.UriInfo; -import jakarta.servlet.http.HttpServletResponse; -import jakarta.ws.rs.QueryParam; - - - -/** - * - * @author Leonid Andreev - * - * This API call was originally created for batch-testing 4.0 ingest. - * It runs the ingest code that creates the dataverse data objects - datavariables, - * datatable, etc. and generates a report with variable metadata - names, - * types, UNFs, but doesn't persist the objects in the database. - * It was put together as a bit of a hack - but we may have a place for it in - * the application - Gary has requested a mechanism for producing UNFs without - * actually ingesting the file (for sensitive data, etc.). - * So we'll probably beef up this API call a little bit (make it upload the - * file, etc.) and make a simple UI for it. - * - * -- L.A. Aug. 
2014 - */ - -@Path("ingest") -public class TestIngest { - private static final Logger logger = Logger.getLogger(TestIngest.class.getCanonicalName()); - - @EJB - DatasetServiceBean datasetService; - @EJB - IngestServiceBean ingestService; - - //@EJB - - @Path("test/file") - @GET - @Produces({ "text/plain" }) - public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fileType") String fileType, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ { - String output = ""; - - if (StringUtil.isEmpty(fileName) || StringUtil.isEmpty(fileType)) { - output = output.concat("Usage: /api/ingest/test/file?fileName=PATH&fileType=TYPE"); - return output; - } - - BufferedInputStream fileInputStream = null; - - - try { - fileInputStream = new BufferedInputStream(new FileInputStream(new File(fileName))); - } catch (FileNotFoundException notfoundEx) { - fileInputStream = null; - } - - if (fileInputStream == null) { - output = output.concat("Could not open file "+fileName+"."); - return output; - } - - TabularDataFileReader ingestPlugin = ingestService.getTabDataReaderByMimeType(fileType); - - if (ingestPlugin == null) { - output = output.concat("Could not locate an ingest plugin for type "+fileType+"."); - return output; - } - - TabularDataIngest tabDataIngest = null; - - try { - tabDataIngest = ingestPlugin.read(fileInputStream, false, null); - } catch (IOException ingestEx) { - output = output.concat("Caught an exception trying to ingest file " + fileName + ": " + ingestEx.getLocalizedMessage()); - return output; - } - - try { - if (tabDataIngest != null) { - File tabFile = tabDataIngest.getTabDelimitedFile(); - - if (tabDataIngest.getDataTable() != null - && tabFile != null - && tabFile.exists()) { - - String tabFilename = FileUtil.replaceExtension(fileName, "tab"); - - 
java.nio.file.Files.copy(Paths.get(tabFile.getAbsolutePath()), Paths.get(tabFilename), StandardCopyOption.REPLACE_EXISTING); - - DataTable dataTable = tabDataIngest.getDataTable(); - - DataFile dataFile = new DataFile(); - dataFile.setStorageIdentifier(tabFilename); - Dataset dataset = new Dataset(); - dataFile.setOwner(dataset); - - FileMetadata fileMetadata = new FileMetadata(); - fileMetadata.setLabel(fileName); - - dataFile.setDataTable(dataTable); - dataTable.setDataFile(dataFile); - - fileMetadata.setDataFile(dataFile); - dataFile.getFileMetadatas().add(fileMetadata); - - output = output.concat ("NVARS: "+dataTable.getVarQuantity()+"\n"); - output = output.concat ("NOBS: "+dataTable.getCaseQuantity()+"\n"); - - try { - ingestService.produceSummaryStatistics(dataFile, tabFile); - output = output.concat ("UNF: "+dataTable.getUnf()+"\n"); - } catch (IOException ioex) { - output = output.concat ("UNF: failed to calculate\n"+"\n"); - } - - for (int i = 0; i < dataTable.getVarQuantity(); i++) { - String vartype = ""; - - //if ("continuous".equals(dataTable.getDataVariables().get(i).getVariableIntervalType().getName())) { - if (dataTable.getDataVariables().get(i).isIntervalContinuous()) { - vartype = "numeric-continuous"; - } else { - if (dataTable.getDataVariables().get(i).isTypeNumeric()) { - vartype = "numeric-discrete"; - } else { - String formatCategory = dataTable.getDataVariables().get(i).getFormatCategory(); - if ("time".equals(formatCategory)) { - vartype = "character-time"; - } else if ("date".equals(formatCategory)) { - vartype = "character-date"; - } else { - vartype = "character"; - } - } - } - - output = output.concat ("VAR"+i+" "); - output = output.concat (dataTable.getDataVariables().get(i).getName()+" "); - output = output.concat (vartype+" "); - output = output.concat (dataTable.getDataVariables().get(i).getUnf()); - output = output.concat ("\n"); - - } - - } else { - output = output.concat("Ingest failed to produce tab file or data table for file 
"+fileName+"."); - return output; - } - } else { - output = output.concat("Ingest resulted in a null tabDataIngest object for file "+fileName+"."); - return output; - } - } catch (IOException ex) { - output = output.concat("Caught an exception trying to save ingested data for file "+fileName+"."); - return output; - } - - return output; - } - - -} \ No newline at end of file diff --git a/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java b/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java deleted file mode 100644 index 25eec16e17b..00000000000 --- a/src/test/java/edu/harvard/iq/dataverse/api/TabularIT.java +++ /dev/null @@ -1,228 +0,0 @@ -package edu.harvard.iq.dataverse.api; - -import io.restassured.RestAssured; -import io.restassured.path.json.JsonPath; -import io.restassured.response.Response; - -import java.io.File; -import java.util.Arrays; -import java.util.logging.Logger; -import static jakarta.ws.rs.core.Response.Status.CREATED; -import static jakarta.ws.rs.core.Response.Status.OK; -import static org.hamcrest.CoreMatchers.equalTo; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -public class TabularIT { - - private static final Logger logger = Logger.getLogger(TabularIT.class.getCanonicalName()); - - @BeforeAll - public static void setUpClass() { - RestAssured.baseURI = UtilIT.getRestAssuredBaseUri(); - } - - @Disabled - @Test - public void testTabularFile() throws InterruptedException { - Response createUser = UtilIT.createRandomUser(); - createUser.then().assertThat() - .statusCode(OK.getStatusCode()); - String username = UtilIT.getUsernameFromResponse(createUser); - String apiToken = UtilIT.getApiTokenFromResponse(createUser); - - Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken); - createDataverseResponse.prettyPrint(); - 
createDataverseResponse.then().assertThat() - .statusCode(CREATED.getStatusCode()); - String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse); - - Response createDatasetResponse = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken); - createDatasetResponse.prettyPrint(); - createDatasetResponse.then().assertThat() - .statusCode(CREATED.getStatusCode()); - Integer datasetId = JsonPath.from(createDatasetResponse.body().asString()).getInt("data.id"); - String persistentId = JsonPath.from(createDatasetResponse.body().asString()).getString("data.persistentId"); - logger.info("Dataset created with id " + datasetId + " and persistent id " + persistentId); - - String pathToFileThatGoesThroughIngest = "scripts/search/data/tabular/50by1000.dta"; - Response uploadIngestableFile = UtilIT.uploadFileViaNative(datasetId.toString(), pathToFileThatGoesThroughIngest, apiToken); - uploadIngestableFile.prettyPrint(); - uploadIngestableFile.then().assertThat() - .statusCode(OK.getStatusCode()); - long fileId = JsonPath.from(uploadIngestableFile.body().asString()).getLong("data.files[0].dataFile.id"); - String fileIdAsString = Long.toString(fileId); -// String filePersistentId = JsonPath.from(uploadIngestableFile.body().asString()).getString("data.files[0].dataFile.persistentId"); - System.out.println("fileId: " + fileId); -// System.out.println("filePersistentId: " + filePersistentId); - - // Give file time to ingest - - assertTrue(UtilIT.sleepForLock(datasetId.longValue(), "Ingest", apiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION), "Failed test if Ingest Lock exceeds max duration " + pathToFileThatGoesThroughIngest); - // Thread.sleep(10000); - - Response fileMetadataNoFormat = UtilIT.getFileMetadata(fileIdAsString, null, apiToken); - fileMetadataNoFormat.prettyPrint(); - fileMetadataNoFormat.then().assertThat() - .statusCode(OK.getStatusCode()) - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")); - - Response 
fileMetadataNoFormatFileId = UtilIT.getFileMetadata(fileIdAsString, null, apiToken); - fileMetadataNoFormatFileId.prettyPrint(); - fileMetadataNoFormatFileId.then().assertThat() - .statusCode(OK.getStatusCode()) - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")); - - Response fileMetadataDdi = UtilIT.getFileMetadata(fileIdAsString, "ddi", apiToken); - fileMetadataDdi.prettyPrint(); - fileMetadataDdi.then().assertThat() - .statusCode(OK.getStatusCode()) - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")) - .body("codeBook.dataDscr.var[0].@name", equalTo("var1")) - // Yes, it's odd that we go from "var1" to "var3" to "var2" to "var5" - .body("codeBook.dataDscr.var[1].@name", equalTo("var3")) - .body("codeBook.dataDscr.var[2].@name", equalTo("var2")) - .body("codeBook.dataDscr.var[3].@name", equalTo("var5")); - - boolean testPreprocessedMetadataFormat = false; - if (testPreprocessedMetadataFormat) { - // If you don't have all the dependencies in place, such as Rserve, you might get a 503 and this error: - // org.rosuda.REngine.Rserve.RserveException: Cannot connect: Connection refused - Response fileMetadataPreProcessed = UtilIT.getFileMetadata(fileIdAsString, "preprocessed", apiToken); - fileMetadataPreProcessed.prettyPrint(); - fileMetadataPreProcessed.then().assertThat() - .statusCode(OK.getStatusCode()) - .body("codeBook.fileDscr.fileTxt.fileName", equalTo("50by1000.tab")); - } - - } - - @Disabled - @Test - public void test50by1000() { - // cp scripts/search/data/tabular/50by1000.dta /tmp - String fileName = "/tmp/50by1000.dta"; - String fileType = "application/x-stata"; - Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - assertEquals("NVARS: 50", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata13TinyFile() { - // cp scripts/search/data/tabular/120745.dta /tmp - String fileName = "/tmp/120745.dta"; - String fileType = "application/x-stata"; - 
Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - assertEquals("NVARS: 1", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata13Auto() { - // curl https://www.stata-press.com/data/r13/auto.dta > /tmp/stata13-auto.dta - String fileName = "/tmp/stata13-auto.dta"; - String fileType = "application/x-stata-13"; - Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - assertEquals("NVARS: 12", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata14OpenSourceAtHarvard() { - // https://dataverse.harvard.edu/file.xhtml?fileId=3040230 converted to Stata 14: 2017-07-31.tab - // cp scripts/search/data/tabular/open-source-at-harvard118.dta /tmp - String fileName = "/tmp/open-source-at-harvard118.dta"; - // No mention of stata at https://www.iana.org/assignments/media-types/media-types.xhtml - String fileType = "application/x-stata-14"; - Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - assertEquals("NVARS: 10", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata14Aggregated() { - // https://dataverse.harvard.edu/file.xhtml?fileId=3140457 Stata 14: 2018_04_06_Aggregated_dataset_v2.dta - String fileName = "/tmp/2018_04_06_Aggregated_dataset_v2.dta"; - // No mention of stata at https://www.iana.org/assignments/media-types/media-types.xhtml - String fileType = "application/x-stata-14"; - Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - assertEquals("NVARS: 227", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata14MmPublic() { - // TODO: This file was downloaded at random. We could keep trying to get it to ingest. 
- // https://dataverse.harvard.edu/file.xhtml?fileId=2775556 Stata 14: mm_public_120615_v14.dta - // For this file "hasSTRLs" is true so it might be nice to get it working. - String fileName = "/tmp/mm_public_120615_v14.dta"; - // No mention of stata at https://www.iana.org/assignments/media-types/media-types.xhtml - String fileType = "application/x-stata-14"; - Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - // We don't know how many variables it has. Probably not 12. - assertEquals("NVARS: 12", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata15() { - // for i in `echo {0..33000}`; do echo -n "var$i,"; done > 33k.csv - // Then open Stata 15, run `set maxvar 40000` and import. - String fileName = "/tmp/33k.dta"; - String fileType = "application/x-stata-15"; - Response response = UtilIT.testIngest(fileName, fileType); - response.prettyPrint(); - assertEquals("NVARS: 33001", response.body().asString().split("\n")[0]); - } - - @Disabled - @Test - public void testStata13Multiple() { - String fileType = "application/x-stata-13"; - // From /usr/local/dvn-admin/stata on dvn-build - String stata13directory = "/tmp/stata-13"; - File folder = new File(stata13directory); - File[] listOfFiles = folder.listFiles(); - for (int i = 0; i < listOfFiles.length; i++) { - File file = listOfFiles[i]; - String filename = file.getName(); - String filenameFullPath = file.getAbsolutePath(); - Response response = UtilIT.testIngest(filenameFullPath, fileType); - String firstLine = response.body().asString().split("\n")[0]; - String[] parts = firstLine.split(":"); - String[] justErrors = Arrays.copyOfRange(parts, 1, parts.length); - System.out.println(i + "\t" + filename + "\t" + Arrays.toString(justErrors) + "\t" + firstLine); - } - } - - @Disabled - @Test - public void testStata14Multiple() { - String fileType = "application/x-stata-14"; - // From /usr/local/dvn-admin/stata on dvn-build - String 
stata13directory = "/tmp/stata-14"; - File folder = new File(stata13directory); - File[] listOfFiles = folder.listFiles(); - for (int i = 0; i < listOfFiles.length; i++) { - File file = listOfFiles[i]; - String filename = file.getName(); - String filenameFullPath = file.getAbsolutePath(); - Response response = UtilIT.testIngest(filenameFullPath, fileType); - String firstLine = response.body().asString().split("\n")[0]; - String[] parts = firstLine.split(":"); - String[] justErrors = Arrays.copyOfRange(parts, 1, parts.length); - System.out.println(i + "\t" + filename + "\t" + Arrays.toString(justErrors) + "\t" + firstLine); - } - } - -} diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index 2c0e25cc3cf..561a9bc3b93 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -1501,11 +1501,6 @@ static Response getFileVersionDifferences(String fileId, String apiToken, Intege return request.get("/api/files/" + fileId + "/versionDifferences"); } - static Response testIngest(String fileName, String fileType) { - return given() - .get("/api/ingest/test/file?fileName=" + fileName + "&fileType=" + fileType); - } - static Response redetectFileType(String fileId, boolean dryRun, String apiToken) { return given() .header(API_TOKEN_HTTP_HEADER, apiToken) From 0ca0b2ec6731eb929b9dad479820d9780cfb3278 Mon Sep 17 00:00:00 2001 From: Steven Winship <39765413+stevenwinship@users.noreply.github.com> Date: Fri, 29 May 2026 11:37:49 -0400 Subject: [PATCH 2/2] add release note --- doc/release-notes/12415-remove-testingest-class.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/release-notes/12415-remove-testingest-class.md diff --git a/doc/release-notes/12415-remove-testingest-class.md b/doc/release-notes/12415-remove-testingest-class.md new file mode 100644 index 00000000000..8eae307f386 --- /dev/null +++ 
b/doc/release-notes/12415-remove-testingest-class.md @@ -0,0 +1 @@ +The `TestIngest` class and its `/api/ingest/test/file` API endpoint were removed because they were no longer being used.