diff --git a/DESCRIPTION b/DESCRIPTION index 10b4dd2..3dced7b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ROMOPMappingTools Title: Tools for Working with OMOP CDM Mappings -Version: 2.1.2 +Version: 2.1.3 Authors@R: person("Javier", "Gracia-Tabuenca", , "javier.graciatabuenca@tuni.fi", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2455-0598")) diff --git a/NEWS.md b/NEWS.md index 4f46674..4b57762 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# ROMOPMappingTools 2.1.3 + +- Added rule to validate usagi file: sourceConceptId must be unique + # ROMOPMappingTools 2.1.2 - Added rule to validate usagi file: sourceConceptCode must be less than 50 characters diff --git a/R/validateUsagiFile.R b/R/validateUsagiFile.R index f7ee515..13ba240 100644 --- a/R/validateUsagiFile.R +++ b/R/validateUsagiFile.R @@ -10,6 +10,7 @@ #' - Check if sourceName is less than 255 characters #' If usagi file has C&CR columns: #' - Check if concept_id is not 0 for APPROVED mappingStatus +#' - Check if sourceConceptId is unique (each sourceConceptId belongs to only one sourceCode) #' - Check codes with mapping to more than one domain are mapped to compatible domains #' - Check if sourceValidStartDate is before sourceValidEndDate #' - Check if ADD_INFO:sourceParents is a valid concept code in the ADD_INFO:sourceParentVocabulary @@ -238,6 +239,50 @@ validateUsagiFile <- function( usagiTibble <- result$fileTibble validationLogR6 <- result$validationLogR6 + # Check SourceConceptId is unique (each sourceConceptId should belong to only one sourceCode). + # Deduplication by (sourceCode, sourceConceptId) is needed first so that valid multi-mapped + # codes (one sourceCode → many conceptIds, all sharing the same sourceConceptId) are not + # incorrectly flagged. 
+ usagiDistinct <- usagiTibble |> + dplyr::filter(!is.na(`ADD_INFO:sourceConceptId`)) |> + dplyr::distinct(sourceCode, `ADD_INFO:sourceConceptId`) + + distinctValidationRules <- validate::validator( + SourceConceptId.is.not.unique = is_unique(`ADD_INFO:sourceConceptId`) + ) + distinctValidations <- validate::confront(usagiDistinct, distinctValidationRules) + distinctValidationSummary <- validate::summary(distinctValidations) |> tibble::as_tibble() + + if (distinctValidationSummary$fails[1] > 0) { + notUniqueSourceConceptIds <- usagiDistinct[ + !validate::values(distinctValidations)[, "SourceConceptId.is.not.unique"], + ] |> + dplyr::pull(`ADD_INFO:sourceConceptId`) |> + unique() + # Report the number of distinct offending ids: `fails` counts failing (sourceCode, sourceConceptId) rows, so a shared id can be counted more than once. + validationLogR6$ERROR( + "SourceConceptId is not unique", + paste0("Found ", length(notUniqueSourceConceptIds), " sourceConceptIds assigned to more than one sourceCode") + ) + + usagiTibble <- usagiTibble |> + dplyr::mutate( + errorMessage = dplyr::if_else( + `ADD_INFO:sourceConceptId` %in% notUniqueSourceConceptIds, + "ERROR: SourceConceptId is not unique", + NA_character_ + ) + ) |> + dplyr::mutate(tmpvalidationMessages = dplyr::if_else( + !is.na(errorMessage), + paste0(tmpvalidationMessages, " | ", errorMessage), + tmpvalidationMessages + )) |> + dplyr::select(-errorMessage) + } else { + validationLogR6$SUCCESS("SourceConceptId is not unique", "") + } + # check if when the code maps to more than one concept the combined domain is valid invalidDomainCombinations <- usagiTibble |> dplyr::filter(mappingStatus != "INVALID_TARGET") |> diff --git a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv index b75b5d6..0b7f92c 100644 --- a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv +++ b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv @@ -30,8 +30,8 @@ A18.0+M01.2,[ConceptIds outdated][Updated conceptIds not found]Tuberculous arthr A18.0+M01.5,[ConceptIds outdated][Updated 
conceptIds not found 2]Tuberculous arthritis,-1,,2000500901,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304412625,4071477,Fetal or neonatal effect of placental insufficiency,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.0+M01.5,[ConceptIds outdated][Updated conceptIds not found 2]Tuberculous arthritis,-1,,2000500901,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304412625,4110778,Fetal or neonatal effect of placental insufficiency,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.0+M01.0,[ConceptIds outdated][Updated by usagi]Tuberculous arthritis,-1,,2000530124,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUAL,PKo,1669304412625,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 4071477,TAYS,1623974400000,,, -A17.8+G63.1,[ConceptIds outdated][Updated by usagi 2 one invalid]Tuberculous polyneuropathy,-1,,2000500119,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,APPROVED,EQUIVALENT,PKo,1669563756643,4121541,Neutropenia,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563749083,,, -A17.8+G63.2,[ConceptIds outdated][Updated by usagi 2 2 invalid]Tuberculous polyneuropathy,-1,,2000500119,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUIVALENT,PKo,1669563756643,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563731393,,, +A17.8+G63.1,[ConceptIds outdated][Updated by usagi 2 one invalid]Tuberculous polyneuropathy,-1,,2000599997,Tuberkuloottinen monihermosairaus,ICD10fi 
Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,APPROVED,EQUIVALENT,PKo,1669563756643,4121541,Neutropenia,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563749083,,, +A17.8+G63.2,[ConceptIds outdated][Updated by usagi 2 2 invalid]Tuberculous polyneuropathy,-1,,2000599998,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUIVALENT,PKo,1669563756643,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563731393,,, A02.2+J17.0,[SourceConceptId is empty]Salmonella pneumonia,-1,,,Salmonellan aiheuttama keuhkokuume,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|J17.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803987773,258333,Salmonella pneumonia,Condition,MAPS_TO,,TAYS,1623974400000,,, A02.2+M01.3,[SourceConceptId is not a number on the range]Salmonella arthritis,-1,,5,Salmonellan aiheuttama niveltulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|M01.3,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803996141,80316,Salmonella arthritis,Condition,MAPS_TO,,TAYS,1623974400000,,, A02.2+M90.2,[SourceConceptClass is empty]Salmonella osteomyelitis,-1,,2000500109,Salmonellan aiheuttama osteomyeliitti,,Condition,1900-01-01,2099-12-31,A02|A02.2|M90.2,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803999237,134264,Salmonella osteomyelitis,Condition,MAPS_TO,,TAYS,1623974400000,,, @@ -71,3 +71,4 @@ A18.7+E35.1,Tuberculosis of adrenal glands,-1,,2000500142,Lisämunuaistuberkuloo A18.8+D77,Tuberculosis of spleen,-1,,2000500143,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.8|D77,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88,Tuberculosis of spleen,-1,,2000500200,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of 
spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88+D77,Tuberculosis of spleen,-1,,2000500201,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.88|D77,ICD10||ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, +A18.8+E35.0,[SourceConceptId is not unique]Test duplicate sourceConceptId,-1,,2000500124,Kilpirauhasen tuberkuloosi,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.8|E35.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805704364,141777,Tuberculosis of thyroid gland,Condition,MAPS_TO,,TAYS,1623974400000,,, diff --git a/tests/testthat/test-validateUsagiFile.R b/tests/testthat/test-validateUsagiFile.R index cc7a34d..8a5d44b 100644 --- a/tests/testthat/test-validateUsagiFile.R +++ b/tests/testthat/test-validateUsagiFile.R @@ -190,6 +190,14 @@ test_that("test validateUsagiFile returns errors with the errored usagi file", { validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not a number on the range")) |> dplyr::pull(mappingStatus) |> expect_equal("FLAGGED") + # SourceConceptId is not unique + validationsSummary |> dplyr::filter(step == "SourceConceptId is not unique") |> nrow() |> expect_equal(1) + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> nrow() |> expect_equal(1) + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> dplyr::pull(`ADD_INFO:validationMessages`) |> + expect_equal("ERROR: SourceConceptId is not unique") + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> dplyr::pull(mappingStatus) |> + expect_equal("FLAGGED") + # SourceConceptClass is empty validationsSummary |> dplyr::filter(step == "SourceConceptClass is empty") |> nrow() |> expect_equal(1) validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptClass is empty")) |> nrow() 
|> expect_equal(1)