From dcd14e12a83cf75ae54251867266b928e09fd0ce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:16:56 +0000 Subject: [PATCH 1/4] Initial plan From e485f71f910b99ebc1b8ccb249fa8bdebf3e3ca3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:26:43 +0000 Subject: [PATCH 2/4] Add SourceConceptId uniqueness validation to validateUsagiFile Co-authored-by: javier-gracia-tabuenca-tuni <54809193+javier-gracia-tabuenca-tuni@users.noreply.github.com> Agent-Logs-Url: https://github.com/FinOMOP/ROMOPMappingTools/sessions/7ce96e01-5b9b-4d67-9044-42baa2b415e2 --- R/validateUsagiFile.R | 35 +++++++++++++++++++ .../ICD10fi/ICD10fi_with_errors.usagi.csv | 5 +-- tests/testthat/test-validateUsagiFile.R | 8 +++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/R/validateUsagiFile.R b/R/validateUsagiFile.R index f7ee515..c466f94 100644 --- a/R/validateUsagiFile.R +++ b/R/validateUsagiFile.R @@ -10,6 +10,7 @@ #' - Check if sourceName is less than 255 characters #' If usagi file has C&CR columns: #' - Check if concept_id is not 0 for APPROVED mappingStatus +#' - Check if sourceConceptId is unique (each sourceConceptId belongs to only one sourceCode) #' - Check codes with mapping to more than one domain are mapped to compatible domains #' - Check if sourceValidStartDate is before sourceValidEndDate #' - Check if ADD_INFO:sourceParents is a valid concept code in the ADD_INFO:sourceParentVocabulary @@ -238,6 +239,40 @@ validateUsagiFile <- function( usagiTibble <- result$fileTibble validationLogR6 <- result$validationLogR6 + # Check SourceConceptId is unique (each sourceConceptId should belong to only one sourceCode) + duplicatedSourceConceptIds <- usagiTibble |> + dplyr::select(sourceCode, `ADD_INFO:sourceConceptId`) |> + dplyr::filter(!is.na(`ADD_INFO:sourceConceptId`)) |> + dplyr::distinct() |> + dplyr::group_by(`ADD_INFO:sourceConceptId`) |> + dplyr::filter(dplyr::n() > 1) |> + dplyr::ungroup() |> + dplyr::pull(`ADD_INFO:sourceConceptId`) |> + unique() + + if (length(duplicatedSourceConceptIds) > 0) { + validationLogR6$ERROR( + "SourceConceptId is not unique", + paste0("Found ", length(duplicatedSourceConceptIds), " sourceConceptIds assigned to more than one sourceCode") + ) + usagiTibble <- usagiTibble |> + dplyr::mutate( + errorMessage = dplyr::if_else( + `ADD_INFO:sourceConceptId` %in% duplicatedSourceConceptIds, + "ERROR: SourceConceptId is not unique", + NA_character_ + ) + ) |> + dplyr::mutate(tmpvalidationMessages = dplyr::if_else( + !is.na(errorMessage), + paste0(tmpvalidationMessages, " | ", errorMessage), + tmpvalidationMessages + )) |> + dplyr::select(-errorMessage) + } else { + validationLogR6$SUCCESS("SourceConceptId is not unique", "") + } + # check if when the code maps to more than one concept the combined domain is valid invalidDomainCombinations <- usagiTibble |> dplyr::filter(mappingStatus != "INVALID_TARGET") |> diff --git a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv index b75b5d6..533cb17 100644 --- a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv +++ b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv @@ -30,8 +30,8 @@ A18.0+M01.2,[ConceptIds outdated][Updated conceptIds not found]Tuberculous arthr A18.0+M01.5,[ConceptIds outdated][Updated conceptIds not found 2]Tuberculous arthritis,-1,,2000500901,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304412625,4071477,Fetal or neonatal effect of placental insufficiency,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.0+M01.5,[ConceptIds outdated][Updated conceptIds not found 2]Tuberculous arthritis,-1,,2000500901,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304412625,4110778,Fetal or neonatal effect of placental insufficiency,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.0+M01.0,[ConceptIds outdated][Updated by usagi]Tuberculous arthritis,-1,,2000530124,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUAL,PKo,1669304412625,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 4071477,TAYS,1623974400000,,, -A17.8+G63.1,[ConceptIds outdated][Updated by usagi 2 one invalid]Tuberculous polyneuropathy,-1,,2000500119,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,APPROVED,EQUIVALENT,PKo,1669563756643,4121541,Neutropenia,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563749083,,, -A17.8+G63.2,[ConceptIds outdated][Updated by usagi 2 2 invalid]Tuberculous polyneuropathy,-1,,2000500119,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUIVALENT,PKo,1669563756643,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563731393,,, +A17.8+G63.1,[ConceptIds outdated][Updated by usagi 2 one invalid]Tuberculous polyneuropathy,-1,,2000599997,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,APPROVED,EQUIVALENT,PKo,1669563756643,4121541,Neutropenia,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563749083,,, +A17.8+G63.2,[ConceptIds outdated][Updated by usagi 2 2 invalid]Tuberculous polyneuropathy,-1,,2000599998,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUIVALENT,PKo,1669563756643,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563731393,,, A02.2+J17.0,[SourceConceptId is empty]Salmonella pneumonia,-1,,,Salmonellan aiheuttama keuhkokuume,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|J17.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803987773,258333,Salmonella pneumonia,Condition,MAPS_TO,,TAYS,1623974400000,,, A02.2+M01.3,[SourceConceptId is not a number on the range]Salmonella arthritis,-1,,5,Salmonellan aiheuttama niveltulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|M01.3,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803996141,80316,Salmonella arthritis,Condition,MAPS_TO,,TAYS,1623974400000,,, A02.2+M90.2,[SourceConceptClass is empty]Salmonella osteomyelitis,-1,,2000500109,Salmonellan aiheuttama osteomyeliitti,,Condition,1900-01-01,2099-12-31,A02|A02.2|M90.2,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803999237,134264,Salmonella osteomyelitis,Condition,MAPS_TO,,TAYS,1623974400000,,, @@ -71,3 +71,4 @@ A18.7+E35.1,Tuberculosis of adrenal glands,-1,,2000500142,Lisämunuaistuberkuloo A18.8+D77,Tuberculosis of spleen,-1,,2000500143,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.8|D77,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88,Tuberculosis of spleen,-1,,2000500200,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88+D77,Tuberculosis of spleen,-1,,2000500201,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.88|D77,ICD10||ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, +A99.0,[SourceConceptId is not unique]Test duplicate sourceConceptId,-1,,2000500124,Testikäsite,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A99|A99.0,ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304442342,4121541,Tuberculous synovitis,Condition,MAPS_TO,,TAYS,1623974400000,,, diff --git a/tests/testthat/test-validateUsagiFile.R b/tests/testthat/test-validateUsagiFile.R index cc7a34d..8a5d44b 100644 --- a/tests/testthat/test-validateUsagiFile.R +++ b/tests/testthat/test-validateUsagiFile.R @@ -190,6 +190,14 @@ test_that("test validateUsagiFile returns errors with the errored usagi file", { validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not a number on the range")) |> dplyr::pull(mappingStatus) |> expect_equal("FLAGGED") + # SourceConceptId is not unique + validationsSummary |> dplyr::filter(step == "SourceConceptId is not unique") |> nrow() |> expect_equal(1) + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> nrow() |> expect_equal(1) + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> dplyr::pull(`ADD_INFO:validationMessages`) |> + expect_equal("ERROR: SourceConceptId is not unique") + validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> dplyr::pull(mappingStatus) |> + expect_equal("FLAGGED") + # SourceConceptClass is empty validationsSummary |> dplyr::filter(step == "SourceConceptClass is empty") |> nrow() |> expect_equal(1) validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptClass is empty")) |> nrow() |> expect_equal(1) From 351703275b3ac2fefb1a963e9cc1fd417f1851b0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:43:16 +0000 Subject: [PATCH 3/4] Refactor SourceConceptId uniqueness check to use validate:: on deduplicated tibble Co-authored-by: javier-gracia-tabuenca-tuni <54809193+javier-gracia-tabuenca-tuni@users.noreply.github.com> Agent-Logs-Url: https://github.com/FinOMOP/ROMOPMappingTools/sessions/26984a07-d241-421c-8b78-ee643cf070e7 --- R/validateUsagiFile.R | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/R/validateUsagiFile.R b/R/validateUsagiFile.R index c466f94..13ba240 100644 --- a/R/validateUsagiFile.R +++ b/R/validateUsagiFile.R @@ -239,26 +239,36 @@ validateUsagiFile <- function( usagiTibble <- result$fileTibble validationLogR6 <- result$validationLogR6 - # Check SourceConceptId is unique (each sourceConceptId should belong to only one sourceCode) - duplicatedSourceConceptIds <- usagiTibble |> - dplyr::select(sourceCode, `ADD_INFO:sourceConceptId`) |> + # Check SourceConceptId is unique (each sourceConceptId should belong to only one sourceCode). + # Deduplication by (sourceCode, sourceConceptId) is needed first so that valid multi-mapped + # codes (one sourceCode → many conceptIds, all sharing the same sourceConceptId) are not + # incorrectly flagged. + usagiDistinct <- usagiTibble |> dplyr::filter(!is.na(`ADD_INFO:sourceConceptId`)) |> - dplyr::distinct() |> - dplyr::group_by(`ADD_INFO:sourceConceptId`) |> - dplyr::filter(dplyr::n() > 1) |> - dplyr::ungroup() |> - dplyr::pull(`ADD_INFO:sourceConceptId`) |> - unique() + dplyr::distinct(sourceCode, `ADD_INFO:sourceConceptId`) + + distinctValidationRules <- validate::validator( + SourceConceptId.is.not.unique = is_unique(`ADD_INFO:sourceConceptId`) + ) + distinctValidations <- validate::confront(usagiDistinct, distinctValidationRules) + distinctValidationSummary <- validate::summary(distinctValidations) |> tibble::as_tibble() - if (length(duplicatedSourceConceptIds) > 0) { + if (distinctValidationSummary$fails[1] > 0) { validationLogR6$ERROR( "SourceConceptId is not unique", - paste0("Found ", length(duplicatedSourceConceptIds), " sourceConceptIds assigned to more than one sourceCode") + paste0("Found ", distinctValidationSummary$fails[1], " sourceConceptIds assigned to more than one sourceCode") ) + + notUniqueSourceConceptIds <- usagiDistinct[ + !validate::values(distinctValidations)[, "SourceConceptId.is.not.unique"], + ] |> + dplyr::pull(`ADD_INFO:sourceConceptId`) |> + unique() + usagiTibble <- usagiTibble |> dplyr::mutate( errorMessage = dplyr::if_else( - `ADD_INFO:sourceConceptId` %in% duplicatedSourceConceptIds, + `ADD_INFO:sourceConceptId` %in% notUniqueSourceConceptIds, "ERROR: SourceConceptId is not unique", NA_character_ ) From 3df863eb5133f2297f31eb8964a977df95c1ab3c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 09:05:32 +0000 Subject: [PATCH 4/4] Fix test row for SourceConceptId uniqueness check: replace A99.0 with valid A18.8+E35.0 row Co-authored-by: javier-gracia-tabuenca-tuni <54809193+javier-gracia-tabuenca-tuni@users.noreply.github.com> Agent-Logs-Url: https://github.com/FinOMOP/ROMOPMappingTools/sessions/4322906e-3901-4f87-aacc-a2d929dab135 --- .../testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv index 533cb17..0b7f92c 100644 --- a/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv +++ b/inst/testdata/VOCABULARIES/ICD10fi/ICD10fi_with_errors.usagi.csv @@ -71,4 +71,4 @@ A18.7+E35.1,Tuberculosis of adrenal glands,-1,,2000500142,Lisämunuaistuberkuloo A18.8+D77,Tuberculosis of spleen,-1,,2000500143,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.8|D77,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88,Tuberculosis of spleen,-1,,2000500200,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, A18.88+D77,Tuberculosis of spleen,-1,,2000500201,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.88|D77,ICD10||ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,, -A99.0,[SourceConceptId is not unique]Test duplicate sourceConceptId,-1,,2000500124,Testikäsite,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A99|A99.0,ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304442342,4121541,Tuberculous synovitis,Condition,MAPS_TO,,TAYS,1623974400000,,, +A18.8+E35.0,[SourceConceptId is not unique]Test duplicate sourceConceptId,-1,,2000500124,Kilpirauhasen tuberkuloosi,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.8|E35.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805704364,141777,Tuberculosis of thyroid gland,Condition,MAPS_TO,,TAYS,1623974400000,,,