Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions R/validateUsagiFile.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#' - Check if sourceName is less than 255 characters
#' If usagi file has C&CR columns:
#' - Check if concept_id is not 0 for APPROVED mappingStatus
#' - Check if sourceConceptId is unique (each sourceConceptId belongs to only one sourceCode)
#' - Check that codes mapped to more than one domain are mapped to compatible domains
#' - Check if sourceValidStartDate is before sourceValidEndDate
#' - Check if ADD_INFO:sourceParents is a valid concept code in the ADD_INFO:sourceParentVocabulary
Expand Down Expand Up @@ -238,6 +239,50 @@ validateUsagiFile <- function(
usagiTibble <- result$fileTibble
validationLogR6 <- result$validationLogR6

# Check SourceConceptId is unique (each sourceConceptId should belong to only one sourceCode).
# Deduplication by (sourceCode, sourceConceptId) is needed first so that valid multi-mapped
# codes (one sourceCode → many conceptIds, all sharing the same sourceConceptId) are not
# incorrectly flagged.
usagiDistinct <- usagiTibble |>
dplyr::filter(!is.na(`ADD_INFO:sourceConceptId`)) |>
dplyr::distinct(sourceCode, `ADD_INFO:sourceConceptId`)

distinctValidationRules <- validate::validator(
SourceConceptId.is.not.unique = is_unique(`ADD_INFO:sourceConceptId`)
)
distinctValidations <- validate::confront(usagiDistinct, distinctValidationRules)
distinctValidationSummary <- validate::summary(distinctValidations) |> tibble::as_tibble()

if (distinctValidationSummary$fails[1] > 0) {
validationLogR6$ERROR(
"SourceConceptId is not unique",
paste0("Found ", distinctValidationSummary$fails[1], " sourceConceptIds assigned to more than one sourceCode")
)

notUniqueSourceConceptIds <- usagiDistinct[
!validate::values(distinctValidations)[, "SourceConceptId.is.not.unique"],
] |>
dplyr::pull(`ADD_INFO:sourceConceptId`) |>
unique()

usagiTibble <- usagiTibble |>
dplyr::mutate(
errorMessage = dplyr::if_else(
`ADD_INFO:sourceConceptId` %in% notUniqueSourceConceptIds,
"ERROR: SourceConceptId is not unique",
NA_character_
)
) |>
dplyr::mutate(tmpvalidationMessages = dplyr::if_else(
!is.na(errorMessage),
paste0(tmpvalidationMessages, " | ", errorMessage),
tmpvalidationMessages
)) |>
dplyr::select(-errorMessage)
} else {
validationLogR6$SUCCESS("SourceConceptId is not unique", "")
}

# Check that, when a code maps to more than one concept, the combined domain is valid
invalidDomainCombinations <- usagiTibble |>
dplyr::filter(mappingStatus != "INVALID_TARGET") |>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ A18.0+M01.2,[ConceptIds outdated][Updated conceptIds not found]Tuberculous arthr
A18.0+M01.5,[ConceptIds outdated][Updated conceptIds not found 2]Tuberculous arthritis,-1,,2000500901,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304412625,4071477,Fetal or neonatal effect of placental insufficiency,Condition,MAPS_TO,,TAYS,1623974400000,,,
A18.0+M01.5,[ConceptIds outdated][Updated conceptIds not found 2]Tuberculous arthritis,-1,,2000500901,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1669304412625,4110778,Fetal or neonatal effect of placental insufficiency,Condition,MAPS_TO,,TAYS,1623974400000,,,
A18.0+M01.0,[ConceptIds outdated][Updated by usagi]Tuberculous arthritis,-1,,2000530124,Tuberkuloottinen nivelinfektio,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.0|M01.1,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUAL,PKo,1669304412625,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 4071477,TAYS,1623974400000,,,
A17.8+G63.1,[ConceptIds outdated][Updated by usagi 2 one invalid]Tuberculous polyneuropathy,-1,,2000500119,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,APPROVED,EQUIVALENT,PKo,1669563756643,4121541,Neutropenia,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563749083,,,
A17.8+G63.2,[ConceptIds outdated][Updated by usagi 2 2 invalid]Tuberculous polyneuropathy,-1,,2000500119,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUIVALENT,PKo,1669563756643,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563731393,,,
A17.8+G63.1,[ConceptIds outdated][Updated by usagi 2 one invalid]Tuberculous polyneuropathy,-1,,2000599997,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,APPROVED,EQUIVALENT,PKo,1669563756643,4121541,Neutropenia,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563749083,,,
A17.8+G63.2,[ConceptIds outdated][Updated by usagi 2 2 invalid]Tuberculous polyneuropathy,-1,,2000599998,Tuberkuloottinen monihermosairaus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A17|A17.8|G63.0,ICD10|ICD10|ICD10,0,INVALID_TARGET,EQUIVALENT,PKo,1669563756643,0,Unmapped,Condition,MAPS_TO,Invalid existing target: 3079174,PKo,1669563731393,,,
A02.2+J17.0,[SourceConceptId is empty]Salmonella pneumonia,-1,,,Salmonellan aiheuttama keuhkokuume,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|J17.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803987773,258333,Salmonella pneumonia,Condition,MAPS_TO,,TAYS,1623974400000,,,
A02.2+M01.3,[SourceConceptId is not a number on the range]Salmonella arthritis,-1,,5,Salmonellan aiheuttama niveltulehdus,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A02|A02.2|M01.3,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803996141,80316,Salmonella arthritis,Condition,MAPS_TO,,TAYS,1623974400000,,,
A02.2+M90.2,[SourceConceptClass is empty]Salmonella osteomyelitis,-1,,2000500109,Salmonellan aiheuttama osteomyeliitti,,Condition,1900-01-01,2099-12-31,A02|A02.2|M90.2,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666803999237,134264,Salmonella osteomyelitis,Condition,MAPS_TO,,TAYS,1623974400000,,,
Expand Down Expand Up @@ -71,3 +71,4 @@ A18.7+E35.1,Tuberculosis of adrenal glands,-1,,2000500142,Lisämunuaistuberkuloo
A18.8+D77,Tuberculosis of spleen,-1,,2000500143,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.8|D77,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,,
A18.88,Tuberculosis of spleen,-1,,2000500200,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18,ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,,
A18.88+D77,Tuberculosis of spleen,-1,,2000500201,Pernan tuberkuloosi,ICD10fi Hierarchy,Condition,1998-08-19,2099-12-31,A18|A18.88|D77,ICD10||ICD10,0,APPROVED,EQUAL,PKo,1666805697461,195176,Tuberculosis of spleen,Condition,MAPS_TO,,TAYS,1623974400000,,,
A18.8+E35.0,[SourceConceptId is not unique]Test duplicate sourceConceptId,-1,,2000500124,Kilpirauhasen tuberkuloosi,ICD10fi Hierarchy,Condition,1900-01-01,2099-12-31,A18|A18.8|E35.0,ICD10|ICD10|ICD10,0,APPROVED,EQUAL,PKo,1666805704364,141777,Tuberculosis of thyroid gland,Condition,MAPS_TO,,TAYS,1623974400000,,,
8 changes: 8 additions & 0 deletions tests/testthat/test-validateUsagiFile.R
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ test_that("test validateUsagiFile returns errors with the errored usagi file", {
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not a number on the range")) |> dplyr::pull(mappingStatus) |>
expect_equal("FLAGGED")

# SourceConceptId is not unique
validationsSummary |> dplyr::filter(step == "SourceConceptId is not unique") |> nrow() |> expect_equal(1)
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> nrow() |> expect_equal(1)
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> dplyr::pull(`ADD_INFO:validationMessages`) |>
expect_equal("ERROR: SourceConceptId is not unique")
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptId is not unique")) |> dplyr::pull(mappingStatus) |>
expect_equal("FLAGGED")

# SourceConceptClass is empty
validationsSummary |> dplyr::filter(step == "SourceConceptClass is empty") |> nrow() |> expect_equal(1)
validatedUsagiFile |> dplyr::filter(stringr::str_detect(sourceName, "SourceConceptClass is empty")) |> nrow() |> expect_equal(1)
Expand Down
Loading